In [2]:
!pip install --quiet boto3 pandas streamlit openai python-dotenv langfuse pycaret numpy scikit-learn

# PIPELINE

📌 Importy

In [3]:
from dotenv import load_dotenv
import os
import pandas as pd
import boto3
from io import BytesIO
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import setup, compare_models, tune_model, finalize_model, save_model

# Load key=value pairs from a local .env file into the process environment
# so the os.getenv(...) calls in the following cells can read the storage settings.
load_dotenv()

True

📌 Połączenie z DigitalOcean Spaces

In [4]:
# S3-compatible client for the object storage; the credentials and the
# endpoint URL are taken from environment variables, never hard-coded.
_s3_settings = {
    'aws_access_key_id': os.getenv('AWS_ACCESS_KEY_ID'),
    'aws_secret_access_key': os.getenv('AWS_SECRET_ACCESS_KEY'),
    'endpoint_url': os.getenv('AWS_ENDPOINT_URL'),
}
s3 = boto3.client('s3', **_s3_settings)


In [6]:
# Bucket that holds both the source CSV and the exported model file.
BUCKET_NAME = "maraton_2023"

📌 Wysłanie pliku do DigitalOcean Spaces

In [7]:
# Reuse the `s3` client created in the connection cell above instead of
# constructing a second, identical client here.

# Local CSV file to upload.
file_path = 'maraton_2023.csv'

# Key under which the object is stored in the bucket (reused when reading it back).
object_name = 'maraton_2023.csv'

# Stream the file in binary mode; the context manager closes it afterwards.
with open(file_path, 'rb') as f:
    s3.upload_fileobj(f, BUCKET_NAME, object_name)

print("Plik został wysłany.")

Plik został wysłany.


📌 Wczytanie CSV jako DataFrame

In [8]:
# Download the uploaded object and parse it straight from memory.
s3_object = s3.get_object(Bucket=BUCKET_NAME, Key=object_name)
csv_buffer = BytesIO(s3_object['Body'].read())

# The file is semicolon-separated and UTF-8 encoded.
df = pd.read_csv(csv_buffer, sep=';', encoding='utf-8')

print("Wczytano dane")
display(df.head())

Wczytano dane


Unnamed: 0,Miejsce,Numer startowy,Imię,Nazwisko,Miasto,Kraj,Drużyna,Płeć,Płeć Miejsce,Kategoria wiekowa,...,10 km Tempo,15 km Czas,15 km Miejsce Open,15 km Tempo,20 km Czas,20 km Miejsce Open,20 km Tempo,Tempo Stabilność,Czas,Tempo
0,1.0,1787,TOMASZ,GRYCKO,,POL,UKS BLIZA WŁADYSŁAWOWO,M,1.0,M30,...,2.926667,00:44:47,1.0,3.106667,01:01:43,1.0,3.386667,0.0314,01:04:59,3.080509
1,2.0,3,ARKADIUSZ,GARDZIELEWSKI,WROCŁAW,POL,ARKADIUSZGARDZIELEWSKI.PL,M,2.0,M30,...,2.983333,00:45:26,2.0,3.143333,01:03:08,2.0,3.54,0.038,01:06:23,3.146875
2,3.0,3832,KRZYSZTOF,HADAS,POZNAŃ,POL,,M,3.0,M20,...,3.123333,00:47:34,3.0,3.236667,01:05:09,3.0,3.516667,0.024067,01:08:24,3.242475
3,4.0,416,DAMIAN,DYDUCH,KĘPNO,POL,AZS POLITECHNIKA OPOLSKA,M,4.0,M30,...,3.196667,00:48:49,5.0,3.33,01:06:54,4.0,3.616667,0.025467,01:10:16,3.330963
4,5.0,8476,KAMIL,MAŃKOWSKI,MIRKÓW,POL,PARKRUN WROCŁAW,M,5.0,M20,...,3.276667,00:49:31,7.0,3.386667,01:07:27,5.0,3.586667,0.023,01:10:27,3.339654


📌 Czyszczenie danych

In [9]:
# Remove stray whitespace from the header names before selecting by name.
df.columns = df.columns.str.strip()

# Keep only the columns used further down and discard rows missing any of them.
needed_columns = ['Płeć', 'Rocznik', '5 km Tempo', 'Czas']
df = df[needed_columns].dropna()

📌 Pomocnicze funkcje konwertujące tempo/czas na sekundy

In [10]:
def convert_time_to_seconds(time):
    """Convert a clock-style duration string to whole seconds.

    Accepts ``HH:MM:SS`` and ``MM:SS`` strings; surrounding whitespace is
    ignored.

    Parameters
    ----------
    time : str or missing value
        Duration text, a DNS/DNF marker, or a null value (None/NaN).

    Returns
    -------
    int or None
        Total number of seconds, or None when the value is missing, is a
        DNS/DNF marker, or cannot be parsed as a duration.
    """
    if pd.isnull(time):
        return None

    text = str(time).strip()
    # Non-finishers carry a marker instead of a time.
    if text.upper() in ('DNS', 'DNF'):
        return None

    parts = text.split(':')
    if len(parts) not in (2, 3):
        return None

    try:
        numbers = [int(part) for part in parts]
    except ValueError:
        # Malformed field: report "no value" instead of aborting a whole
        # column-wide apply() on a single bad cell.
        return None

    if len(numbers) == 2:
        # MM:SS -> treat the hour component as zero.
        numbers.insert(0, 0)

    hours, minutes, seconds = numbers
    return hours * 3600 + minutes * 60 + seconds

In [11]:
def minutes_to_seconds(minutes):
    """Convert a duration expressed in minutes to seconds.

    Parameters
    ----------
    minutes : number or str
        Any value accepted by ``float()``.

    Returns
    -------
    float or None
        ``float(minutes) * 60``, or None when the value cannot be converted
        to a float.
    """
    try:
        return float(minutes) * 60
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no value"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

📌 Przekształcenie czasu i tempa na sekundy

In [12]:
# Source columns: pace over the first 5 km (minutes) and the finishing time (clock string).
pace_minutes = df['5 km Tempo']
finish_times = df['Czas']

# Both derived columns are expressed in seconds.
df['tempo_sec'] = pace_minutes.apply(minutes_to_seconds)
df['czas_sec'] = finish_times.apply(convert_time_to_seconds)

📌 Interpolacja brakujących wartości

In [13]:
# Fill gaps in the pace feature by interpolating between neighbouring rows.
df['tempo_sec'] = df['tempo_sec'].interpolate()

# Do NOT interpolate the target: a missing `czas_sec` means no finishing time
# could be parsed (e.g. DNS/DNF), and inventing one would train the model on
# fabricated labels. Coerce to a numeric dtype so such rows stay NaN and are
# removed by the dropna() that builds the modelling frame.
df['czas_sec'] = pd.to_numeric(df['czas_sec'])

📌 Obliczenie wieku

In [14]:
# Reference year for the age calculation; matches the `maraton_2023` dataset name.
RACE_YEAR = 2023

# `Rocznik` presumably holds the year of birth — TODO confirm against the data dictionary.
df['wiek'] = RACE_YEAR - df['Rocznik']

📌 Encoding Płci

In [15]:
# Learn the distinct values of `Płeć`, then replace each with its integer code.
# `le` is kept so the same mapping can be applied outside this cell.
le = LabelEncoder().fit(df['Płeć'])
df['płeć_encoded'] = le.transform(df['Płeć'])

📌 Finalny zbiór danych do modelu

In [16]:
# Features plus target (`czas_sec`); rows with any missing value are excluded.
model_columns = ['płeć_encoded', 'wiek', 'tempo_sec', 'czas_sec']
df_model = df.loc[:, model_columns].dropna()

📌 Inicjalizacja środowiska PyCaret

In [39]:
# PyCaret regression experiment: predict `czas_sec` from the other columns of
# `df_model`, using 5-fold cross-validation and a fixed session id.
setup_options = dict(
    data=df_model,
    target='czas_sec',
    fold=5,
    session_id=123,
    verbose=False,
)
exp = setup(**setup_options)

📌 Porównanie modeli

In [42]:
# Train and cross-validate the available regressors, ranked by MAE;
# the top-ranked estimator is kept as `best_model`.
best_model = compare_models(sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,285.3418,172684.4619,414.1519,0.8822,0.0511,0.0371,0.012
gbr,Gradient Boosting Regressor,286.2259,157074.2499,396.2246,0.8927,0.0502,0.0375,0.04
omp,Orthogonal Matching Pursuit,289.6171,171063.1387,412.0511,0.8834,0.0509,0.0379,0.008
en,Elastic Net,289.7193,171155.6548,412.1574,0.8833,0.0509,0.0379,0.304
lasso,Lasso Regression,289.7314,171166.8574,412.1673,0.8833,0.0509,0.0379,0.364
llar,Lasso Least Angle Regression,289.7314,171166.89,412.1673,0.8833,0.0509,0.0379,0.006
br,Bayesian Ridge,289.7406,171173.9603,412.1764,0.8833,0.0509,0.0379,0.008
ridge,Ridge Regression,289.7734,171197.1594,412.2015,0.8833,0.0509,0.0379,0.364
lr,Linear Regression,289.7735,171197.2099,412.2016,0.8833,0.0509,0.0379,0.47
lar,Least Angle Regression,289.7735,171197.2357,412.2016,0.8833,0.0509,0.0379,0.008


  .applymap(highlight_cols, subset=["TT (Sec)"])


📌 Tuning najlepszego modelu

In [43]:
# Hyper-parameter search on the winning estimator: 50 candidate
# configurations, selected by MAE.
tuned_model = tune_model(
    best_model,
    optimize='MAE',
    n_iter=50,
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,267.0011,145025.8844,380.8226,0.9029,0.0481,0.035
1,286.7604,162119.613,402.6408,0.8854,0.0507,0.0372
2,282.0196,171910.2644,414.6206,0.8846,0.0523,0.0365
3,302.1345,229935.6702,479.5161,0.8463,0.0551,0.039
4,287.7855,159475.2383,399.3435,0.8884,0.0501,0.0374
Mean,285.1402,173693.334,415.3887,0.8815,0.0512,0.037
Std,11.2889,29408.5282,33.8458,0.0188,0.0024,0.0013


Fitting 5 folds for each of 50 candidates, totalling 250 fits


📌 Finalizacja (trenuje model na całym zbiorze danych)

In [44]:
# Refit the tuned estimator on the complete dataset (including the hold-out
# split PyCaret reserved during setup) before exporting it.
final_model = finalize_model(tuned_model)

📌 Zapis i upload

In [45]:
# save_model() appends ".pkl" to the given name, so derive the file name once
# and use it for both the local file and the object key.
model_name = 'model_pycaret'
model_file = f'{model_name}.pkl'

save_model(final_model, model_name)

# Upload the serialized pipeline to the same bucket as the source data.
with open(model_file, 'rb') as model_fh:
    s3.upload_fileobj(model_fh, BUCKET_NAME, model_file)

Transformation Pipeline and Model Successfully Saved
