In [None]:
import boto3
from dotenv import load_dotenv
import os.path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.regression import setup, predict_model, compare_models, load_model, create_model, save_model, plot_model, finalize_model # type: ignore

In [None]:
# Key prefixes inside the bucket: one for trained models, one for the race CSVs.
MODELS_PATH = 'halfmarathon/models/'
DATA_PATH = 'halfmarathon/data/'
# Bucket used by every storage helper in this notebook.
BUCKET_NAME = "nowy"

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this provides the storage credentials/endpoint that boto3 reads - confirm.
load_dotenv()

In [None]:
def convert_time_to_seconds(time):
    """Convert an ``HH:MM:SS`` result string into a number of seconds.

    Args:
        time: Raw cell value from a results table: a time string, a status
            marker such as ``'DNS'``/``'DNF'``, or a missing value.

    Returns:
        int | None: Total seconds, or ``None`` when the value is missing or
        cannot be parsed as ``HH:MM:SS`` (any status marker, not only
        DNS/DNF, and any malformed time).
    """
    if pd.isnull(time) or time in ['DNS', 'DNF']:
        return None
    try:
        # Only the first three fields are used (as before); the unpacking
        # also rejects values with fewer than three fields.
        hours, minutes, seconds = (int(part) for part in time.split(':')[:3])
    except ValueError:
        # Unknown status token (e.g. 'DSQ') or malformed time: treat as missing
        # instead of aborting the whole column conversion.
        return None
    return hours * 3600 + minutes * 60 + seconds

def convert_seconds_to_time(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    The input is truncated with ``int()`` first, so fractional seconds are
    dropped rather than rounded.
    """
    total = int(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{secs:02d}'

def download_models():
    """Return the object keys of all pickled models stored under ``MODELS_PATH``.

    Despite the name, nothing is downloaded here: the bucket listing is
    filtered down to keys ending in ``.pkl``.

    Returns:
        list | None: The matching keys, or ``None`` when the listing fails
        (best-effort behaviour kept for existing callers).
    """
    try:
        model_list = get_file_list(prefix=MODELS_PATH)
        return [item['Key'] for item in model_list if str(item['Key']).endswith('.pkl')]
    except Exception:
        # Previously a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit. Ordinary errors still degrade to None as before.
        return None

def get_digital_ocean_client():
    """Create a new boto3 S3 client with default settings.

    NOTE(review): presumably the credentials and endpoint come from the
    environment populated by ``load_dotenv()`` - confirm.
    """
    client = boto3.client('s3')
    return client

def get_file_list(prefix):
    """List the objects in ``BUCKET_NAME`` whose key starts with ``prefix``.

    Returns the ``'Contents'`` entry of a single ``list_objects_v2`` response.

    NOTE(review): a response without ``'Contents'`` raises ``KeyError`` here;
    presumably that is what happens when nothing matches the prefix - confirm.
    Only one response page is read.
    """
    listing = get_digital_ocean_client().list_objects_v2(Bucket=BUCKET_NAME, Prefix=prefix)
    return listing['Contents']

def download_file(file_full_name):
    """Download one object from ``BUCKET_NAME`` into the current directory.

    Args:
        file_full_name: Full object key, including any prefix.

    Returns:
        The local file name, i.e. the key with its directory part stripped.
    """
    client = get_digital_ocean_client()
    local_name = os.path.basename(file_full_name)
    client.download_file(Bucket=BUCKET_NAME, Key=file_full_name, Filename=local_name)
    return local_name

def upload_file(file_name, prefix):
    """Upload a local file to ``BUCKET_NAME`` under the key ``prefix + file_name``.

    Args:
        file_name: Path of the local file; it is also used verbatim as the
            tail of the object key.
        prefix: Key prefix to put in front of ``file_name``.
    """
    object_key = prefix + file_name
    client = get_digital_ocean_client()
    client.upload_file(Filename=file_name, Bucket=BUCKET_NAME, Key=object_key)

### Pobierz listę plików csv

In [None]:
# List every object stored under the race-data prefix of the bucket.
csv_list = get_file_list(DATA_PATH)

### Wczytaj dane do Data Frame

In [None]:
# Keep only the CSV objects from the bucket listing.
csv_names = [item['Key'] for item in csv_list if str(item['Key']).endswith('.csv')]

# Both race files are needed below; fail with a clear message instead of an
# anonymous IndexError when the listing is incomplete.
if len(csv_names) < 2:
    raise ValueError(
        f"Expected at least 2 CSV files under '{DATA_PATH}', found {len(csv_names)}: {csv_names}"
    )

# NOTE(review): assumes the listing returns the 2023 file first and the 2024 file second - confirm.
file1 = download_file(csv_names[0])
file2 = download_file(csv_names[1])

# The result files are semicolon-separated.
m2023_df = pd.read_csv(file1, sep=';')
m2024_df = pd.read_csv(file2, sep=';')

In [None]:
# Convert both timing columns of each race from HH:MM:SS strings to seconds.
for _frame in (m2023_df, m2024_df):
    for _time_col in ('Czas', '5 km Czas'):
        _frame[_time_col] = _frame[_time_col].apply(convert_time_to_seconds)

# Keep only the columns used in the rest of the notebook.
_kept_columns = ['Miejsce', 'Nazwisko', 'Płeć', 'Kategoria wiekowa', '5 km Czas', 'Czas']
m2023_df = m2023_df[_kept_columns]
m2024_df = m2024_df[_kept_columns]

In [None]:
# Missing values per column.
# NOTE: Jupyter does not display this because it is not the cell's last expression.
m2023_df.isnull().sum()

# Order the runners by finishing place, then fill a missing 5 km split from
# the next row in that order.
m2023_df = m2023_df.sort_values(by='Miejsce')
# Assign the filled column back: the chained ``fillna(method='bfill', inplace=True)``
# is deprecated and does not modify the frame under pandas Copy-on-Write.
m2023_df['5 km Czas'] = m2023_df['5 km Czas'].bfill()

# Drop every row that still has a missing value in any column.
m2023_df = m2023_df.dropna()

# m2023_df.describe()

In [None]:
# Missing values per column.
# NOTE: Jupyter does not display this because it is not the cell's last expression.
m2024_df.isnull().sum()

# Order the runners by finishing place, then fill a missing 5 km split from
# the next row in that order.
m2024_df = m2024_df.sort_values(by='Miejsce')
# Assign the filled column back: the chained ``fillna(method='bfill', inplace=True)``
# is deprecated and does not modify the frame under pandas Copy-on-Write.
m2024_df['5 km Czas'] = m2024_df['5 km Czas'].bfill()

# Drop every row that still has a missing value in any column.
m2024_df = m2024_df.dropna()


# m2024_df[(m2024_df['5 km Czas'].isnull()) & (m2024_df['Czas'].notnull())]

In [None]:
# 5 km split vs. finish time for the 2023 race (both in seconds).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Półmaraton 2023")
sns.scatterplot(data=m2023_df, x="5 km Czas", y='Czas', ax=ax);

In [None]:
# # Odrzuć odstające dane

# Q1 = m2023_df["5 km Czas"].quantile(0.25)
# Q3 = m2023_df["5 km Czas"].quantile(0.75)
# IQR = Q3 - Q1

# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# m2023_df = m2023_df[~((m2023_df["5 km Czas"] < lower_bound) | (m2023_df["5 km Czas"] > upper_bound))]


In [None]:
# 5 km split vs. finish time for the 2024 race (both in seconds).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Półmaraton 2024")
sns.scatterplot(data=m2024_df, x="5 km Czas", y='Czas', ax=ax);

### Uratujemy rekord gdzie '5 km Czas' == 0

In [None]:
# A 5 km split of 0 s cannot be a real result (presumably a timing glitch).
# Treat every such value as missing and back-fill it from the following row.
# Mask-based, so the cell repairs all zero rows and can be re-run safely:
# the old ``.index[0]`` lookup raised IndexError once no zero was left.
zero_split = m2024_df['5 km Czas'] == 0
m2024_df['5 km Czas'] = m2024_df['5 km Czas'].mask(zero_split).bfill()

# Show the repaired rows (empty when nothing needed fixing).
m2024_df.loc[zero_split]

### Połącz dane obu półmaratonów

In [None]:
# Stack both races, keep the modelling columns, and give them English names.
_column_names = {
    'Płeć': 'sex',
    'Kategoria wiekowa': 'age_category',
    '5 km Czas': '5time',
    'Czas': 'time',
}
marathon_df = (
    pd.concat([m2023_df, m2024_df], ignore_index=True)
    .loc[:, list(_column_names)]
    .rename(columns=_column_names)
)
# Check that no column has missing values left.
marathon_df.isna().any()

In [None]:
# 5 km split vs. finish time for both races combined.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=marathon_df, x="5time", y='time', ax=ax);

In [None]:
# Discard outliers: keep rows whose 5 km split lies within 1.5 * IQR of the quartiles.

Q1, Q3 = (marathon_df["5time"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

below_range = marathon_df["5time"] < lower_bound
above_range = marathon_df["5time"] > upper_bound
marathon_df = marathon_df[~below_range & ~above_range]

In [None]:
# Let's begin...

# Initialise the PyCaret regression experiment on the combined data, with 'time' as the target.
# NOTE(review): session_id presumably fixes the random seed and index=False presumably
# resets the frame's index - confirm against the PyCaret docs.
exp = setup(data=marathon_df, target='time', session_id=123, index=False)

In [None]:
# Estimator class name -> PyCaret regression model ID (the string passed to create_model).
# The duplicate 'LassoLars' entry was removed. The mapping also covers the other
# estimators compare_models can rank first, so the lookup does not yield NaN for them.
model_mapping = {
    # Linear models
    'LinearRegression': 'lr',
    'Lasso': 'lasso',
    'Ridge': 'ridge',
    'ElasticNet': 'en',
    'Lars': 'lar',
    'LassoLars': 'llar',
    'OrthogonalMatchingPursuit': 'omp',
    'BayesianRidge': 'br',
    'PassiveAggressiveRegressor': 'par',
    'HuberRegressor': 'huber',
    # Neighbours / trees / ensembles
    'KNeighborsRegressor': 'knn',
    'DecisionTreeRegressor': 'dt',
    'RandomForestRegressor': 'rf',
    'ExtraTreesRegressor': 'et',
    'AdaBoostRegressor': 'ada',
    'GradientBoostingRegressor': 'gbr',
    # Boosting libraries
    'XGBRegressor': 'xgboost',
    'LGBMRegressor': 'lightgbm',
    'CatBoostRegressor': 'catboost',
    # Baseline
    'DummyRegressor': 'dummy',
}

In [None]:
# Top 3 models
N_SELECT = 3
# Ask the experiment for the N_SELECT best models from its model comparison.
best_models = exp.compare_models(n_select=N_SELECT)

In [None]:
# Class names of the selected estimators; used below to look up their PyCaret IDs.
model_names = []
for estimator in best_models:
    model_names.append(estimator.__class__.__name__)
print(model_names)

In [None]:
# Map the selected models to their PyCaret IDs
model_map_df = pd.DataFrame({'name': model_names})
model_map_df = model_map_df.assign(
    abbrev=model_map_df['name'].map(model_mapping),
    model=None,
)
print(model_map_df)

In [None]:
# The selected models are saved locally and uploaded to Digital Ocean Spaces.
# The index in each file name is the model's position in the selection (0 = first).

for idx, abbrev in enumerate(model_map_df['abbrev']):
    # A class name missing from model_mapping maps to NaN; stop with a clear
    # message instead of an obscure failure inside create_model.
    if pd.isnull(abbrev):
        raise ValueError(f"No PyCaret model ID mapped for '{model_map_df['name'][idx]}'")
    print(abbrev)
    model_file = f"{abbrev}_{idx}"
    current_model = create_model(abbrev, verbose=False)
    f_model = finalize_model(current_model)
    # save_model is given the bare name; the uploaded file is that name plus '.pkl'.
    save_model(f_model, model_file)
    upload_file(f"{model_file}.pkl", MODELS_PATH)
    # plot_model(f_model, 'feature')

In [None]:
# Sample input for a test prediction: one runner with a 1095 s 5 km split.
predict_run = pd.DataFrame(
    {
        'sex': ['M'],
        'age_category': ['M30'],
        '5time': [1095],
    }
)

In [None]:
# Load the first model saved by the training loop, which names files "<abbrev>_<index>".
# The previous hard-coded 'model_lr_1' does not follow that naming scheme.
model_name = f"{model_map_df['abbrev'][0]}_0"
loaded_model = load_model(model_name)
plot_model(loaded_model, 'feature')
predict = predict_model(loaded_model, data = predict_run)
# Take the scalar out of the one-row result: int() on a whole Series is
# deprecated in recent pandas.
predicted_seconds = predict['prediction_label'].iloc[0]
print(f"{convert_seconds_to_time(predicted_seconds)}, {int(predicted_seconds)}s")

In [None]:
# Display the full frame returned by predict_model for the sample input.
predict

In [None]:
# Actual results of runners with the same 5 km split as the sample input.
same_split = marathon_df['5time'] == 1095
marathon_df.loc[same_split].head()

In [None]:
# Check of the formatter: 4735 s renders as 01:18:55.
convert_seconds_to_time(4735)