In [1]:
from IPython.display import display, HTML
from pylab import rcParams

# Inject a CSS rule that sets elements of class `container` to 90% width.
WIDE_CONTAINER_CSS = "<style>.container { width:90% !important; }</style>"
display(HTML(WIDE_CONTAINER_CSS))

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = (12, 8)

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import os
from glob import glob

import yaml
import itertools


import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
def _latest_research_task(pattern):
    
    '''
    Among the paths matching `pattern`, return the one with the highest os.path.getctime
    
    Raises ValueError (naming the pattern) when nothing matches.
    '''

    candidates = glob(pattern)
    if not candidates:
        # max() on an empty list would only report "arg is an empty sequence";
        # keep the exception type but say which location is missing
        raise ValueError(f'No research_task directory matches {pattern}')

    return max(candidates, key=os.path.getctime)


def get_paths(company, period, models_list):
    
    '''
    Find paths to result (dir /wf_result/) files of experiments
    
    For every model in `models_list` the latest `research_task_*` directory
    (by os.path.getctime) is taken, separately under forecast/ and wf_result/.
    
    Parameters:
        company: company name as used in the datalake directory layout
        period: period name as used in the datalake directory layout
        models_list: iterable of model directory names
    
    Returns:
        file_paths_to_pred: list of forecast_*.csv paths
        file_paths_to_info: list of experiment *.yaml paths
        model_parameters: list of tuples, one per yaml file:
            (uuid, shortened uuid, model name, train start or duration, hyperparameters, features)
    
    Raises:
        ValueError: if a model has no research_task_* directory
    
    Side effect: prints the number of yaml files and the number of forecast files found.
    '''

    basic_path = '/diploma_info/datalake/'

    file_paths_to_pred = []
    file_paths_to_info = []

    for model in models_list:
        path_to_res = f'{company}/{period}/{model}'

        latest_forecast_dir = _latest_research_task(basic_path + f'forecast/regression/{path_to_res}/research_task_*')
        latest_wf_result_dir = _latest_research_task(basic_path + f'wf_result/regression/{path_to_res}/research_task_*')

        file_paths_to_pred += glob(latest_forecast_dir + f'/{model}_*/forecast_*.csv')
        file_paths_to_info += glob(latest_wf_result_dir + f'/{model}_*.yaml')

    print(len(file_paths_to_info), len(file_paths_to_pred))

    model_parameters = []

    for file in file_paths_to_info:
        with open(file, 'r') as f:
            res = yaml.safe_load(f)

        uuid = res['unique_uuid']
        model_name = res['model_name']
        hyperparameters = res['model_hyperparameters']
        features = res['train_features']

        # experiments describe their training window either by duration or by start date
        if 'duration_training_history' in res:
            train_start_or_duration = res['duration_training_history']
        else:
            train_start_or_duration = res['train_start']

        # the shortened uuid is the second-to-last '-'-separated group of the uuid
        shorten_uuid = uuid.split('-')[-2]

        model_parameters.append(
            (uuid, shorten_uuid, model_name, train_start_or_duration, hyperparameters, features)
        )

    return file_paths_to_pred, file_paths_to_info, model_parameters

In [5]:
def _experiment_name(exp_path):
    
    '''
    Build the prediction column name from a forecast file path
    
    The path is recognised by its number of '_'-separated parts (7 or 10);
    the name is assembled from pieces of the path split on '_', '-' and '.'.
    Raises ValueError for any other number of parts.
    '''

    underscore_parts = exp_path.split('_')
    stem_parts = exp_path.split('.')[0].split('_')

    if len(underscore_parts) == 7:
        return stem_parts[-1] + '_' + exp_path.split('-')[-3] + '_' + underscore_parts[-2]

    if len(underscore_parts) == 10:
        return "_".join(stem_parts[-2:]) + '_' + exp_path.split('-')[-3] + '_' + underscore_parts[-3]

    # Without this, the name would be undefined for the first file or silently
    # reused from the previous file, overwriting that file's column
    raise ValueError(
        f'Unexpected forecast path layout ({len(underscore_parts)} "_"-separated parts): {exp_path}'
    )


def _read_dated_csv(path, date_column):
    
    '''
    Read a csv and index it by `date_column` parsed as %Y-%m-%d dates
    '''

    # read_csv(date_parser=...) is deprecated since pandas 2.0;
    # parsing after the read behaves the same on old and new pandas
    frame = pd.read_csv(path)
    frame[date_column] = pd.to_datetime(frame[date_column], format='%Y-%m-%d')

    return frame.set_index(date_column)


def load_preds(company, period, test_start, test_end, file_paths_to_pred):
    
    '''
    Makes dataframe with true values and predictions
    
    Parameters:
        company, period: used to locate processed_data/{company}_{period}.csv
        test_start, test_end: bounds of the label slice applied to the date index
        file_paths_to_pred: forecast csv paths; each file must hold a `date_time`
            column and exactly one value column
    
    Returns:
        (df_preds, number of rows, number of columns) where df_preds has the
        `close` column first and one column per forecast file after it
    
    Raises:
        ValueError: if a forecast path does not have a recognised layout
    '''

    path_to_full_set = f'/diploma_info/datalake/processed_data/{company}_{period}.csv'

    full_set = _read_dated_csv(path_to_full_set, 'date')
    df_preds = full_set.loc[:, ['close']].copy()

    for exp_path in file_paths_to_pred:

        exp_name = _experiment_name(exp_path)

        pred = _read_dated_csv(exp_path, 'date_time')
        pred.index.name = 'date'
        pred.columns = [exp_name]

        # cell-by-cell copy: a date missing from the target index is added as a new row
        for date in pred.index:
            df_preds.loc[date, exp_name] = pred.loc[date, exp_name]

    df_preds = df_preds.loc[test_start:test_end]

    return df_preds, df_preds.shape[0], df_preds.shape[1]

In [6]:
def get_errors_dataframe(df_preds, model_parameters):
    
    '''
    Counts accuracy value and error statistics of each tested model
    
    Parameters:
        df_preds: dataframe whose first column holds the true values (read as
            `close`) and whose other columns hold predictions; prediction
            column names end with `_{shortened uuid}_{token ending in the day number}`
        model_parameters: iterable of tuples
            (uuid, shortened uuid, model name, train start or duration, hyperparameters, features)
    
    Returns:
        dataframe with one row per prediction column. Errors are computed only on
        rows where the prediction is not NaN:
            abs error = true - prediction (signed), rounded to 2 digits
            rel error = |prediction / true - 1| * 100, rounded to 2 digits
        An empty dataframe is returned when there are no prediction columns.
    
    Raises:
        ValueError: if a prediction column has no matching entry in
            `model_parameters`, or its name does not end with digits
    '''

    # Look metadata up by shortened uuid instead of relying on list positions:
    # a column with no match (or several) must not shift the metadata of other rows.
    # When a shortened uuid occurs more than once the first entry is used.
    params_by_short_uuid = {}
    for params in model_parameters:
        params_by_short_uuid.setdefault(params[1], params)

    models = df_preds.columns[1:]
    rows = []

    for exp_name in models:

        short_uuid = exp_name.split('_')[-2]
        if short_uuid not in params_by_short_uuid:
            raise ValueError(f'No experiment metadata found for prediction column {exp_name!r}')
        params = params_by_short_uuid[short_uuid]

        # take all trailing digits, so day 10 is read as 10 and not as 0
        last_token = exp_name.split('_')[-1]
        day_digits = last_token[len(last_token.rstrip('0123456789')):]

        pred = df_preds[exp_name].dropna()
        orig = df_preds.loc[pred.index, 'close'].values

        abs_err = pd.Series(np.around(orig - pred.values, 2))
        rel_err = pd.Series(np.around(abs(pred.values / orig - 1) * 100, 2))

        len_df_pred = len(pred)
        n_less_5 = int((rel_err < 5).sum())

        # no prediction inside the window: report NaN instead of dividing by zero
        if len_df_pred:
            share_less_5 = round((n_less_5 / len_df_pred) * 100, 2)
        else:
            share_less_5 = np.nan

        rows.append({
            # counters are stored as floats, the dtype these columns had
            # when the frame was filled cell by cell
            'forecasted_day': float(int(day_digits)),
            'n_days_forecasted': float(len_df_pred),
            'n_days_err_less_5': float(n_less_5),
            '%_days_err_less_5': share_less_5,
            'accuracy': np.around(100 - rel_err.mean(), 4),

            'mean_abs_err': np.around(abs_err.mean(), 4),
            'mean_rel_err': np.around(rel_err.mean(), 4),

            'median_abs_err': np.around(abs_err.median(), 4),
            'median_rel_err': np.around(rel_err.median(), 4),

            'q25_abs_err': np.around(abs_err.quantile(0.25), 4),
            'q25_rel_err': np.around(rel_err.quantile(0.25), 4),

            'q75_abs_err': np.around(abs_err.quantile(0.75), 4),
            'q75_rel_err': np.around(rel_err.quantile(0.75), 4),

            'max_abs_err': np.around(abs_err.max(), 4),
            'max_rel_err': np.around(rel_err.max(), 4),

            'model': params[2],
            'train_start/duration': params[-3],
            'features': f"{params[-1]}",
            'hyperparameters': f"{params[-2]}",
            'experiment_uuid': params[0],
        })

    if not rows:
        return pd.DataFrame()

    # build the frame once; column order follows the key order of the row dicts
    return pd.DataFrame(rows, index=list(models))

In [7]:
# Companies and periods to evaluate; every (company, period) pair is processed.
companies = ["AMAZON", "APPLE", "GOOGLE", "META", "NETFLIX"]
time_period = ["daily"]
# time_period = ["daily", "weekly", "monthly"]

models_list = ['xgboost', 'lightgbm', 'random_forest', 'linear_regression']

# Bounds of the evaluation window handed to load_preds.
test_start = '2024-01-01'
test_end = '2024-02-08'


# Errors are not caught here: a failure for one pair stops the whole run.
for company, period in itertools.product(companies, time_period):

    print(company, period)

    # get_paths receives the lower-cased company name, load_preds the name as listed above.
    file_paths_to_pred, file_paths_to_info, model_parameters = get_paths(
        company.lower(), period, models_list
    )

    df_preds, n_rows, n_cols = load_preds(
        company, period, test_start, test_end, file_paths_to_pred
    )

    stats = get_errors_dataframe(df_preds, model_parameters)

    # One csv per pair, with the current date (YYYYMMDD) in the file name.
    stats.to_csv(
        os.path.join(
            '/diploma_info/datalake/statistical_result/regression',
            f'statistical_result_{company.lower()}_{period}_{datetime.now().strftime("%Y%m%d")}.csv',
        )
    )

AMAZON daily
36 180
APPLE daily
36 180
GOOGLE daily
36 180
META daily
36 180
NETFLIX daily
36 180
