In [1]:
from IPython.display import display, HTML
from pylab import rcParams

# Inject a CSS rule that sets elements with class "container" to 90% width,
# so wide tables and plots get more horizontal room in the rendered page.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Default figure size used by matplotlib for every figure created afterwards.
rcParams['figure.figsize'] = (12, 8)

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import os
from glob import glob

import yaml
import itertools


import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
def get_paths(company, period, models_list):
    '''
    Find paths to result (dir /wf_result/) and forecast files of experiments.

    For every model in ``models_list`` the most recently created
    ``research_task_*`` directory (by ``os.path.getctime``) is looked up under
    both the ``forecast/regression`` and the ``wf_result/regression`` trees.
    The forecast CSV files and the experiment YAML files found inside those
    directories are collected, and every YAML file is parsed to extract the
    experiment description.

    A model for which either tree contains no ``research_task_*`` directory
    is skipped with a printed notice instead of aborting the whole call.

    Parameters
    ----------
    company : str
        Company directory name, exactly as it is spelled on disk.
    period : str
        Period directory name (e.g. ``'daily'``).
    models_list : iterable of str
        Model directory names to look up.

    Returns
    -------
    file_paths_to_pred : list of str
        Paths of the ``forecast_*.csv`` files.
    file_paths_to_info : list of str
        Paths of the experiment ``*.yaml`` files.
    model_parameters : list of tuple
        One tuple per YAML file:
        ``(uuid, shortened uuid, model name, train start or duration,
        hyperparameters, features)``.  The shortened uuid is the
        second-to-last ``-``-separated part of ``unique_uuid``.

    Raises
    ------
    KeyError
        If a YAML file lacks one of the expected keys.
    '''

    basic_path = '/diploma_info/datalake/'

    file_paths_to_pred = []
    file_paths_to_info = []

    for model in models_list:
        path_to_res = f'{company}/{period}/{model}'

        forecast_tasks = glob(basic_path + f'forecast/regression/{path_to_res}/research_task_*')
        info_tasks = glob(basic_path + f'wf_result/regression/{path_to_res}/research_task_*')

        # max() on an empty list raises an unhelpful ValueError; a model
        # without results should not prevent the other models from loading.
        if not forecast_tasks or not info_tasks:
            print(f'No research_task_* results found for {path_to_res}, skipping')
            continue

        latest_forecast_task = max(forecast_tasks, key=os.path.getctime)
        latest_info_task = max(info_tasks, key=os.path.getctime)

        file_paths_to_pred += glob(latest_forecast_task + f'/{model}_*/forecast_*.csv')
        file_paths_to_info += glob(latest_info_task + f'/{model}_*.yaml')

    print(len(file_paths_to_info), len(file_paths_to_pred))

    model_parameters = []
    for file in file_paths_to_info:
        with open(file, 'r') as f:
            res = yaml.safe_load(f)

        uuid = res['unique_uuid']
        # Experiments record either a training-history duration or an
        # explicit training start; the duration takes precedence.
        if 'duration_training_history' in res:
            train_start_or_duration = res['duration_training_history']
        else:
            train_start_or_duration = res['train_start']

        model_parameters.append((
            uuid,
            uuid.split('-')[-2],
            res['model_name'],
            train_start_or_duration,
            res['model_hyperparameters'],
            res['train_features'],
        ))

    return file_paths_to_pred, file_paths_to_info, model_parameters

In [5]:
def _experiment_name(exp_path):
    '''Build the prediction column name from the tokens of a forecast file path.'''
    n_parts = len(exp_path.split('_'))
    if n_parts == 7:
        return (exp_path.split('.')[0].split('_')[-1]
                + '_' + exp_path.split('-')[-3]
                + '_' + exp_path.split('_')[-2])
    if n_parts == 10:
        return ("_".join(exp_path.split('.')[0].split('_')[-2:])
                + '_' + exp_path.split('-')[-3]
                + '_' + exp_path.split('_')[-3])
    # Previously this case left the name undefined (or silently reused the
    # name of the previous file, overwriting that experiment's column).
    raise ValueError(
        f'Unsupported forecast path layout ({n_parts} "_"-separated parts): {exp_path}'
    )


def load_preds(company, period, test_start, test_end, file_paths_to_pred,
               processed_dir='/diploma_info/datalake/processed_data'):
    '''
    Makes dataframe with true values and predictions.

    The ``close`` column of ``{processed_dir}/{company}_{period}.csv`` is the
    first column of the result; one extra column is added per forecast file,
    named from the tokens of the file path.  Dates present in a forecast file
    but absent from the true-value series are added as new rows (with a
    missing ``close``).  The frame is finally restricted to
    ``test_start``..``test_end`` (label slice, both ends inclusive).

    Parameters
    ----------
    company : str
        Company part of the processed-data file name.
    period : str
        Period part of the processed-data file name.
    test_start, test_end : str
        Bounds of the evaluation window, used for label slicing on the
        date index.
    file_paths_to_pred : iterable of str
        Forecast CSV files.  Each must have a ``date_time`` column and exactly
        one value column; dates are expected in ``%Y-%m-%d`` format.
    processed_dir : str, optional
        Directory holding the processed true-value CSV files.

    Returns
    -------
    df_preds : pandas.DataFrame
        True values and predictions indexed by date.
    n_rows : int
        Number of rows of ``df_preds``.
    n_cols : int
        Number of columns of ``df_preds``.

    Raises
    ------
    ValueError
        If a forecast path does not split into 7 or 10 ``_``-separated parts,
        or a date does not match ``%Y-%m-%d``.
    '''

    date_format = '%Y-%m-%d'
    path_to_full_set = f'{processed_dir}/{company}_{period}.csv'

    # Dates are parsed explicitly after loading: read_csv's ``date_parser``
    # argument is deprecated in recent pandas versions.
    full_set = pd.read_csv(path_to_full_set, index_col='date')
    full_set.index = pd.to_datetime(full_set.index, format=date_format)

    df_preds = full_set.loc[:, ['close']].copy()

    for exp_path in file_paths_to_pred:

        exp_name = _experiment_name(exp_path)

        pred = pd.read_csv(exp_path, index_col='date_time')
        pred.index = pd.to_datetime(pred.index, format=date_format)
        pred.index.name = 'date'
        pred.columns = [exp_name]

        # Add rows for forecast dates that the frame does not contain yet,
        # then copy all predictions at once (aligned on the date index)
        # instead of assigning them cell by cell.
        new_dates = pred.index.difference(df_preds.index)
        if len(new_dates):
            df_preds = df_preds.reindex(df_preds.index.append(new_dates))
        if exp_name not in df_preds.columns:
            df_preds[exp_name] = np.nan
        df_preds.loc[pred.index, exp_name] = pred[exp_name]

    # Label slicing needs an ordered index; added dates may break the order.
    if not df_preds.index.is_monotonic_increasing:
        df_preds = df_preds.sort_index(kind='mergesort')

    df_preds = df_preds.loc[test_start:test_end]

    return df_preds, df_preds.shape[0], df_preds.shape[1]

In [6]:
def get_errors_dataframe(df_preds, model_parameters):
    '''
    Counts accuracy value of each tested model on each forecast horizon.

    The first column of ``df_preds`` is skipped and every other column is
    treated as one experiment; true values are read from the ``close``
    column.  For each experiment the days with a prediction are compared with
    the true values and summarised into one row of the result.

    Parameters
    ----------
    df_preds : pandas.DataFrame
        Frame with a ``close`` column first, followed by one prediction
        column per experiment.  The part of the column name before the last
        ``_`` must end with the shortened experiment uuid, and the name must
        end with the forecasted day number.
    model_parameters : iterable of tuple
        Experiment descriptions:
        ``(uuid, shortened uuid, model name, train start or duration,
        hyperparameters, features)``.

    Returns
    -------
    pandas.DataFrame
        One row per experiment (indexed by column name) with:
        the forecasted day, the number of forecasted days, the number and
        percentage of days with a relative error below 5 %, ``accuracy``
        (100 minus the mean relative error in percent), mean / median /
        quartile / maximum of the absolute and relative errors, and the
        experiment description.  Description fields are left empty for an
        experiment whose shortened uuid is not in ``model_parameters``.
    '''

    # Look metadata up by shortened uuid.  Matching by position breaks as soon
    # as one experiment has no (or more than one) description; the first
    # description wins when a shortened uuid occurs more than once.
    params_by_short_uuid = {}
    for params in model_parameters:
        params_by_short_uuid.setdefault(params[1], params)

    models = df_preds.columns[1:]

    records = []
    for exp_name in models:

        pred = df_preds[exp_name].dropna()
        orig = df_preds.loc[pred.index, 'close'].values

        # Daily errors are rounded to 2 decimals before being aggregated.
        # The absolute error is the magnitude of the deviation, so that
        # over- and under-estimates do not cancel out in the statistics.
        abs_err = pd.Series(np.around(np.abs(orig - pred.values), 2))
        rel_err = pd.Series(np.around(np.abs(pred.values / orig - 1) * 100, 2))

        n_days = len(pred)
        n_less_5 = int((rel_err < 5).sum())
        # An experiment without predictions in the window has no percentage.
        pct_less_5 = round((n_less_5 / n_days) * 100, 2) if n_days else np.nan

        # The forecasted day is the trailing run of digits of the name, so
        # horizons of 10 days and more are not truncated to their last digit.
        day_digits = exp_name[len(exp_name.rstrip('0123456789')):]

        record = {
            'forecasted_day': int(day_digits),
            'n_days_forecasted': n_days,
            'n_days_err_less_5': n_less_5,
            '%_days_err_less_5': pct_less_5,
            'accuracy': np.around(100 - rel_err.mean(), 4),
            'mean_abs_err': np.around(abs_err.mean(), 4),
            'mean_rel_err': np.around(rel_err.mean(), 4),
            'median_abs_err': np.around(abs_err.median(), 4),
            'median_rel_err': np.around(rel_err.median(), 4),
            'q25_abs_err': np.around(abs_err.quantile(0.25), 4),
            'q25_rel_err': np.around(rel_err.quantile(0.25), 4),
            'q75_abs_err': np.around(abs_err.quantile(0.75), 4),
            'q75_rel_err': np.around(rel_err.quantile(0.75), 4),
            'max_abs_err': np.around(abs_err.max(), 4),
            'max_rel_err': np.around(rel_err.max(), 4),
        }

        params = params_by_short_uuid.get(exp_name.split('_')[-2])
        if params is None:
            record.update({
                'model': None,
                'train_start/duration': None,
                'features': None,
                'hyperparameters': None,
                'experiment_uuid': None,
            })
        else:
            record.update({
                'model': params[2],
                'train_start/duration': params[-3],
                'features': f"{params[-1]}",
                'hyperparameters': f"{params[-2]}",
                'experiment_uuid': params[0],
            })

        records.append(record)

    # Build the frame once instead of growing it cell by cell.
    stats_df = pd.DataFrame(records, index=list(models))

    return stats_df

In [7]:
# Run configuration: which companies / periods / models to evaluate and the
# evaluation window passed to load_preds.
companies = ["AMAZON", "APPLE", "GOOGLE", "META", "NETFLIX"]
time_period = ["daily"]
# time_period = ["daily", "weekly", "monthly"]

models_list = ['xgboost', 'lightgbm', 'random_forest', 'linear_regression']

test_start = '2024-01-01'
test_end = '2024-02-08'

# Directory receiving one statistics CSV per (company, period); it is created
# up front so that to_csv does not fail on a missing directory.
stats_dir = '/diploma_info/datalake/statistical_result/regression'
os.makedirs(stats_dir, exist_ok=True)


for company, period in itertools.product(companies, time_period):

    print(company, period)

    # get_paths receives the lower-cased company name, while load_preds
    # receives the name as listed above.
    file_paths_to_pred, file_paths_to_info, model_parameters = get_paths(company.lower(), period, models_list)

    df_preds, n_rows, n_cols = load_preds(company, period, test_start, test_end, file_paths_to_pred)

    stats = get_errors_dataframe(df_preds, model_parameters)

    # The file name carries the run date (YYYYMMDD), so a re-run on the same
    # day overwrites the previous result.
    stats.to_csv(os.path.join(stats_dir,
                              f'statistical_result_{company.lower()}_{period}_{datetime.now().strftime("%Y%m%d")}.csv'))

AMAZON daily
30 150
       0
0   1.16
1   0.89
2   1.78
3   2.06
4   4.59
5   2.79
6   3.87
7   2.26
8   0.79
9   0.97
10  0.02
11  1.19
12  2.36
13  0.89
14  1.68
15  1.65
16  2.20
17  1.93
18  2.53
19  0.82
20  1.28
21  3.69
22  8.61
23  1.36
24  1.06
25  1.86
26  1.46
        0
0    2.16
1    1.78
2    1.32
3    1.73
4    6.02
5    4.28
6    4.77
7    1.91
8    0.97
9    0.02
10   1.19
11   2.36
12   2.01
13   1.68
14   2.21
15   2.20
16   3.04
17   5.94
18   1.14
19   3.29
20   1.32
21  10.71
22  10.94
23   0.68
24   1.86
25   1.46
        0
0    4.91
1    1.32
2    1.73
3    0.20
4    7.47
5    5.17
6    4.42
7    0.97
8    0.02
9    1.19
10   2.36
11   2.01
12   2.79
13   2.21
14   2.76
15   3.04
16   5.94
17   4.61
18   1.28
19   0.65
20   8.51
21  10.94
22  10.33
23   1.48
24   1.46
        0
0    4.43
1    4.59
2    0.20
3    1.34
4    8.33
5    4.83
6    0.97
7    0.02
8    1.19
9    2.36
10   2.01
11   2.79
12   3.31
13   2.76
14   3.59
15   3.23
16   4.61
17   2.27
18   1.3

        0
0    2.64
1    8.18
2    3.83
3    5.16
4   10.57
5    0.21
6    0.96
7    4.49
8    5.04
9    2.77
10   3.50
11   3.70
12   5.17
13   4.67
14   2.74
15   2.58
16   1.93
17   6.21
18   8.49
19   9.16
20   5.21
21  10.47
22  11.13
       0
0   1.18
1   0.65
2   2.03
3   2.30
4   4.58
5   2.56
6   3.86
7   2.24
8   1.20
9   0.85
10  0.01
11  1.17
12  2.35
13  1.30
14  2.08
15  1.67
16  2.22
17  1.95
18  2.54
19  0.43
20  1.26
21  4.09
22  8.26
23  1.38
24  1.05
25  1.86
26  1.46
        0
0    2.17
1    2.03
2    1.56
3    1.74
4    6.01
5    4.05
6    4.75
7    1.89
8    0.96
9    0.09
10   1.17
11   2.35
12   1.99
13   2.08
14   2.61
15   2.22
16   3.06
17   5.93
18   1.16
19   2.88
20   1.33
21  11.08
22  10.93
23   0.71
24   1.86
25   1.46
        0
0    4.93
1    1.56
2    1.74
3    0.22
4    7.45
5    4.95
6    4.41
7    0.96
8    0.01
9    1.07
10   2.35
11   1.99
12   2.77
13   2.61
14   3.16
15   3.06
16   5.93
17   4.59
18   1.26
19   0.25
20   8.53
21  10.93
22  10.3

        0
0   99.33
1   99.33
2   99.31
3   99.31
4   99.33
5   99.34
6   99.35
7   99.36
8   99.35
9   99.35
10  99.34
11  99.35
12  99.36
13  99.35
14  99.36
15  99.36
16  99.37
17  99.37
18  99.38
19  99.37
20  99.36
21  99.37
22  99.42
23  99.41
24  99.41
25  99.41
26  99.41
        0
0   99.33
1   99.31
2   99.31
3   99.33
4   99.34
5   99.35
6   99.36
7   99.35
8   99.35
9   99.34
10  99.35
11  99.36
12  99.35
13  99.36
14  99.36
15  99.37
16  99.37
17  99.38
18  99.37
19  99.36
20  99.37
21  99.42
22  99.41
23  99.41
24  99.41
25  99.41
        0
0   99.31
1   99.31
2   99.33
3   99.34
4   99.35
5   99.36
6   99.35
7   99.35
8   99.34
9   99.35
10  99.36
11  99.35
12  99.36
13  99.36
14  99.37
15  99.37
16  99.38
17  99.37
18  99.36
19  99.37
20  99.42
21  99.41
22  99.41
23  99.41
24  99.41
        0
0   99.31
1   99.33
2   99.34
3   99.35
4   99.36
5   99.35
6   99.35
7   99.34
8   99.35
9   99.36
10  99.35
11  99.36
12  99.36
13  99.37
14  99.37
15  99.38
16  99.37
17  99.36


       0
0   0.85
1   0.28
2   0.52
3   0.58
4   1.76
5   2.22
6   1.30
7   0.35
8   0.24
9   0.04
10  0.32
11  0.62
12  1.13
13  1.12
14  0.91
15  0.43
16  0.72
17  0.63
18  1.44
19  0.82
20  0.96
21  2.31
22  1.92
23  0.62
24  0.15
25  0.86
26  0.38
       0
0   2.29
1   4.01
2   1.76
3   1.72
4   2.70
5   3.13
6   0.66
7   1.77
8   0.04
9   1.32
10  0.72
11  1.25
12  1.13
13  1.45
14  1.02
15  0.69
16  1.56
17  1.44
18  0.28
19  4.10
20  1.34
21  8.77
22  0.38
23  2.62
24  1.65
25  0.40
       0
0   3.18
1   0.59
2   1.72
3   2.66
4   2.56
5   1.21
6   1.06
7   0.04
8   1.31
9   0.29
10  0.98
11  1.13
12  1.46
13  0.89
14  0.38
15  0.91
16  1.44
17  0.28
18  3.77
19  1.17
20  8.96
21  0.38
22  2.87
23  0.06
24  1.37
       0
0   0.26
1   1.65
2   2.66
3   2.52
4   1.10
5   1.04
6   0.04
7   1.31
8   0.29
9   0.66
10  0.71
11  1.46
12  0.88
13  0.40
14  1.01
15  1.22
16  0.28
17  3.77
18  1.17
19  9.03
20  0.58
21  2.87
22  0.30
23  0.52
       0
0   2.33
1   2.14
2   2.52
3   1.07
4

KeyboardInterrupt: 