In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from pylab import rcParams
rcParams['figure.figsize'] = 12,8

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import xgboost
from xgboost import XGBClassifier, XGBRegressor

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

In [4]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [5]:
import warnings
warnings.simplefilter(action="ignore")

## Initialize global variables

__here we will set all the variables used in both classification and regression problems__

In [20]:
date_parse = lambda dates: pd.to_datetime(dates)
    
companies = ["AMAZON", "APPLE", "GOOGLE", "META", "NETFLIX"]
time_period = ["daily"]

full_sets = {}

for company, period in itertools.product(companies, time_period):
    path = f"/diploma_info/datalake/processed_data/{company}_{period}.csv"

    full_sets[company.lower()] = pd.read_csv(
        path,
        parse_dates=["date"],
        date_parser=date_parse,
        index_col=["date"],
    )

train_end = full_sets[companies[0].lower()].iloc[-1].name
test_start = train_end +timedelta(days=1)
test_end = test_start + timedelta(days=1)

Timestamp('2024-05-08 00:00:00')

In [None]:
test_start = datetime(full_set.loc[test_start:].index[0].year, full_set.loc[test_start:].index[0].month, full_set.loc[test_start:].index[0].day)
train_end = datetime(full_set.loc[:train_end].index[-1].year, full_set.loc[:train_end].index[-1].month, full_set.loc[:train_end].index[-1].day)

In [None]:
def data(day, X_full_set, y_full_set, train_start, config, forecast_steps):

    X_train = X_full_set.loc[train_start:config["train_end"]]
    X_test = X_full_set.loc[config["test_start"]+timedelta(days=day):
                    config["test_start"]+timedelta(days=day+forecast_steps)]
    y_train = y_full_set.loc[train_start:config["train_end"]]
    y_test = y_full_set.loc[config["test_start"]+timedelta(days=day):
                    config["test_start"]+timedelta(days=day+forecast_steps)]
    
    return X_train, X_test, y_train, y_test

In [None]:
def standardize_mean_values(day, df_test, df_train, full_set, config):
    
    agg_cols = [col for col in config['train_features'] if col.endswith('_lag_1')] + \
               ['diff_open_value', 'open-prev_close', 'diff_close_value', 'growth_open'] + \
               [col for col in config['train_features'] if "close_m" in col]
    
    for agg in agg_cols:
        if agg in df_test.columns:
            try:
                num = df_test.loc[config["test_start"]+timedelta(days=day), agg]

            except KeyError as e:
                num = df_train[agg].iloc[-1]

            finally:

                _df = df_test.loc[config["test_start"]+timedelta(days=day):, agg]
                _df = _df.replace(_df.values, num)

#                 print(df_test.loc[config["test_start"]+timedelta(days=day):, agg], _df.values.ravel())

                df_test.loc[config["test_start"]:, agg] = _df.values.ravel()
    
    if 'open' in config['train_features']:
        
        idx_1 = df_test.iloc[1].name
        df_test.loc[idx_1, 'open'] = full_set.loc[idx_1, 'new_open']
        
        for n in range(df_test.loc[idx_1+timedelta(days=1):].shape[0]):
            
            _date = df_test.loc[idx_1+timedelta(days=1):].iloc[n].name
            df_test.loc[_date, 'open'] = full_set.loc[df_test.iloc[1+n].name, 'open'] + full_set.loc[idx_1-timedelta(days=5):idx_1, 'diff_open_value_mean_3_days'].mean()
        
    
    
    return df_test

In [None]:
def add_predictions(day, model_name, df_preds, y_pred_df, y_test, config):
    
    dates = y_test.index
    
    for date in dates:
        step_day = int((date-(config["test_start"]+timedelta(days=day))).days)
        df_preds.loc[date.strftime("%Y-%m-%d"), f'd-{step_day}'] = y_pred_df.loc[date.strftime("%Y-%m-%d"), 0]
    
    return df_preds

In [None]:
def estimations(day, df_stats, y_pred_df, y_test, config, problem):
    
    dates = y_test.index
    
    for date in dates:
        step_day = int((date-(config["test_start"]+timedelta(days=day))).days)

        try:
            pred = y_pred_df.loc[date].values[0]
            real = y_test.loc[date].values[0]

            if problem == 'regression':
                err = abs(pred / real - 1) * 100

                df_stats.loc[date, f'd-{step_day}' + '_total_abs_error'] = np.round(abs(pred-real))
                df_stats.loc[date, f'd-{step_day}' + '_total_relative_error'] = np.round(abs(pred / real - 1), 4) * 100
                df_stats.loc[date, f'd-{step_day}' + '_more_5'] = 1 if (err > 5) else 0
                df_stats.loc[date, f'd-{step_day}' + '_more_10'] = 1 if (err > 10) else 0
                
            
            elif problem == 'classification':
                err = pred-real

                df_stats.loc[date, f'd-{step_day}' + '_is_true'] = 1 if (err == 0) else 0
                
                
        except ZeroDivisionError as e:
            print(e)

            df_stats.loc[date, :] = 0

        except KeyError as e:
            print(e)

            df_stats.loc[date, :] = 0
            
    
    return df_stats

In [None]:
def write_predictions(forecast_steps, df_preds, config, research_task_uuid, problem):
    
    for step in range(forecast_steps+1):
        try:
            pred = df_preds.loc[:, [f'd-{step}']].dropna().sort_index()
            pred.index.name = 'date_time'

            path_to_files = os.path.join(config['path_to_result'], "forecast", problem,
                                         config['company'], config['forecast_frequency'], config['model_name'], 
                                         f"research_task_{research_task_uuid}", 
                                         f"{config['model_name']}_{config['unique_uuid']}")
            if not os.path.isdir(path_to_files):
                os.makedirs(path_to_files)
                
            file_name = os.path.join(path_to_files, 
                    f"forecast_d-{step}_{config['model_name']}.csv")

            pd.DataFrame(pred).to_csv(file_name)

        except KeyError:
            pass

## CLASSIFICATION

__forecast whether the price will rise or fall in next several days__

In [None]:
problem = 'classification'
models_list = ['xgboost', 'lightgbm', 'random_forest', 'knear_neighbors']

## REGRESSION

__forecast the price itself for next several days__

In [22]:
problem = 'regression'
models_list = ['xgboost', 'lightgbm', 'random_forest', 'linear_regression']

In [49]:
models = {
    'amazon': '1e38d412-0dc3-11ef-8dc7-c0e434d84b22',
    'apple': '29753b7f-0dc3-11ef-b287-c0e434d84b22',
    'meta': '3d63daab-0dc3-11ef-966d-c0e434d84b22',
    'google': '343a246e-0dc3-11ef-9323-c0e434d84b22',
    'netflix': '462af9c7-0dc3-11ef-ab57-c0e434d84b22'
}

paths_to_configs = []
configs = []

for i in range(len(models.items())):  
    comp = list(models.keys())[i]
    paths_to_configs += glob(f'/diploma_info/datalake/wf_result/{problem}/{comp}/daily/*/research_task_*/*_{models[comp]}.yaml')

print(paths_to_configs[0])
    
for file in paths_to_configs: 
    with open(file, 'r') as f:
        configs += yaml.safe_load(f)

/diploma_info/datalake/wf_result/regression/amazon/daily\linear_regression\research_task_1e38d407-0dc3-11ef-8639-c0e434d84b22\linear_regression_1e38d412-0dc3-11ef-8dc7-c0e434d84b22.yaml


In [44]:
configs

['company',
 'forecast_frequency',
 'forecast_periods',
 'model_hyperparameters',
 'model_name',
 'path_to_result',
 'problem',
 'target_column',
 'test_end',
 'test_start',
 'train_end',
 'train_features',
 'train_start',
 'unique_uuid',
 'company',
 'forecast_frequency',
 'forecast_periods',
 'model_hyperparameters',
 'model_name',
 'path_to_result',
 'problem',
 'target_column',
 'test_end',
 'test_start',
 'train_end',
 'train_features',
 'train_start',
 'unique_uuid',
 'company',
 'forecast_frequency',
 'forecast_periods',
 'model_hyperparameters',
 'model_name',
 'path_to_result',
 'problem',
 'target_column',
 'test_end',
 'test_start',
 'train_end',
 'train_features',
 'train_start',
 'unique_uuid',
 'company',
 'forecast_frequency',
 'forecast_periods',
 'model_hyperparameters',
 'model_name',
 'path_to_result',
 'problem',
 'target_column',
 'test_end',
 'test_start',
 'train_end',
 'train_features',
 'train_start',
 'unique_uuid',
 'company',
 'forecast_frequency',
 'forecas