In [1]:
# Inject a CSS rule that sets the notebook page's `.container` element to 70% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

# Default size (width, height) for matplotlib figures created in this notebook.
from pylab import rcParams
rcParams['figure.figsize'] = 12,8

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns


import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

import matplotlib.pyplot as plt

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below - consider narrowing the filter to specific categories.
warnings.simplefilter(action="ignore")

## Forecasting

#### Initialize all required variables, prepare datasets

In [5]:
def init(company, period):
    """Load one company's price history and prepare it for walk-forward validation.

    The CSV ``/diploma_info/datalake/raw_data/{company}_{period}.csv`` is read
    with its ``Date`` column parsed and used as the index (renamed to ``date``),
    column names are lower-cased, the target and engineered features are added
    via ``create_target_value`` / ``create_new_features``, and remaining NaNs
    are replaced with 0.

    Parameters
    ----------
    company : str
        Company name used in the CSV file name.
    period : str
        Sampling period used in the CSV file name.

    Returns
    -------
    tuple
        ``(full_set, train_end, test_start, test_end, train_features_set)``.
        ``train_end`` is snapped back to the last date present in the data on or
        before 2023-12-31 and ``test_start`` forward to the first date present
        on or after 2024-01-01 (both at midnight); ``test_end`` is the fixed
        2024-02-08.

    Raises
    ------
    IndexError
        If the data has no row on/after the nominal test start or on/before the
        nominal train end.
    """
    train_end = datetime(2023, 12, 31)
    test_start = datetime(2024, 1, 1)
    test_end = datetime(2024, 2, 8)

    train_features_set = [
        ['close_lag_1', 'volume_lag_1']
    ]

    path = f"/diploma_info/datalake/raw_data/{company}_{period}.csv"

    # `parse_dates` already converts the column with pandas' own datetime
    # parser; the extra `date_parser=pd.to_datetime` argument was redundant and
    # is deprecated since pandas 2.0, so it is no longer passed.
    full_set = pd.read_csv(
        path,
        parse_dates=["Date"],
        index_col=["Date"],
    )
    full_set.index.name = 'date'
    full_set.columns = [c.lower() for c in full_set.columns]

    full_set = create_target_value(full_set)
    full_set = create_new_features(full_set, test_end)

    full_set = full_set.fillna(0)

    # Snap the nominal boundaries onto dates that actually exist in the data
    # (e.g. skip weekends/holidays), dropping any time-of-day component.
    first_test_day = full_set.loc[test_start:].index[0]
    last_train_day = full_set.loc[:train_end].index[-1]
    test_start = datetime(first_test_day.year, first_test_day.month, first_test_day.day)
    train_end = datetime(last_train_day.year, last_train_day.month, last_train_day.day)

    return full_set, train_end, test_start, test_end, train_features_set

In [6]:
def create_target_value(df):
    """Add the row-over-row change of ``close`` and the binary growth label.

    Two columns are added to ``df`` (the frame is modified in place and the
    same object is returned):

    * ``diff_value`` - ``close`` minus the previous row's ``close``; the first
      row has no predecessor and gets 0. The column is floating point.
    * ``growth`` - 1 where ``diff_value`` is strictly positive, 0 otherwise
      (including rows where the difference is NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``close`` column, ordered chronologically.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the two columns added. An empty frame is returned
        with two empty columns instead of raising.
    """
    # Vectorised replacement of the former per-row `iloc` loop.
    close_change = df['close'].diff()
    if len(close_change):
        # diff() yields NaN for the first row; the contract is 0 there.
        close_change.iloc[0] = 0

    # Assign raw arrays so the result does not depend on index alignment
    # (relevant when the index contains duplicate labels).
    df['diff_value'] = close_change.to_numpy()
    df['growth'] = (close_change > 0).astype('int64').to_numpy()

    return df

In [7]:
def create_new_features(df, test_end):
    """Build calendar, lag and rolling-window features up to ``test_end``.

    Columns added:

    * ``year``, ``month``, ``day``, ``day_of_week``, ``week_of_year`` - taken
      from the datetime index.
    * ``close_lag_1``, ``volume_lag_1`` - previous row's value; the first row is
      back-filled with the next available value.
    * ``close_{max,min,mean}_{3,5,7}_days`` - rounded rolling statistics of
      ``close`` over the preceding ``window`` rows. ``closed='left'`` excludes
      the current row, so a row never sees its own close; rows with fewer than
      ``window`` predecessors are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``close`` and ``volume`` columns and a chronologically sorted
        ``DatetimeIndex``.
    test_end : datetime.datetime
        Last day (inclusive) to keep in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame truncated at ``test_end`` with the feature columns appended.
        The input frame is left untouched (it used to receive the calendar and
        lag columns as a side effect while a different object was returned).
    """
    # Truncate once, up front. Every feature below only looks backwards, so the
    # values for the kept rows are the same as when computed on the full frame.
    df = df.loc[:test_end.strftime("%Y%m%d")].copy()

    df['year'] = df.index.year
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['day_of_week'] = df.index.weekday
    df['week_of_year'] = df.index.isocalendar()['week'].astype('int').to_numpy()
    df['close_lag_1'] = df['close'].shift(1).bfill()
    df['volume_lag_1'] = df['volume'].shift(1).bfill()

    for window in [3, 5, 7]:
        rolling_close = df['close'].rolling(window=window, closed='left')
        # Direct column assignment replaces the former reset_index/merge/
        # set_index round trip, which multiplied rows whenever a date occurred
        # more than once and required the index to be named 'date'.
        for stat in ('max', 'min', 'mean'):
            df[f'close_{stat}_{window}_days'] = getattr(rolling_close, stat)().round()

    return df

In [8]:
def define_parameters(company, period, train_end, test_start, test_end, train_features_set, forecast_steps, models_dict, problem):
    """Build one run configuration per (model, feature set) combination.

    Parameters
    ----------
    company, period : str
        Stored lower-cased under ``'company'`` / ``'forecast_frequency'``.
    train_end, test_start, test_end : datetime.datetime
        Boundaries of the training and test windows, stored as given.
    train_features_set : list[list[str]]
        Each inner list is one set of feature columns to train on.
    forecast_steps : int
        Stored under ``'forecast_periods'``.
    models_dict : dict
        Only the values are used; ``'knear_neighbors'`` is the one supported
        model name.
    problem : str
        Stored under ``'problem'``.

    Returns
    -------
    list[dict]
        One config per supported model and feature set. Every config gets its
        own ``unique_uuid`` and its own estimator instance. Unsupported model
        names are reported with ``print`` and skipped.

    Notes
    -----
    ``run_wfv`` additionally honours the optional keys ``'train_start'`` and
    ``'duration_training_history'``; they are not set here.
    """
    list_of_configs = []

    for md in models_dict.values():

        if md != 'knear_neighbors':
            # Skip instead of falling through: continuing used to crash on
            # `None.get_params()` or silently reuse the previous model.
            print('Unknown model')
            continue

        for train_features in train_features_set:
            # A fresh estimator per config so that fitting one configuration
            # never changes the estimator stored in another.
            model = KNeighborsClassifier(n_jobs=-1)

            list_of_configs.append({
                'unique_uuid': str(uuid.uuid1()),
                'train_end': train_end,
                'test_start': test_start,
                'test_end': test_end,
                'target_column': 'growth',
                'train_features': train_features,
                'path_to_result': '/diploma_info/datalake/',
                'forecast_periods': forecast_steps,
                'model_name': md,
                'model': model,
                'forecast_frequency': period.lower(),
                'company': company.lower(),
                'model_hyperparameters': model.get_params(),
                'problem': problem
            })

    return list_of_configs

#### functions used in wfv service

In [9]:
def data(day, X_full_set, y_full_set, train_start, config, forecast_steps):
    """Cut the train and test slices for one walk-forward step.

    The training slice runs from ``train_start`` to ``config["train_end"]``.
    The test slice runs from ``config["test_start"] + day`` days to
    ``config["test_start"] + day + forecast_steps`` days. Both slices are
    label-based, so both end points are included.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_window = slice(train_start, config["train_end"])
    test_window = slice(
        config["test_start"] + timedelta(days=day),
        config["test_start"] + timedelta(days=day + forecast_steps),
    )

    X_train, X_test = X_full_set.loc[train_window], X_full_set.loc[test_window]
    y_train, y_test = y_full_set.loc[train_window], y_full_set.loc[test_window]

    return X_train, X_test, y_train, y_test

In [10]:
def standardize_mean_values(day, df_test, df_train, config):
    """Overwrite lag/rolling feature columns of the test slice with one constant value.

    The columns handled are ``close_lag_1``, ``volume_lag_1`` and every entry of
    ``config['train_features']`` whose name contains ``"close_m"``; columns that
    are absent from ``df_test`` are skipped.

    For each such column the replacement value is the one stored at
    ``config["test_start"] + day`` days. If that lookup raises ``KeyError``
    (e.g. the date is not in the index), the last value of the same column in
    ``df_train`` is used instead.

    Returns ``df_test``, which is modified in place.
    """
    
    agg_cols = ['close_lag_1', 'volume_lag_1'] + [col for col in config['train_features'] if "close_m" in col]
    
    for agg in agg_cols:
        if agg in df_test.columns:
            try:
                # Value of the feature on the forecast origin of this step.
                num = df_test.loc[config["test_start"]+timedelta(days=day), agg]

            except KeyError as e:
                # Fall back to the most recent training value.
                num = df_train[agg].iloc[-1]

            finally:
                # NOTE(review): `finally` also runs when an exception other than
                # KeyError escapes the `try`; `num` would be unbound in that case.

                _df = df_test.loc[config["test_start"]+timedelta(days=day):, agg]
                # Replace every value that occurs in the selection with `num`.
                # NOTE(review): how NaN entries are treated depends on
                # Series.replace - confirm.
                _df = _df.replace(_df.values, num)

#                 print(df_test.loc[config["test_start"]+timedelta(days=day):, agg], _df.values.ravel())

                # The target selection starts at test_start while the source starts
                # at test_start + day; the lengths only match when df_test has no
                # rows before the forecast origin - presumably guaranteed by the
                # caller, TODO confirm.
                df_test.loc[config["test_start"]:, agg] = _df.values.ravel()
    
    return df_test

In [11]:
def add_predictions(day, model_name, df_preds, y_pred_df, y_test, config):
    """Store the predictions of one walk-forward step in ``df_preds``.

    For every date in ``y_test.index`` the prediction (column ``0`` of
    ``y_pred_df``) is written to row ``YYYY-MM-DD`` and column ``d-{k}``, where
    ``k`` is the number of days between that date and the forecast origin
    ``config["test_start"] + day`` days. ``model_name`` is accepted but unused.

    Returns ``df_preds``, which is enlarged in place.
    """
    for stamp in y_test.index:
        forecast_origin = config["test_start"] + timedelta(days=day)
        horizon = int((stamp - forecast_origin).days)
        row_label = stamp.strftime("%Y-%m-%d")

        df_preds.loc[row_label, f'd-{horizon}'] = y_pred_df.loc[row_label, 0]

    return df_preds

In [12]:
def estimations(day, df_stats, y_pred_df, y_test, config):
    """Record, per test date, whether the prediction matched the true label.

    For every date in ``y_test.index`` the column ``d-{k}_is_true`` of
    ``df_stats`` is set to 1 when the first value of the prediction row equals
    the first value of the true row, otherwise 0. ``k`` is the number of days
    between the date and ``config["test_start"] + day`` days. If a lookup
    fails, the error is printed and 0 is recorded.

    Returns ``df_stats``, which is enlarged in place.
    """
    for stamp in y_test.index:
        horizon = int((stamp - (config["test_start"] + timedelta(days=day))).days)
        flag_column = f'd-{horizon}' + '_is_true'

        try:
            predicted = y_pred_df.loc[stamp].values[0]
            actual = y_test.loc[stamp].values[0]
            mismatch = predicted - actual

            df_stats.loc[stamp, flag_column] = 0 if not (mismatch == 0) else 1

        except (ZeroDivisionError, KeyError) as e:
            print(e)

            df_stats.loc[stamp, flag_column] = 0

    return df_stats

In [13]:
def write_predictions(forecast_steps, df_preds, config, research_task_uuid, problem):
    """Write one CSV of predictions per forecast horizon.

    For every ``step`` in ``0..forecast_steps`` for which ``df_preds`` has a
    column ``d-{step}``, the non-null values of that column are sorted by index
    and saved (index labelled ``date_time``) as::

        {path_to_result}/forecast/{problem}/{company}/{forecast_frequency}/
            {model_name}/research_task_{research_task_uuid}/
            {model_name}_{unique_uuid}/forecast_d-{step}_{model_name}.csv

    Horizons without a column are skipped; the target directory is only created
    when there is something to write.

    Raises
    ------
    KeyError
        If ``config`` lacks one of the keys used to build the path. (A blanket
        ``except KeyError: pass`` used to hide this and silently write nothing.)
    """
    # The directory is the same for every horizon, so build it once.
    path_to_files = os.path.join(config['path_to_result'], "forecast", problem,
                                 config['company'], config['forecast_frequency'], config['model_name'],
                                 f"research_task_{research_task_uuid}",
                                 f"{config['model_name']}_{config['unique_uuid']}")

    for step in range(forecast_steps + 1):
        column = f'd-{step}'
        if column not in df_preds.columns:
            # No prediction was produced for this horizon.
            continue

        pred = df_preds.loc[:, [column]].dropna().sort_index()
        pred.index.name = 'date_time'

        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(path_to_files, exist_ok=True)

        file_name = os.path.join(path_to_files,
                                 f"forecast_d-{step}_{config['model_name']}.csv")
        pred.to_csv(file_name)

### wfv service

In [27]:
def run_wfv(full_set: pd.DataFrame, config: dict, research_task_uuid: str, forecast_steps: int, company: str, models_dict: dict, problem: str):
    """Run walk-forward validation for one configuration and persist the results.

    For every calendar day between ``config['test_start']`` and
    ``config['test_end']`` (inclusive) the model is fitted on the training slice
    and asked to predict the window of ``forecast_steps`` days that starts on
    that day. Predictions and per-date hit flags are accumulated, then:

    * the share of correct ``d-0`` predictions is printed as "Accuracy";
    * predictions are written by ``write_predictions``;
    * the hit flags are saved to
      ``{path_to_result}/wf_result/{problem}/{company}/{forecast_frequency}/``
      ``{model_name}/research_task_{research_task_uuid}/{model_name}_{unique_uuid}.csv``;
    * the config (without the estimator object) is dumped next to it as YAML.

    Parameters
    ----------
    full_set : pandas.DataFrame
        Frame holding the feature columns and the target column.
    config : dict
        One entry produced by ``define_parameters``. ``config['train_start']``
        is filled in here when it is missing and no
        ``duration_training_history`` is configured.
    research_task_uuid : str
        Identifier used in the output folder name.
    forecast_steps : int
        Length of the forecast window in days.
    company, models_dict
        Accepted for interface compatibility; not used in the body.
    problem : str
        Used in the output folder path.

    Notes
    -----
    Steps that raise ``KeyError`` or ``ValueError`` are printed and skipped.
    """
    X_full_set = full_set.loc[:, config['train_features']]
    y_full_set = full_set.loc[:, [config['target_column']]]

    if X_full_set.shape[0] != y_full_set.shape[0]:
        common_index = sorted(set(X_full_set.index) & set(y_full_set.index))
        X_full_set = X_full_set.loc[common_index, :]
        y_full_set = y_full_set.loc[common_index, :]
    print(X_full_set.shape, y_full_set.shape)

    df_preds = pd.DataFrame()
    df_stats = pd.DataFrame()

    # Read the test window from the config. It used to come from the notebook
    # globals `test_end` / `test_start`, which made the function depend on
    # whichever cell happened to run last.
    count_days = (config['test_end'] - config['test_start']).days + 1

    model_name = config['model_name']
    print(model_name)

    model = config['model']

    unique_uuid = config['unique_uuid']

    os.makedirs(config['path_to_result'], exist_ok=True)

    path_folder_result = os.path.join(config['path_to_result'], "wf_result", problem, config['company'],
                                      config['forecast_frequency'], model_name,
                                      f"research_task_{research_task_uuid}")
    os.makedirs(path_folder_result, exist_ok=True)

    for day in tqdm(range(count_days)):

        train_start = config.get('train_start', None)
        if train_start is None:
            if config.get('duration_training_history', None) is None:
                train_start = X_full_set.index[0]
                config['train_start'] = datetime(train_start.year, train_start.month, train_start.day)
            else:
                # Was `days=i - ...` with `i` undefined (NameError); the loop
                # variable is `day`.
                # NOTE(review): train_end itself does not move with `day`, so
                # this window shrinks as `day` grows - confirm that is intended.
                train_start = config['train_end'] + timedelta(days=day - config['duration_training_history'])

        try:
            X_train, X_test, y_train, y_test = data(day, X_full_set, y_full_set, train_start, config, forecast_steps)
            X_test = standardize_mean_values(day, X_test.copy(), X_train, config)

            y_pred = model.fit(X_train, y_train).predict(X_test)

            y_pred_df = pd.DataFrame(y_pred, index=y_test.index)
            df_preds = add_predictions(day, model_name, df_preds, y_pred_df, y_test, config)

            df_stats = estimations(day, df_stats, y_pred_df, y_test, config)

        except (KeyError, ValueError) as e:
            # Best effort: report the failing step and move on to the next day.
            print(e)
            continue

    if 'd-0_is_true' in df_stats.columns:
        same_day_hits = df_stats['d-0_is_true'].dropna()
        print("\033[1m\033[34mAccuracy:", round(abs(same_day_hits.sum() / same_day_hits.shape[0]) * 100, 2))
    else:
        # Every step failed or produced no same-day prediction; the unguarded
        # column lookup used to raise KeyError here.
        print("Accuracy: n/a (no d-0 predictions were produced)")

    write_predictions(forecast_steps, df_preds, config, research_task_uuid, problem)

    if not df_stats.empty:
        # Attach run metadata to the last row only (index[-1] fails on an
        # empty frame, hence the guard).
        last_index = df_stats.index[-1]
        df_stats.loc[last_index, 'model_hyperparameters'] = str(config['model_hyperparameters'])
        df_stats.loc[last_index, 'train_features'] = str(config['train_features'])

    path_to_save_result_csv = os.path.join(path_folder_result, f'{model_name}_{unique_uuid}.csv')
    df_stats.round(2).to_csv(path_to_save_result_csv, date_format='%Y-%m-%d %H:%M:%S')

    # The estimator object is dropped before the config is dumped to YAML.
    config_to_save = config.copy()
    config_to_save.pop('model', None)
    with open(os.path.join(path_folder_result, f'{model_name}_{unique_uuid}.yaml'), 'w') as outfile:
        dump(config_to_save, outfile, default_flow_style=False)

In [28]:
companies = ["GOOGLE"]
time_period = ["daily"]

problem = 'classification'
models_list = ['KNear_Neighbors']
# Key: the capital letters of the model name, lower-cased; value: the full
# name, lower-cased.
models_dict = {"".join(re.findall('([A-Z])', name)).lower(): name.lower() for name in models_list}

forecast_steps = 4        # means that the forecast will be made for {n} future periods


for company, period in itertools.product(companies, time_period):

    full_set, train_end, test_start, test_end, train_features_set = init(company, period)
    print(f'company: {company}\t period: {period}\t train ends: {train_end}\t test starts: {test_start}\n')

    _research_task_uuid = str(uuid.uuid1())
    print(f'_research_task_uuid = {_research_task_uuid}\n')

    configs = define_parameters(company, period, train_end, test_start, test_end, train_features_set, forecast_steps, models_dict, problem)
    print(f'count_configs {len(configs)} \n')

    # A descriptive loop name instead of `_`, which by convention marks an
    # unused value and is IPython's last-output variable.
    for config in configs:
        print(config['model_name'], '==', config, '\n')

        run_wfv(full_set, config, _research_task_uuid, forecast_steps, company, models_dict, problem)

company: GOOGLE	 period: daily	 train ends: 2023-12-29 00:00:00	 test starts: 2024-01-02 00:00:00

knear_neighbors == {'unique_uuid': '0667d6a2-f5dc-11ee-bdc9-c0e434d84b22', 'train_end': datetime.datetime(2023, 12, 29, 0, 0), 'test_start': datetime.datetime(2024, 1, 2, 0, 0), 'test_end': datetime.datetime(2024, 2, 8, 0, 0), 'target_column': 'growth', 'train_features': ['close_lag_1', 'volume_lag_1'], 'path_to_result': '/diploma_info/datalake/', 'forecast_periods': 4, 'model_name': 'knear_neighbors', 'model': KNeighborsClassifier(n_jobs=-1), 'forecast_frequency': 'daily', 'company': 'google', 'model_hyperparameters': {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}, 'problem': 'classification'} 

(4902, 2) (4902, 1)
knear_neighbors


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:01<00:00, 31.57it/s]

[1m[34mAccuracy: 48.15



