In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import itertools

In [2]:
import warnings
warnings.simplefilter(action="ignore")

In [3]:
# Tickers whose raw CSV files are processed by the feature-building loop.
companies = [
    "AMAZON",
    "APPLE",
    "GOOGLE",
    "META",
    "NETFLIX",
]
# Sampling periods; combined with each company to form the file name.
time_period = [
    "daily",
]

In [4]:
# Build one feature table per (company, period) pair and save it as CSV.
#
# Each raw price file is enriched with price differences, growth flags,
# calendar fields, lagged values, rolling close statistics, lagged currency
# rates and the US inflation rate, then written to the processed-data folder.
datalake_dir = "/diploma_info/datalake"

# The currency and inflation tables do not depend on the company, so they are
# loaded and prepared once instead of on every loop iteration.
path_to_curr = f"{datalake_dir}/currency_rates.csv"
currencies = pd.read_csv(path_to_curr, parse_dates=["date"], index_col=["date"])
# Replace every rate by the value of the previous row and drop the rows that
# are left with missing values (at least the first one).
currencies = currencies.shift(1).add_suffix("_lag_1").dropna()

path_to_inf = f"{datalake_dir}/usa_inflation_rate.csv"
inflation = pd.read_csv(path_to_inf, parse_dates=["date"], index_col=["date"])
# Sorted, gap-free series so that it can be forward-filled onto trading dates.
inflation_rate = inflation["value"].dropna().sort_index()

for company, period in itertools.product(companies, time_period):
    path = f"{datalake_dir}/raw_data/{company}_{period}.csv"

    # `parse_dates` alone is enough: the previous `date_parser` hook only
    # called `pd.to_datetime` and the argument is deprecated since pandas 2.0.
    full_set = pd.read_csv(path, parse_dates=["Date"], index_col=["Date"])

    full_set.index.name = 'date'
    # Normalise headers: lower-case, whitespace replaced by underscores.
    full_set.columns = ["_".join(c.lower().split()) for c in full_set.columns]

    # Row-over-row changes and the gaps between consecutive open/close prices.
    full_set['diff_close_value'] = full_set['close'].diff()
    full_set['diff_open_value'] = full_set['open'].diff()
    full_set['open-prev_close'] = full_set['open'] - full_set['close'].shift(1)
    full_set['close-open_lag_1'] = (full_set['close'] - full_set['open']).shift(1)

    # Growth flags are explicit 0/1 values. Previously only the 1s were set and
    # the remaining rows stayed NaN until the later `fillna(0)`, so the
    # back-fill below turned almost every lagged flag into 1 and the
    # `growth_open == 0` sign flip never matched any row.
    full_set['growth'] = (full_set['diff_close_value'] > 0).astype(int)
    full_set['growth_open'] = (full_set['diff_open_value'] > 0).astype(int)
    full_set['growth_lag_1'] = full_set['growth'].shift(1).bfill()

    # Magnitude of the mean open change over the three preceding rows, signed
    # by the direction of the current open change.
    # NOTE(review): `growth_open` is derived from the same row's open price, so
    # `new_open` depends on the value it approximates - confirm this is intended.
    open_move = full_set['diff_open_value'].rolling(window=3, closed='left').mean().abs()
    full_set['diff_open_value_mean_3_days'] = open_move.mask(full_set['growth_open'] == 0, -open_move)
    full_set['new_open'] = full_set['open'].shift(1) + full_set['diff_open_value_mean_3_days']

    # Calendar features taken from the date index.
    full_set['year'] = full_set.index.year
    full_set['month'] = full_set.index.month
    full_set['day'] = full_set.index.day
    full_set['day_of_week'] = full_set.index.weekday
    full_set['week_of_year'] = (full_set.index.isocalendar()['week']).astype('int')

    # Lags of 1..7 rows; the gaps opened by the shift are back-filled with the
    # first available value.
    for i in range(1, 8):
        full_set[f'close_lag_{i}'] = full_set['close'].shift(i).bfill()
        full_set[f'volume_lag_{i}'] = full_set['volume'].shift(i).bfill()
        full_set[f'diff_close_value_lag_{i}'] = full_set['diff_close_value'].shift(i).bfill()

    # Rolling max/min/mean of the close over the preceding rows (current row
    # excluded by closed='left'), rounded to whole numbers. The statistics share
    # the frame's index, so they are assigned directly instead of being merged
    # back on the date column.
    for window in [3, 5, 7]:
        close_stats = (
            full_set['close']
            .rolling(window=window, closed='left')
            .agg(['max', 'min', 'mean'])
            .round()
        )
        for stat in ('max', 'min', 'mean'):
            full_set[f'close_{stat}_{window}_days'] = close_stats[stat]

    full_set = full_set.fillna(0)

    # Keep only the dates that also have lagged currency rates.
    full_set = pd.merge(full_set, currencies, on='date')

    # Attach the most recent inflation value published on or before each date.
    # Dates that precede the first inflation record stay NaN and are dropped.
    # Unlike the former row-by-row `.loc` assignment, this does not create
    # temporary rows, so integer columns keep their integer dtype.
    full_set = full_set.sort_index()
    full_set['usa_inflation_%'] = inflation_rate.reindex(full_set.index, method='ffill')
    full_set = full_set.dropna()

    full_set.to_csv(f"{datalake_dir}/processed_data/{company}_{period}.csv")

In [5]:
# When the cells are run in order, `full_set` is whatever the last iteration of
# the loop above left behind, i.e. the table of the last (company, period) pair.
full_set

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,close-open_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,49.151428,50.331429,48.731430,49.848572,49.848572,13475000.0,1.047142,0.118572,0.349998,-0.231426,...,49.0,48.0,49.0,0.8265,1.2098,0.06322,15.819,19.138,0.05225,0.76
2015-01-05,49.258572,49.258572,47.147144,47.311428,47.311428,18165000.0,-2.537144,0.107143,-0.590000,0.697144,...,50.0,48.0,49.0,0.8370,1.1946,0.06265,15.962,18.900,0.05243,0.76
2015-01-06,47.347141,47.639999,45.661430,46.501431,46.501431,16037700.0,-0.809998,-1.911430,0.035713,-1.947144,...,50.0,47.0,49.0,0.8391,1.1918,0.06322,15.818,18.852,0.05305,0.76
2015-01-07,47.347141,47.421429,46.271427,46.742859,46.742859,9849700.0,0.241428,0.000000,0.845711,-0.845711,...,50.0,47.0,48.0,0.8377,1.1937,0.06317,15.830,18.896,0.05292,0.76
2015-01-08,47.119999,47.835712,46.478573,47.779999,47.779999,9601900.0,1.037140,-0.227142,0.377140,-0.604282,...,50.0,47.0,48.0,0.8461,1.1819,0.06317,15.830,18.709,0.05345,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-02,556.020020,565.460022,555.590027,565.150024,565.150024,3068200.0,13.440002,8.179993,4.309998,3.869995,...,578.0,551.0,560.0,0.9330,1.0718,0.02526,39.594,42.436,0.02356,3.50
2024-05-03,566.000000,580.260010,565.159973,579.340027,579.340027,3307500.0,14.190002,9.979980,0.849976,9.130005,...,565.0,551.0,558.0,0.9320,1.0729,0.02529,39.539,42.422,0.02357,3.50
2024-05-06,581.820007,597.340027,580.250000,596.969971,596.969971,3686300.0,17.629944,15.820007,2.479980,13.340027,...,579.0,551.0,562.0,0.9287,1.0768,0.02536,39.428,42.458,0.02355,3.50
2024-05-07,596.280029,606.049988,591.320007,606.000000,606.000000,3614100.0,9.030029,14.460022,-0.689941,15.149963,...,597.0,551.0,566.0,0.9285,1.0771,0.02545,39.293,42.320,0.02363,3.50


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
full_set.describe()

Unnamed: 0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,close-open_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
count,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,...,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0,2353.0
mean,308.990416,313.653089,304.194603,309.043073,309.043073,9303919.0,0.238278,0.234848,0.185622,0.049226,...,318.571186,297.629834,308.120272,0.893525,1.121666,0.036385,28.245858,31.57829,0.032424,2.864237
std,163.375565,165.389255,161.193661,163.281596,163.281596,8131240.0,9.14619,8.902855,5.753512,6.901347,...,167.587299,158.663375,162.92024,0.042326,0.052819,0.005961,4.823893,4.83533,0.005092,2.3657
min,46.021427,46.785713,45.264286,45.547142,45.547142,1144000.0,-122.419983,-117.320007,-107.820007,-42.47998,...,48.0,46.0,47.0,0.7993,0.9596,0.02513,15.735,17.627,0.02337,-0.2
25%,150.339996,152.820007,148.309998,151.029999,151.029999,4634100.0,-3.130005,-3.090004,-1.040009,-2.829987,...,153.0,146.0,149.0,0.8614,1.0866,0.03517,25.788,28.694,0.02989,1.37
50%,318.160004,323.880005,312.799988,317.940002,317.940002,6960200.0,0.090012,0.130005,0.160004,-0.059998,...,330.0,306.0,318.0,0.8939,1.1187,0.03705,26.992,31.007,0.03225,2.18
75%,429.730011,437.269989,425.529999,429.480011,429.480011,11221000.0,3.639984,3.809998,1.62999,3.070007,...,441.0,414.0,428.0,0.9203,1.161,0.03878,28.43,33.458,0.03485,3.5
max,692.349976,700.98999,686.090027,691.690002,691.690002,133387500.0,84.570038,64.419983,63.649994,55.030029,...,692.0,663.0,680.0,1.0421,1.2511,0.06355,39.795,42.794,0.05659,9.06


In [7]:
# Row(s) where the open-minus-previous-close gap reaches its minimum.
full_set.loc[
    full_set['open-prev_close'].eq(full_set['open-prev_close'].min())
]

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,close-open_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-21,400.429993,409.149994,379.98999,397.5,397.5,58904300.0,-110.75,-117.320007,-107.820007,-9.5,...,541.0,508.0,523.0,0.8839,1.1313,0.0353,28.327,32.047,0.0312,7.04


In [8]:
# Column means of the three gap features, leaving out the rows whose
# `diff_close_value` exceeds 15 in absolute value.
full_set.loc[
    ~(full_set['diff_close_value'].abs() > 15),
    ['diff_close_value', 'open-prev_close', 'close-open_lag_1'],
].mean()

diff_close_value    0.176642
open-prev_close     0.197364
close-open_lag_1    0.088179
dtype: float64

In [9]:
# Column means of the three gap features, leaving out the rows whose
# `open-prev_close` exceeds 10 in absolute value.
full_set.loc[
    ~(full_set['open-prev_close'].abs() > 10),
    ['diff_close_value', 'open-prev_close', 'close-open_lag_1'],
].mean()

diff_close_value    0.246002
open-prev_close     0.214384
close-open_lag_1    0.090087
dtype: float64

In [10]:
# Column means of the three gap features, leaving out the rows whose
# `close-open_lag_1` exceeds 15 in absolute value.
full_set.loc[
    ~(full_set['close-open_lag_1'].abs() > 15),
    ['diff_close_value', 'open-prev_close', 'close-open_lag_1'],
].mean()

diff_close_value    0.329098
open-prev_close     0.231436
close-open_lag_1    0.042543
dtype: float64