In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import itertools

In [2]:
import warnings
warnings.simplefilter(action="ignore")

In [3]:
# Company names and sampling periods to process; the loop below combines
# every company with every period to build the raw/processed CSV file names.
companies = [
    "AMAZON",
    "APPLE",
    "GOOGLE",
    "META",
    "NETFLIX",
]
time_period = [
    "daily",
]

In [4]:
# Root directory of the data lake: raw inputs are read from it and the
# processed feature sets are written back under it.
DATALAKE_DIR = "/diploma_info/datalake"
# Lags (in rows) generated for close / volume / diff_value.
LAG_STEPS = range(1, 8)
# Window lengths (in rows) for the rolling close statistics.
ROLLING_WINDOWS = (3, 5, 7)


def _load_price_history(path):
    """Read one raw price CSV.

    Parameters
    ----------
    path : str
        Location of a CSV file that has a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the parsed dates (index renamed to ``date``) with
        all column names lower-cased.
    """
    # `parse_dates` is enough to get a DatetimeIndex; the deprecated
    # `date_parser=` argument used previously is not required.
    prices = pd.read_csv(path, parse_dates=["Date"], index_col=["Date"])
    prices.index.name = "date"
    prices.columns = [c.lower() for c in prices.columns]
    return prices


def _add_price_features(prices):
    """Return a copy of ``prices`` with change, calendar and lag columns.

    Added columns: ``diff_value``, ``growth``, ``year``, ``month``, ``day``,
    ``day_of_week``, ``week_of_year`` and, for every step in ``LAG_STEPS``,
    ``close_lag_<i>``, ``volume_lag_<i>`` and ``diff_value_lag_<i>``.
    """
    frame = prices.copy()

    # Row-over-row change of the close. The first row has no predecessor,
    # so its change is defined as 0 (slice assignment is a no-op when empty).
    diff_value = frame["close"].diff()
    diff_value.iloc[:1] = 0
    frame["diff_value"] = diff_value
    # 1 when the close is strictly higher than in the previous row, else 0.
    frame["growth"] = (diff_value > 0).astype("int64")

    dates = frame.index
    frame["year"] = dates.year
    frame["month"] = dates.month
    frame["day"] = dates.day
    frame["day_of_week"] = dates.weekday
    frame["week_of_year"] = dates.isocalendar()["week"].astype("int")

    for step in LAG_STEPS:
        # NOTE(review): the leading gaps created by shift() are back-filled
        # with the first available value, so the earliest rows of each lag
        # column repeat row 0 instead of carrying true past values.
        frame[f"close_lag_{step}"] = frame["close"].shift(step).bfill()
        frame[f"volume_lag_{step}"] = frame["volume"].shift(step).bfill()
        frame[f"diff_value_lag_{step}"] = frame["diff_value"].shift(step).bfill()

    return frame


def _load_lagged_currency_rates(path):
    """Read the currency-rate CSV and lag every column by one row.

    Each original column ``<name>`` is replaced by ``<name>_lag_1`` holding
    the previous row's value; rows containing any missing value (at least
    the first one) are dropped.
    """
    rates = pd.read_csv(path, parse_dates=["date"], index_col=["date"])
    return rates.shift(1).add_suffix("_lag_1").dropna()


def _add_rolling_close_stats(frame, windows):
    """Return a copy of ``frame`` with rolling max/min/mean of ``close``.

    For every ``window`` in ``windows`` three columns are appended, in this
    order: ``close_max_<window>_days``, ``close_min_<window>_days`` and
    ``close_mean_<window>_days``. Windows are ``closed='left'``, i.e. they
    cover the preceding rows only and exclude the current row, and the
    statistics are rounded to zero decimals.
    """
    result = frame.copy()
    for window in windows:
        stats = (
            result["close"]
            .rolling(window=window, closed="left")
            .agg(["max", "min", "mean"])
            .round()
        )
        # The statistics share the frame's row order, so assign them
        # positionally instead of re-merging on the date column (which would
        # multiply rows if a date ever appeared twice).
        for stat in ("max", "min", "mean"):
            result[f"close_{stat}_{window}_days"] = stats[stat].to_numpy()
    return result


# The currency table is the same for every company, so load and lag it once.
currencies = _load_lagged_currency_rates(f"{DATALAKE_DIR}/currency_rates.csv")

for company, period in itertools.product(companies, time_period):
    path = f"{DATALAKE_DIR}/raw_data/{company}_{period}.csv"

    full_set = _add_price_features(_load_price_history(path))

    # Inner join on the shared `date` index: only dates present in both the
    # price history and the lagged currency table are kept.
    full_set = pd.merge(full_set, currencies, on="date")

    full_set = _add_rolling_close_stats(full_set, ROLLING_WINDOWS)

    # Remaining gaps (e.g. rolling statistics on the first rows, where the
    # window is not yet full) are written out as 0.
    full_set = full_set.fillna(0)

    full_set.to_csv(f"{DATALAKE_DIR}/processed_data/{company}_{period}.csv")

In [5]:
# Display the frame left in `full_set` by the final loop iteration, i.e. the
# processed feature set of the last (company, period) pair.
full_set

Unnamed: 0_level_0,open,high,low,close,adj close,volume,diff_value,growth,year,month,...,1_UAH_to_EUR_lag_1,close_max_3_days,close_min_3_days,close_mean_3_days,close_max_5_days,close_min_5_days,close_mean_5_days,close_max_7_days,close_min_7_days,close_mean_7_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,49.151428,50.331429,48.731430,49.848572,49.848572,13475000,1.047142,1,2015,1,...,0.05225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-05,49.258572,49.258572,47.147144,47.311428,47.311428,18165000,-2.537144,0,2015,1,...,0.05243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-06,47.347141,47.639999,45.661430,46.501431,46.501431,16037700,-0.809998,0,2015,1,...,0.05305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-07,47.347141,47.421429,46.271427,46.742859,46.742859,9849700,0.241428,1,2015,1,...,0.05292,50.0,47.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-08,47.119999,47.835712,46.478573,47.779999,47.779999,9601900,1.037140,1,2015,1,...,0.05345,47.0,47.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-26,583.559998,590.750000,583.270020,587.650024,587.650024,2393700,4.090027,1,2024,2,...,0.02411,588.0,573.0,582.0,588.0,573.0,581.0,593.0,573.0,582.0
2024-02-27,595.000000,605.359985,592.330017,601.669983,601.669983,4489400,14.019958,1,2024,2,...,0.02401,588.0,584.0,587.0,588.0,573.0,582.0,593.0,573.0,584.0
2024-02-28,595.789978,598.169983,590.010010,596.479980,596.479980,2605200,-5.190002,0,2024,2,...,0.02410,602.0,584.0,591.0,602.0,573.0,587.0,602.0,573.0,585.0
2024-02-29,604.250000,604.520020,595.159973,602.919983,602.919983,3572100,6.440002,1,2024,2,...,0.02423,602.0,588.0,595.0,602.0,584.0,592.0,602.0,573.0,587.0
