In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import itertools

In [2]:
from bs4 import BeautifulSoup as BS
import requests

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
companies = ["AMAZON", "APPLE", "GOOGLE", "META", "NETFLIX"]
    
URLS = ['https://finance.yahoo.com/quote/AMZN', 'https://finance.yahoo.com/quote/AAPL',
        'https://finance.yahoo.com/quote/GOOG', 'https://finance.yahoo.com/quote/META', 
        'https://finance.yahoo.com/quote/NFLX']

time_period = ["daily"]

In [5]:
for n, (company, period) in enumerate(list(itertools.product(companies, time_period))):
    
    date_parse = lambda dates: pd.to_datetime(dates)
    path = f"/diploma_info/datalake/raw_data/{company}_{period}.csv"

    full_set = pd.read_csv(
        path,
        parse_dates=["Date"],
        date_parser=date_parse,
        index_col=["Date"],
    )
#     .loc[:'2024-05-15']
    
    date_1 = full_set.index[-1]
    if (date_1+timedelta(days=2)).weekday() == 5 or \
       (date_1+timedelta(days=2)).weekday() == 6:
        date_2 = date_1 + timedelta(days=4)
    else:
        date_2 = date_1 + timedelta(days=2)
        
    new_dates = pd.date_range(date_1, date_2, freq='B')[1:]
    full_set = pd.concat([full_set, pd.DataFrame(None, index=new_dates)])
    
    full_set.index.name = 'date'
    full_set.columns = ["_".join(c.lower().split()) for c in full_set.columns]

    req_1 = requests.get(URLS[n])
    soup = BS(req_1.text, "html.parser")
    full_set.loc[date_2, 'open'] = float(soup.find_all('fin-streamer', class_="svelte-tx3nkj")[1].text.strip())
    full_set.loc[new_dates[0], 'open'] = full_set.shift(1).loc[new_dates[0], 'close']

        
    full_set['diff_close_value'] = full_set['close'].diff()
    full_set['diff_open_value'] = full_set['open'].diff()
    full_set['open-prev_close'] = full_set['open'] - full_set['close'].shift(1)
    full_set['(close-open)_lag_1'] = (full_set['close'] - full_set['open']).shift(1)
    
    full_set.loc[full_set[full_set['diff_close_value'] > 0].index, 'growth'] = 1
    full_set.loc[full_set[full_set['diff_open_value'] > 0].index, 'growth_open'] = 1
    full_set['growth_lag_1'] = full_set['growth'].shift(1).bfill()
    
    full_set[f'diff_open_value_mean_3_days'] = abs(full_set['diff_open_value'].rolling(window=3, closed='left').agg('mean'))
    full_set.loc[full_set['growth_open'] == 0, f'diff_open_value_mean_3_days'] = \
    -(full_set.loc[full_set['growth_open'] == 0, f'diff_open_value_mean_3_days'].values)
    full_set['new_open'] = full_set['open'].shift(1) + full_set['diff_open_value_mean_3_days']
    

    full_set['year'] = full_set.index.year
    full_set['month'] = full_set.index.month
    full_set['day'] = full_set.index.day
    full_set['day_of_week'] = full_set.index.weekday
    full_set['week_of_year'] = (full_set.index.isocalendar()['week']).astype('int')
    for i in range(1, 8):
        full_set[f'close_lag_{i}'] = full_set['close'].shift(i).bfill()
        full_set[f'volume_lag_{i}'] = full_set['volume'].shift(i).bfill()
        full_set[f'diff_close_value_lag_{i}'] = full_set['diff_close_value'].shift(i).bfill()
    full_set['growth_lag_7'] = full_set['growth'].shift(7).bfill()
    
    
    for window in [3, 5, 7]:
        close_agg = pd.DataFrame(round(full_set['close'].rolling(window=window, closed='left').agg(
            ('max', 'min', 'mean')
        )))
        close_agg.columns = [f'close_max_{window}_days', f'close_min_{window}_days', f'close_mean_{window}_days']
        
        full_set = full_set = pd.concat([full_set, close_agg], axis=1)
#         full_set = full_set.loc[:test_end.strftime("%Y%m%d"),]
      
        
    path_to_curr = f"/diploma_info/datalake/currency_rates.csv"
    currencies = pd.read_csv(
        path_to_curr,
        parse_dates=["date"],
        date_parser=date_parse,
        index_col=["date"],
    )
    
    currencies = pd.concat([currencies.dropna(), pd.DataFrame(None, index=new_dates)])
    currencies = currencies[~currencies.index.duplicated(keep='first')]
    currencies.index.name = 'date'
    
    for col in currencies.columns:
        currencies[f'{col}_lag_1'] = currencies[col].shift(1)
        currencies = currencies.drop([col], axis=1)
        
    full_set = pd.merge(full_set, currencies, on='date')    
    
    full_set = full_set.fillna(0)

    
    path_to_inf = f"/diploma_info/datalake/usa_inflation_rate.csv"
    inflation = pd.read_csv(
        path_to_inf,
        parse_dates=["date"],
        date_parser=date_parse,
        index_col=["date"],
    )
    for idx in inflation.index:
        full_set.loc[idx, 'usa_inflation_%'] = inflation.loc[idx, 'value']
    full_set = full_set.sort_index()

    full_set['usa_inflation_%'] = full_set['usa_inflation_%'].ffill()
    full_set = full_set.dropna().sort_index()   
        
    full_set.to_csv(f"/diploma_info/datalake/processed_data/{company}_{period}.csv")

In [6]:
full_set

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,49.151428,50.331429,48.731430,49.848572,49.848572,13475000.0,1.047142,0.118572,0.349998,-0.231426,...,49.0,48.0,49.0,0.8265,1.2098,0.06322,15.819,19.138,0.05225,0.76
2015-01-05,49.258572,49.258572,47.147144,47.311428,47.311428,18165000.0,-2.537144,0.107143,-0.590000,0.697144,...,50.0,48.0,49.0,0.8370,1.1946,0.06265,15.962,18.900,0.05243,0.76
2015-01-06,47.347141,47.639999,45.661430,46.501431,46.501431,16037700.0,-0.809998,-1.911430,0.035713,-1.947144,...,50.0,47.0,49.0,0.8391,1.1918,0.06322,15.818,18.852,0.05305,0.76
2015-01-07,47.347141,47.421429,46.271427,46.742859,46.742859,9849700.0,0.241428,0.000000,0.845711,-0.845711,...,50.0,47.0,48.0,0.8377,1.1937,0.06317,15.830,18.896,0.05292,0.76
2015-01-08,47.119999,47.835712,46.478573,47.779999,47.779999,9601900.0,1.037140,-0.227142,0.377140,-0.604282,...,50.0,47.0,48.0,0.8461,1.1819,0.06317,15.830,18.709,0.05345,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-03,645.219971,646.780029,627.270020,633.789978,633.789978,2323500.0,-7.830017,1.209961,3.599976,-2.390015,...,655.0,636.0,645.0,0.9210,1.0858,0.02467,40.539,44.015,0.02272,3.40
2024-06-04,633.780029,635.229980,626.440002,631.619995,631.619995,3008200.0,-2.169983,-11.439941,-0.009949,-11.429993,...,655.0,634.0,644.0,0.9167,1.0908,0.02476,40.386,44.054,0.02270,3.40
2024-06-05,634.020020,651.250000,628.250000,650.270020,650.270020,2554700.0,18.650024,0.239990,2.400024,-2.160034,...,655.0,632.0,644.0,0.9191,1.0880,0.02481,40.302,43.848,0.02281,3.40
2024-06-06,650.270020,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,16.250000,0.000000,16.250000,...,655.0,632.0,644.0,0.9199,1.0871,0.02491,40.139,43.634,0.02292,3.40


In [7]:
full_set.describe()

Unnamed: 0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
count,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,...,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0,2374.0
mean,311.858012,315.991949,306.499667,311.373041,311.373041,9245480.0,0.253357,0.250719,0.193041,0.060218,...,321.122999,300.115838,310.643218,0.893393,1.120875,0.036274,28.332468,31.663651,0.032332,2.868311
std,165.46535,167.444772,163.241165,165.350052,165.350052,8119584.0,9.138443,8.894363,5.737297,6.916962,...,169.507596,160.617418,164.865828,0.046036,0.057506,0.006071,4.954438,4.974405,0.005184,2.35559
min,46.021427,0.0,0.0,0.0,0.0,0.0,-122.419983,-117.320007,-107.820007,-42.47998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.2
25%,152.025002,153.497505,149.387501,152.077503,152.077503,4609425.0,-3.130005,-3.085005,-1.040031,-2.837521,...,155.0,146.0,151.25,0.861925,1.086325,0.03504,25.81075,28.706,0.029822,1.37
50%,320.195007,325.36499,314.264999,320.050003,320.050003,6917250.0,0.079979,0.139999,0.160004,-0.064999,...,331.0,306.0,320.0,0.89415,1.1183,0.03702,27.011,31.0575,0.03219,2.18
75%,435.12751,439.550003,428.160004,434.425011,434.425011,11193150.0,3.672518,3.820007,1.667511,3.092484,...,445.0,416.0,431.0,0.9205,1.160175,0.03874,28.53,33.52475,0.034838,3.48
max,692.349976,700.98999,686.090027,691.690002,691.690002,133387500.0,84.570038,64.419983,63.649994,55.030029,...,692.0,663.0,680.0,1.0421,1.2511,0.06355,40.539,44.054,0.05659,9.06


In [8]:
full_set[full_set['open-prev_close'] == full_set['open-prev_close'].min()]

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-21,400.429993,409.149994,379.98999,397.5,397.5,58904300.0,-110.75,-117.320007,-107.820007,-9.5,...,541.0,508.0,523.0,0.8839,1.1313,0.0353,28.327,32.047,0.0312,7.04


In [9]:
full_set[~((full_set['diff_close_value'] < -15) | (full_set['diff_close_value'] > 15))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.176222
open-prev_close       0.204648
(close-open)_lag_1    0.098843
dtype: float64

In [10]:
full_set[~((full_set['open-prev_close'] < -10) | (full_set['open-prev_close'] > 10))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.261529
open-prev_close       0.221795
(close-open)_lag_1    0.101084
dtype: float64

In [11]:
full_set[~((full_set['(close-open)_lag_1'] < -15) | (full_set['(close-open)_lag_1'] > 15))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.340053
open-prev_close       0.240805
(close-open)_lag_1    0.037964
dtype: float64

In [12]:
full_set.new_open

date
2015-01-02     49.095712
2015-01-05     49.545237
2015-01-06     49.440000
2015-01-07     47.909046
2015-01-08     47.948570
                 ...    
2024-06-03    645.023336
2024-06-04    645.583293
2024-06-05    640.360026
2024-06-06    637.350016
2024-06-07    651.953369
Name: new_open, Length: 2374, dtype: float64