In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import itertools

In [2]:
from bs4 import BeautifulSoup as BS
import requests

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
companies = ["AMAZON", "APPLE", "GOOGLE", "META", "NETFLIX"]
    
URLS = ['https://finance.yahoo.com/quote/AMZN', 'https://finance.yahoo.com/quote/AAPL',
        'https://finance.yahoo.com/quote/GOOG', 'https://finance.yahoo.com/quote/META', 
        'https://finance.yahoo.com/quote/NFLX']

time_period = ["daily"]

In [5]:
for n, (company, period) in enumerate(list(itertools.product(companies, time_period))):
    
    date_parse = lambda dates: pd.to_datetime(dates)
    path = f"/diploma_info/datalake/raw_data/{company}_{period}.csv"

    full_set = pd.read_csv(
        path,
        parse_dates=["Date"],
        date_parser=date_parse,
        index_col=["Date"],
    )
    
    date_1 = full_set.index[-1]
    if (date_1+timedelta(days=2)).weekday() == 5 or \
       (date_1+timedelta(days=2)).weekday() == 6:
        date_2 = date_1 + timedelta(days=4)
    else:
        date_2 = date_1 + timedelta(days=2)
        
    new_dates = pd.date_range(date_1, date_2, freq='B')[1:]
    full_set = pd.concat([full_set, pd.DataFrame(None, index=new_dates)])
    
    full_set.index.name = 'date'
    full_set.columns = ["_".join(c.lower().split()) for c in full_set.columns]

    req_1 = requests.get(URLS[n])
    soup = BS(req_1.text, "html.parser")
    full_set.loc[date_2, 'open'] = float(soup.find_all('fin-streamer', class_="svelte-tx3nkj")[1].text.strip())
    full_set.loc[new_dates[0], 'open'] = full_set.shift(1).loc[new_dates[0], 'close']

        
    full_set['diff_close_value'] = full_set['close'].diff()
    full_set['diff_open_value'] = full_set['open'].diff()
    full_set['open-prev_close'] = full_set['open'] - full_set['close'].shift(1)
    full_set['(close-open)_lag_1'] = (full_set['close'] - full_set['open']).shift(1)
    
    full_set.loc[full_set[full_set['diff_close_value'] > 0].index, 'growth'] = 1
    full_set.loc[full_set[full_set['diff_open_value'] > 0].index, 'growth_open'] = 1
    full_set['growth_lag_1'] = full_set['growth'].shift(1).bfill()
    
    full_set[f'diff_open_value_mean_3_days'] = abs(full_set['diff_open_value'].rolling(window=3, closed='left').agg('mean'))
    full_set.loc[full_set['growth_open'] == 0, f'diff_open_value_mean_3_days'] = \
    -(full_set.loc[full_set['growth_open'] == 0, f'diff_open_value_mean_3_days'].values)
    full_set['new_open'] = full_set['open'].shift(1) + full_set['diff_open_value_mean_3_days']
    

    full_set['year'] = full_set.index.year
    full_set['month'] = full_set.index.month
    full_set['day'] = full_set.index.day
    full_set['day_of_week'] = full_set.index.weekday
    full_set['week_of_year'] = (full_set.index.isocalendar()['week']).astype('int')
    for i in range(1, 8):
        full_set[f'close_lag_{i}'] = full_set['close'].shift(i).bfill()
        full_set[f'volume_lag_{i}'] = full_set['volume'].shift(i).bfill()
        full_set[f'diff_close_value_lag_{i}'] = full_set['diff_close_value'].shift(i).bfill()
    full_set['growth_lag_7'] = full_set['growth'].shift(7).bfill()
    
    
    for window in [3, 5, 7]:
        close_agg = pd.DataFrame(round(full_set['close'].rolling(window=window, closed='left').agg(
            ('max', 'min', 'mean')
        )))
        close_agg.columns = [f'close_max_{window}_days', f'close_min_{window}_days', f'close_mean_{window}_days']
        
        full_set = full_set = pd.concat([full_set, close_agg], axis=1)
#         full_set = full_set.loc[:test_end.strftime("%Y%m%d"),]
      
        
    path_to_curr = f"/diploma_info/datalake/currency_rates.csv"
    currencies = pd.read_csv(
        path_to_curr,
        parse_dates=["date"],
        date_parser=date_parse,
        index_col=["date"],
    )
    
    currencies = pd.concat([currencies.dropna(), pd.DataFrame(None, index=new_dates)])
    currencies = currencies[~currencies.index.duplicated(keep='first')]
    currencies.index.name = 'date'
    
    for col in currencies.columns:
        currencies[f'{col}_lag_1'] = currencies[col].shift(1)
        currencies = currencies.drop([col], axis=1)
        
    full_set = pd.merge(full_set, currencies, on='date')    
    
    full_set = full_set.fillna(0)

    
    path_to_inf = f"/diploma_info/datalake/usa_inflation_rate.csv"
    inflation = pd.read_csv(
        path_to_inf,
        parse_dates=["date"],
        date_parser=date_parse,
        index_col=["date"],
    )
    for idx in inflation.index:
        full_set.loc[idx, 'usa_inflation_%'] = inflation.loc[idx, 'value']
    full_set = full_set.sort_index()

    full_set['usa_inflation_%'] = full_set['usa_inflation_%'].ffill()
    full_set = full_set.dropna().sort_index()   
        
    full_set.to_csv(f"/diploma_info/datalake/processed_data/{company}_{period}.csv")

In [6]:
full_set

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,49.151428,50.331429,48.731430,49.848572,49.848572,13475000.0,1.047142,0.118572,0.349998,-0.231426,...,49.0,48.0,49.0,0.8265,1.2098,0.06322,15.819,19.138,0.05225,0.76
2015-01-05,49.258572,49.258572,47.147144,47.311428,47.311428,18165000.0,-2.537144,0.107143,-0.590000,0.697144,...,50.0,48.0,49.0,0.8370,1.1946,0.06265,15.962,18.900,0.05243,0.76
2015-01-06,47.347141,47.639999,45.661430,46.501431,46.501431,16037700.0,-0.809998,-1.911430,0.035713,-1.947144,...,50.0,47.0,49.0,0.8391,1.1918,0.06322,15.818,18.852,0.05305,0.76
2015-01-07,47.347141,47.421429,46.271427,46.742859,46.742859,9849700.0,0.241428,0.000000,0.845711,-0.845711,...,50.0,47.0,48.0,0.8377,1.1937,0.06317,15.830,18.896,0.05292,0.76
2015-01-08,47.119999,47.835712,46.478573,47.779999,47.779999,9601900.0,1.037140,-0.227142,0.377140,-0.604282,...,50.0,47.0,48.0,0.8461,1.1819,0.06317,15.830,18.709,0.05345,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-26,558.210022,562.919983,553.190002,561.229980,561.229980,4332600.0,-3.570007,8.750000,-6.589966,15.339966,...,614.0,555.0,576.0,0.9321,1.0729,0.02523,39.639,42.528,0.02351,3.48
2024-04-29,559.179993,559.640015,554.239990,559.489990,559.489990,2508900.0,-1.739990,0.969971,-2.049988,3.019958,...,611.0,555.0,568.0,0.9351,1.0695,0.02527,39.576,42.324,0.02363,3.48
2024-04-30,560.000000,560.000000,549.380005,550.640015,550.640015,3361500.0,-8.849976,0.820007,0.510010,0.309998,...,578.0,555.0,561.0,0.9329,1.0719,0.02521,39.660,42.512,0.02352,3.50
2024-05-01,550.640015,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-9.359985,0.000000,-9.359985,...,578.0,551.0,561.0,0.9375,1.0667,0.02530,39.527,42.163,0.02372,3.50


In [7]:
full_set.describe()

Unnamed: 0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
count,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,...,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0,2349.0
mean,308.544346,312.685397,303.248915,308.075653,308.075653,9311146.0,0.213639,0.241012,0.186032,0.027509,...,317.868455,296.963814,307.44487,0.893463,1.121745,0.036404,28.226877,31.559917,0.032439,2.863155
std,163.156056,165.195843,160.996513,163.06833,163.06833,8137159.0,9.136253,8.992334,5.756448,6.888812,...,167.405086,158.485114,162.754207,0.042336,0.05283,0.005949,4.80599,4.818879,0.005084,2.367568
min,46.021427,0.0,0.0,0.0,0.0,0.0,-122.419983,-117.320007,-107.820007,-42.47998,...,0.0,0.0,0.0,0.7993,0.9596,0.02513,15.735,17.627,0.02337,-0.2
25%,149.800003,150.75,147.300003,150.089996,150.089996,4644200.0,-3.130005,-3.090004,-1.040009,-2.830002,...,153.0,144.0,149.0,0.8614,1.0867,0.03519,25.779,28.689,0.02992,1.36
50%,318.0,323.350006,311.839996,317.5,317.5,6974900.0,0.070007,0.120003,0.160004,-0.07,...,330.0,304.0,316.0,0.8939,1.1188,0.03705,26.99,30.989,0.03227,2.18
75%,428.980011,435.029999,422.540009,428.149994,428.149994,11221100.0,3.630005,3.790009,1.620026,3.019989,...,441.0,413.0,427.0,0.9203,1.161,0.03879,28.418,33.426,0.03486,3.5
max,692.349976,700.98999,686.090027,691.690002,691.690002,133387500.0,84.570038,64.529985,63.649994,55.030029,...,692.0,663.0,680.0,1.0421,1.2511,0.06355,39.795,42.794,0.05659,9.06


In [8]:
full_set[full_set['open-prev_close'] == full_set['open-prev_close'].min()]

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,diff_close_value,diff_open_value,open-prev_close,(close-open)_lag_1,...,close_max_7_days,close_min_7_days,close_mean_7_days,1_USD_to_EUR_lag_1,1_EUR_to_USD_lag_1,1_UAH_to_USD_lag_1,1_USD_to_UAH_lag_1,1_EUR_to_UAH_lag_1,1_UAH_to_EUR_lag_1,usa_inflation_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-21,400.429993,409.149994,379.98999,397.5,397.5,58904300.0,-110.75,-117.320007,-107.820007,-9.5,...,541.0,508.0,523.0,0.8839,1.1313,0.0353,28.327,32.047,0.0312,7.04


In [9]:
full_set[~((full_set['diff_close_value'] < -15) | (full_set['diff_close_value'] > 15))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.158011
open-prev_close       0.198872
(close-open)_lag_1    0.070952
dtype: float64

In [10]:
full_set[~((full_set['open-prev_close'] < -10) | (full_set['open-prev_close'] > 10))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.220519
open-prev_close       0.214858
(close-open)_lag_1    0.067687
dtype: float64

In [11]:
full_set[~((full_set['(close-open)_lag_1'] < -15) | (full_set['(close-open)_lag_1'] > 15))] \
.mean()[['diff_close_value', 'open-prev_close', '(close-open)_lag_1']]

diff_close_value      0.307287
open-prev_close       0.231537
(close-open)_lag_1    0.026487
dtype: float64

In [12]:
full_set.new_open

date
2015-01-02     49.095712
2015-01-05     49.545237
2015-01-06     49.440000
2015-01-07     47.909046
2015-01-08     47.948570
                 ...    
2024-04-26    549.860006
2024-04-29    558.783366
2024-04-30    564.223328
2024-05-01    563.513326
2024-05-02    553.163350
Name: new_open, Length: 2349, dtype: float64