# This script studies correlations between stocks based on their trade price movements.
# Finding: around 20 stocks have a high positive correlation.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os

# Root directory of the competition's input data.
data_folder = '/kaggle/input/optiver-realized-volatility-prediction'

# Show which files/directories are available under the input root.
os.listdir(data_folder)

# Load train.csv; its stock_id column is used later to enumerate the stocks.
train = pd.read_csv(f'{data_folder}/train.csv')
train.head()

In [None]:
# Create the output folder (and its parent) for the per-stock price series.
# exist_ok=True makes re-running the cell harmless, while real failures
# (e.g. permission problems) are no longer silently swallowed.
os.makedirs('/kaggle/working/processed_data/trade_price', exist_ok=True)

In [None]:
# List the working directory to confirm the output folder was created
os.listdir('/kaggle/working')

In [None]:
def avg_price(df):
    """Return the size-weighted average traded price of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'price'`` and ``'size'``.

    Returns
    -------
    float
        ``sum(price * size) / sum(size)``. The result is NaN when the
        total size is zero (for example, for an empty frame).

    Notes
    -----
    ``df`` is not modified; no helper column is added to the caller's frame.
    """
    sizes = df['size']
    return (df['price'] * sizes).sum() / sizes.sum()

In [None]:

def get_trade(stock_id, time_id=''):
    """Load the trade data of one stock, optionally restricted to one time_id.

    Parameters
    ----------
    stock_id : int or str
        Identifier of the stock; selects the ``stock_id=<stock_id>``
        partition under ``trade_train.parquet`` in ``data_folder``.
    time_id : optional
        When given, only rows whose ``'time_id'`` column equals this value
        are returned. The default ``''`` (or ``None``) means "no filter".

    Returns
    -------
    pandas.DataFrame
        The trade rows read from the parquet partition.
    """
    trade_example = pd.read_parquet(data_folder+r'/trade_train.parquet/stock_id='+str(stock_id))
    # Compare against the "no filter" sentinels explicitly: a plain truthiness
    # test would skip the filter for a falsy but valid id such as 0.
    if time_id is not None and time_id != '':
        trade_example = trade_example[trade_example['time_id']==time_id]

    return trade_example

In [None]:
# This cell processes every stock and can take a while to run.
# The aggregation uses vectorised group-wise sums rather than calling a
# Python function once per (time_id, bin) group, which is far slower.

from tqdm import tqdm

# Read data for each stock and write the avg traded price for each 100 second bin
bins=np.arange(0,600,100)
for sid in tqdm(train.stock_id.unique()):
    test_df = get_trade(stock_id=sid)
    # Bin number (1..6) of every trade according to its seconds_in_bucket
    idx = np.digitize(test_df['seconds_in_bucket'], bins)
    # Size-weighted average price = sum(price*size) / sum(size) per group
    weighted = test_df.assign(pricexsize=test_df['price']*test_df['size'])
    grouped = weighted.groupby(['time_id',idx])
    # The quotient is an unnamed Series, so reset_index() yields the columns
    # time_id, level_1 (the bin) and 0 (the average price).
    full_timeseries = (grouped['pricexsize'].sum()/grouped['size'].sum()).reset_index()
    full_timeseries.to_csv('/kaggle/working/processed_data/trade_price'+'/'+str(sid)+'.csv')

In [None]:
# List the per-stock CSV files written to the trade_price folder
os.listdir('/kaggle/working/processed_data/trade_price')

In [None]:
# Create final dataframe with all timeseries in columns

tfolder='/kaggle/working/processed_data/trade_price'

# Each CSV holds one stock's series keyed by (time_id, bin). The series are
# aligned on that key rather than on row position: a stock with no trades in
# some bin has fewer rows, and positional alignment would then pair up prices
# from different time windows.
series_by_stock = {}

# Sorted so the column order is reproducible across runs
for filen in sorted(os.listdir(tfolder)):
    print(tfolder+'/'+filen)

    tmpdf = pd.read_csv(tfolder+'/'+filen)
    tcol = os.path.splitext(filen)[0]
    # 'level_1' is the bin number and '0' the average price, as named by
    # reset_index() when the files were written.
    series_by_stock[tcol] = tmpdf.set_index(['time_id', 'level_1'])['0']

# One outer-join concat: rows missing for a stock become NaN, which
# DataFrame.corr() later excludes pairwise.
if series_by_stock:
    finaldf = pd.concat(series_by_stock, axis=1).sort_index()
else:
    finaldf = pd.DataFrame()


data = finaldf
data

In [None]:
# Pairwise correlation between the columns (one column per stock file);
# DataFrame.corr() defaults to Pearson and ignores NaNs pairwise.
correlations = finaldf.corr()
correlations

In [None]:
# Minimum correlation over the whole matrix
# (column-wise minimum first, then the minimum of those values)
correlations.min().min()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Heatmap of the correlation matrix (rounded to 2 decimals) on a fixed
# [-1, 1] colour scale.
sns.heatmap(correlations.round(2), cmap='RdBu', vmin=-1, vmax=1)

In [None]:
# Hierarchical clustering on correlations data
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

plt.figure(figsize=(12,5))

# Strongly correlated columns (positively or negatively) get a distance near 0
dissimilarity = 1 - abs(correlations)

# squareform() validates its input and rejects a diagonal that is not exactly
# zero. Floating-point rounding in the correlations can leave the diagonal a
# hair away from 0, so zero it on a copy and skip the validation.
square_dist = dissimilarity.to_numpy(copy=True)
np.fill_diagonal(square_dist, 0.0)
Z = linkage(squareform(square_dist, checks=False), 'complete')

dendrogram(Z, labels=finaldf.columns, orientation='top',
           leaf_rotation=90);



In [None]:
# Cut the dendrogram into flat clusters: with criterion='distance',
# observations in one cluster have a cophenetic distance of at most `threshold`.
threshold = 0.8
labels = fcluster(Z, t=threshold, criterion='distance')

# Display the cluster id assigned to each clustered column
labels

In [None]:
import numpy as np

# Keep the indices to sort labels
labels_order = np.argsort(labels)

# Build a new dataframe with the columns reordered so that members of the same
# cluster sit next to each other. A single positional selection replaces
# concatenating one column at a time, and also works when there are no columns.
clustered = finaldf.iloc[:, labels_order].copy()

In [None]:
# Recompute the correlation matrix with the columns in cluster order and plot
# it; note that this overwrites the earlier `correlations` variable.
correlations = clustered.corr()
plt.figure(figsize=(15,10))
sns.heatmap(correlations.round(2), cmap='RdBu', vmin=-0.15, vmax=1);