In [None]:
# Download the TA-Lib 0.4.0 source tarball, build it, install it under /usr,
# then pip-install the `Ta-Lib` package.
# NOTE(review): the tarball is fetched over plain HTTP with no checksum verification.
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
# Enter the extracted source tree; configure/make below run relative to it.
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib

In [None]:
cd ..

In [None]:
# Install the additional Python packages (yfinance, phik) into the runtime.
# NOTE(review): versions are unpinned, so results may differ between runs.
!pip install yfinance
!pip install phik

In [None]:
import talib as ta
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt

In [None]:
def load_data(filepath = 'fundamentals.csv', n_features = 23, placeholder_date = "01-01-2010"):
    """
    Load a wide fundamentals CSV into a frame with (index, feature) columns.

    Layout the parser relies on:
      * the CSV header row carries an index name on the first column of each
        block of `n_features` columns (column 0 is the date column);
      * the first data row carries the feature names, with "Dates" in column 0.

    # Arguments
        filepath: path or file-like object accepted by `pd.read_csv`.
        n_features: number of consecutive columns belonging to each index.
        placeholder_date: row label dropped after "Dates" becomes the index;
            pass None to keep every row. A missing label raises KeyError.
    # Return
        DataFrame indexed by date whose columns are (index, feature) pairs.
        Only indices with no missing value in any of their `n_features`
        columns are kept. Values are left as read from the CSV.
    """
    #read data
    raw_data = pd.read_csv(filepath, low_memory = False)

    #header labels (index names + filler labels), captured before renaming
    header_labels = raw_data.columns.to_list()

    #promote the first data row to column names, drop it, set dates as index
    raw_data = (
        raw_data
        .rename(columns = raw_data.iloc[0])
        .drop([0])
        .set_index("Dates")
    )

    #skip the date column, then take the first label of every block
    index_names = header_labels[1::n_features]

    #feature names as they appear (repeated once per index)
    feature_names = raw_data.columns.to_list()

    #drop the placeholder row
    if placeholder_date is not None:
        raw_data = raw_data.drop(placeholder_date)

    #distinct features in order of first appearance
    unique_features = list(dict.fromkeys(feature_names))

    #make a 3d dataframe; columns are addressed by position because the
    #feature names repeat across indices
    final_dict = {}
    for position in range(len(feature_names)):
        key = (index_names[position // n_features], unique_features[position % n_features])
        final_dict[key] = raw_data.iloc[:, position]

    final_raw_data = pd.DataFrame(final_dict)

    #get all the indices where all data is present
    complete = {}
    for index_name in index_names:
        block = final_raw_data[index_name]
        if block.dropna(axis = 1).shape[1] == n_features:
            for feature in block.columns:
                complete[index_name, feature] = block[feature]

    return pd.DataFrame(complete)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read by path.
drive.mount('/content/drive')

In [None]:
# Parse the fundamentals CSV from the mounted Drive into a frame with
# (index, feature) columns, then display it.
data = load_data('/content/drive/MyDrive/IAQF/fundamentals.csv')
data

In [None]:
# Disabled experiment: restrict the data to a recent window.
# The code is wrapped in a string literal, so running this cell only echoes the
# text. Note that [-680:-1] would keep 679 rows, because the last row is excluded.
"""
data = data[-680:-1].copy()
data
"""

In [None]:
# Distinct index names, taken from level 0 of the column MultiIndex.
indices = data.columns.get_level_values(0).unique().tolist()
print(indices)

In [None]:
# Map the PX_* price/volume field names onto Open/High/Low/Close/Volume.
ohlcv_names = {
    'PX_OPEN': 'Open',
    'PX_HIGH': 'High',
    'PX_LOW': 'Low',
    'PX_LAST': 'Close',
    'PX_VOLUME': 'Volume',
}
data.rename(columns = ohlcv_names, inplace = True)
data

In [None]:
# Write the SPX close column back after casting it to float32, then display it.
# NOTE(review): 'SPX Index' is hard-coded; the lookup fails if that index is not
# among the columns returned by load_data — confirm it is always present.
data.loc[:,('SPX Index', 'Close')] = data.loc[:,('SPX Index', 'Close')].astype(np.float32)
data.loc[:,('SPX Index', 'Close')]

In [None]:
# Build one feature frame per index: raw fields cast to float32, TA-Lib
# indicators, 48 lagged log-returns, then per-column min-max scaling.
index_dict = {}
for index in indices :
  df = data[index].copy()
  df = df.astype(np.float32)
  df.dropna(inplace = True, axis = 0)
  df['RSI'] = ta.RSI(df['Close'])
  df['MFI'] = ta.MFI(df['High'],df['Low'],df['Close'], df['Volume'])
  # ADX takes high/low/close only; it was previously computed with ta.MFI,
  # which made this column an exact copy of 'MFI'.
  df['ADX'] = ta.ADX(df['High'], df['Low'], df['Close'])
  df['OBV'] = ta.OBV(df['Close'], df['Volume'])
  df['ATR'] = ta.ATR(df['High'], df['Low'], df['Close'])
  df['Boll_upper'], df['Boll_mid'], df['Boll_lower'] = ta.BBANDS(df['Close'], timeperiod = 20)
  df['EMA'] = ta.EMA(df['Close'], timeperiod = 14)
  df['MACD'],_,_ = ta.MACD(df['Close'],fastperiod = 14, slowperiod = 30)
  # log(Close) does not depend on the lag, so compute it once per index.
  log_close = np.log(df['Close'])
  for i in range(1,49) :
    df['LR_' + str(i)] = log_close - log_close.shift(i)
  # Min-max scale every column to [0, 1]; a constant column becomes NaN here
  # and its rows are removed by the dropna below.
  for col in df.columns:
    df[col] = (df[col] - df[col].min())/(df[col].max() - df[col].min())
  # Indicator warm-up periods and lags leave NaNs in the leading rows.
  df.dropna(inplace = True, axis = 0)
  index_dict[index] = df

In [None]:
# Spot-check one scaled column of the first index's feature frame.
index_dict[indices[0]]['PX_TO_BOOK_RATIO']

In [None]:
# Remove indices whose feature frame has zero rows. Iterate over a snapshot of
# the keys so entries can be removed while looping.
index_keys = list(index_dict)
for index in index_keys:
  if len(index_dict[index]) == 0:
    index_dict.pop(index)

In [None]:
# Print the (rows, columns) shape of every remaining feature frame.
for index, frame in index_dict.items():
  print(frame.shape)

In [None]:
# Refresh the index list from the keys currently held in index_dict.
indices = [*index_dict]
indices

In [None]:
# Flatten {index: frame} into {(index, column): series} so it can be turned
# back into a single frame with two-level columns.
reformed_dict = {
    (index_name, column_name): column_values
    for index_name, frame in index_dict.items()
    for column_name, column_values in frame.items()
}

In [None]:
# Reassemble the per-index feature frames into one frame keyed by
# (index, feature) columns; this rebinds `data`.
data = pd.DataFrame(reformed_dict)
data.shape

In [None]:
# Boolean mask of missing cells, displayed for inspection.
data.isna()

In [None]:
from keras.layers import Conv2D, Conv2DTranspose, Dense, Flatten, Reshape
from keras.models import Sequential, Model
from keras.utils.vis_utils import plot_model
import numpy as np
import pandas as pd

In [None]:
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

# Short aliases for the two scikit-learn clustering metrics imported above.
nmi = normalized_mutual_info_score
ari = adjusted_rand_score

def acc(y_true, y_pred):
    """
    Calculate clustering accuracy under the best one-to-one matching between
    predicted cluster ids and true labels. Requires SciPy.
    # Arguments
        y_true: true labels, array-like with shape `(n_samples,)`
        y_pred: predicted labels, array-like with shape `(n_samples,)`
    # Return
        accuracy, in [0,1]
    """
    # sklearn.utils.linear_assignment_ has been removed from scikit-learn;
    # SciPy's linear_sum_assignment solves the same assignment problem.
    from scipy.optimize import linear_sum_assignment

    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    assert y_pred.size == y_true.size
    # Contingency matrix: w[p, t] counts samples predicted p whose true label is t.
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # The solver minimises cost, so turn counts into costs via (max - count).
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() * 1.0 / y_pred.size

In [None]:
# Run configuration: cluster count, batch size, epoch budget and output directory.
args = dict(
    n_clusters = 10,
    batch_size = 32,
    epochs = 1000,
    save_dir = 'results/temp',
)
print(args)

In [None]:
import os
# Create the output directory (including parents). exist_ok=True replaces the
# separate existence check and avoids the check-then-create race.
os.makedirs(args['save_dir'], exist_ok=True)

In [None]:
# load dataset
#data = pd.read_csv('data_processed.csv', header=[0,1], index_col=0)
#data.dropna(inplace = True)

In [None]:
# Shape of the assembled frame: rows are timesteps, columns are (index, feature) pairs.
data.shape

In [None]:
# Report how many indices remain and list them.
print(f'No. of indices = {len(indices)}')
print(indices)

In [None]:
# Derive the tensor dimensions from the frame: rows are timesteps, and every
# index contributes the same number of feature columns as the first one.
n_timesteps = len(data)
n_features = data[indices[0]].shape[1]
n_indices = data.shape[1] // n_features
print(f'n_timesteps : {n_timesteps}, n_features : {n_features}, n_indices : {n_indices}')

In [None]:
# data.values is (n_timesteps, n_indices * n_features), with each index's
# features stored contiguously within a row. Reshaping that directly to
# (-1, n_timesteps, n_features, 1) mixes timesteps and indices inside each
# sample, so first split the column axis and then move the index axis to the
# front: the result is one (n_timesteps, n_features, 1) image per index.
x = data.values.reshape(n_timesteps, n_indices, n_features)
x = x.transpose(1, 0, 2)[..., np.newaxis].astype('float32')

In [None]:
# Shape of the 4-D input tensor fed to the autoencoder.
x.shape

In [None]:
# Display the raw tensor values.
x

In [None]:
# Feature names, taken from level 1 of the column MultiIndex.
print(data.columns.get_level_values(1).unique())

In [None]:
# Encoder half of the convolutional autoencoder: three conv layers, a flatten,
# the low-dimensional 'embedding' layer, and a dense layer that expands back to
# the flattened size.
input_shape = x.shape[1:]
filters = [32, 64, 128, 10]

# conv3 uses 'same' padding only when the first input dimension is a multiple of 8
pad3 = 'same' if input_shape[0] % 8 == 0 else 'valid'

model = Sequential()

#encoding layer
encoder_layers = [
    Conv2D(filters[0], 5, strides=1, padding='same', activation='relu', name='conv1', input_shape=input_shape),
    Conv2D(filters[1], 5, strides=1, padding='same', activation='relu', name='conv2'),
    Conv2D(filters[2], 3, strides=1, padding=pad3, activation='relu', name='conv3'),
    Flatten(name = 'flatten'),
    Dense(units=filters[3], name='embedding'),
]
for layer in encoder_layers:
    model.add(layer)

# Expand the embedding back to as many units as the flatten layer produced.
flat_units = model.get_layer(name='flatten').output_shape[-1]
model.add(Dense(units = flat_units, activation = 'relu'))
model.summary()

In [None]:
#decoding layers

# Undo the flatten by reshaping to conv3's actual output shape. Hard-coding
# (H-2, W-2, C) is only right when conv3 used 'valid' padding; with 'same'
# padding conv3 keeps H and W, and the element counts would not match.
conv3_shape = tuple(model.get_layer(name='conv3').output_shape[1:])
model.add(Reshape(conv3_shape))

model.add(Conv2DTranspose(filters[1], 3, strides=1, padding=pad3, activation='relu', name='deconv3'))

model.add(Conv2DTranspose(filters[0], 5, strides=1, padding='same', activation='relu', name='deconv2'))

# Final layer restores the input's channel count; no activation is applied.
model.add(Conv2DTranspose(input_shape[2], 5, strides=1, padding='same', name='deconv1'))
model.summary()

In [None]:
# Render the model architecture (with layer shapes) to a PNG in the save
# directory, then print the layer summary.
plot_model(model, to_file=args['save_dir'] + '/%s-pretrain-model.png' % 'data', show_shapes=True)
model.summary()

In [None]:
# Compile with Adam and mean-squared-error loss, and create a logger that
# writes per-epoch metrics to a CSV file in the save directory.
optimizer = 'adam'
model.compile(optimizer=optimizer, loss='mse')
from keras.callbacks import CSVLogger
csv_logger = CSVLogger(args['save_dir'] + '/%s-pretrain-log.csv' % 'data')

In [None]:
import tensorflow as tf
# Stop once the training loss has not improved for 5 epochs.
# NOTE(review): this callback is created here but is not in the callbacks list
# passed to model.fit in the training cell — confirm whether that is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'loss', patience = 5)
# Keep only the weights with the lowest training loss in best_model.h5.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = 'best_model.h5', monitor = 'loss', save_best_only = True)

In [None]:
# begin training: fit the autoencoder to reconstruct its own input, time the
# run, and save the final model under the save directory.
from time import time
t0 = time()
# The callbacks list was missing its closing bracket, which made this cell a
# syntax error.
# NOTE(review): `early_stopping` is defined in an earlier cell but is not
# included here — confirm whether it should be.
history = model.fit(
    x, x,
    batch_size=args['batch_size'],
    epochs=args['epochs'],
    verbose = 1,
    callbacks=[model_checkpoint, csv_logger],
)
print('Training time: ', time() - t0)
model.save(args['save_dir'] + '/%s-pretrain-model-%d.h5' % ('data', args['epochs']))

In [None]:
import matplotlib.pyplot as plt
# Plot the training-loss curve, leaving out the first two epochs.
training_loss = history.history['loss']
epoch_count = range(1, len(training_loss) + 1)
plt.figure(figsize = (25,5))
plt.plot(epoch_count[2:], training_loss[2:], "r--", label = "Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# extract features: cut the trained network at the 'embedding' layer and run
# the inputs through it
embedding_layer = model.get_layer(name='embedding')
feature_model = Model(inputs=model.input, outputs=embedding_layer.output)
features = feature_model.predict(x)
print('feature shape=', features.shape)

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Cluster the embeddings hierarchically. The cluster count comes from the
# shared config (same value as the previous literal) so it is set in one place.
Agg_Clustering = AgglomerativeClustering(n_clusters = args['n_clusters'])
# Flatten each sample to one row; the ndarray method avoids np.reshape's
# deprecated `newshape` keyword.
features = features.reshape(features.shape[0], -1)
pred = Agg_Clustering.fit_predict(features)

In [None]:
# Cluster label assigned to each sample.
pred

In [None]:
# Convert the index-name list to an array; this rebinds `indices`.
indices = np.array(indices)
indices.shape

In [None]:
# Stack names and labels as two rows, then transpose to one (name, label) row
# per index.
# NOTE(review): combining names with integer labels in one array presumably
# stores the labels as strings — the cast back to int32 happens in the next cell.
clustered_data = np.vstack((indices, pred)).T

In [None]:
# Two-column table of index name and cluster id, with the id cast to int32.
cluster_df = (
    pd.DataFrame(clustered_data, columns = ['Index','Cluster'])
    .astype({'Cluster': np.int32})
)
cluster_df

In [None]:
# Persist the index-to-cluster table to the working directory (row index included).
cluster_df.to_csv('cluster_Index_data_latest.csv')

In [None]:
# Group index names by cluster id. The loop previously read and wrote
# `stock_clusters`, which was never created (only `index_clusters` was), so the
# cell failed with a NameError.
stock_clusters = {}
for cluster_id, index_name in zip(cluster_df['Cluster'], cluster_df['Index']):
  stock_clusters.setdefault(cluster_id, []).append(index_name)
# Keep the originally declared name bound to the same mapping.
index_clusters = stock_clusters

In [None]:
# Print the members of every cluster in ascending cluster-id order.
categories = np.sort(cluster_df['Cluster'].unique())
for cluster_id in categories:
  print(f'Cluster {cluster_id} : {stock_clusters[cluster_id]}')
  print('\n')

In [None]:
# Reload the fundamentals via load_data, rebinding `data` (the engineered
# feature frame is no longer reachable under this name).
# NOTE(review): this uses load_data's default relative path 'fundamentals.csv',
# not the Drive path used in the earlier load cell — confirm the file exists
# in the working directory.
data = load_data()

In [None]:
import matplotlib.dates as mdates

# Compare two indices on one chart: min-max scaled PX_LAST over the last
# 360*5 rows.
name_1 = 'NDQ Index'
name_2 = 'SBF120 Index'

# Keep the names in their own variables: reusing s1/s2 for both name and data
# meant `label=` received the whole Series instead of the index name.
s1 = data[name_1]['PX_LAST'][-360*5:].copy()
s1 = s1.astype(np.float32)
s2 = data[name_2]['PX_LAST'][-360*5:].copy()
s2 = s2.astype(np.float32)

s1 = (s1 - s1.min())/(s1.max() - s1.min())
s2 = (s2 - s2.min())/(s2.max() - s2.min())

fig, ax = plt.subplots(figsize = (25,5))

ax.plot(s1, label = name_1)
ax.plot(s2, label = name_2)

# Thin out the x ticks to one every 180 positions.
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(start, end, 180))

# Without a legend the two lines cannot be told apart.
ax.legend()

plt.ylabel('Close Price')
plt.xlabel('Date')
plt.show()