In [22]:
# import dependencies
%matplotlib agg

import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from numpy import linalg
from sklearn import metrics

from model import BayesianLearning

In [23]:
# Data locations, relative to the notebook's working directory.
input_folder = "../Data Set/"    # folder the closing-price CSV is read from
output_folder = "../Results/"    # folder the dependency matrix and figures go to

In [24]:
# Load the closing-price table; the first CSV column is used as the row index.
closing_prices_csv = input_folder + 'ClosingPrices.csv'
closingPrices = pd.read_csv(closing_prices_csv, index_col=0)

### PREPROCESSING

In [25]:
# Column dtypes and non-null counts: a quick check that every ticker column is numeric and complete
closingPrices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2597 entries, 2010-11-04 to 2021-04-29
Data columns (total 49 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ADANIPORTS  2597 non-null   float64
 1   ASIANPAINT  2597 non-null   float64
 2   AXISBANK    2597 non-null   float64
 3   BAJAJ-AUTO  2597 non-null   float64
 4   BAJAJFINSV  2597 non-null   float64
 5   BAJFINANCE  2597 non-null   float64
 6   BHARTIARTL  2597 non-null   float64
 7   BPCL        2597 non-null   float64
 8   BRITANNIA   2597 non-null   float64
 9   CIPLA       2597 non-null   float64
 10  COALINDIA   2597 non-null   float64
 11  DRREDDY     2597 non-null   float64
 12  EICHERMOT   2597 non-null   float64
 13  GAIL        2597 non-null   float64
 14  GRASIM      2597 non-null   float64
 15  HCLTECH     2597 non-null   float64
 16  HDFC        2597 non-null   float64
 17  HDFCBANK    2597 non-null   float64
 18  HEROMOTOCO  2597 non-null   float64
 19  HINDALCO    2597 

In [26]:
# Count the missing values in each ticker column (every count should be 0)
missing_per_ticker = closingPrices.isna().sum()
print(missing_per_ticker)

ADANIPORTS    0
ASIANPAINT    0
AXISBANK      0
BAJAJ-AUTO    0
BAJAJFINSV    0
BAJFINANCE    0
BHARTIARTL    0
BPCL          0
BRITANNIA     0
CIPLA         0
COALINDIA     0
DRREDDY       0
EICHERMOT     0
GAIL          0
GRASIM        0
HCLTECH       0
HDFC          0
HDFCBANK      0
HEROMOTOCO    0
HINDALCO      0
HINDUNILVR    0
ICICIBANK     0
INDUSINDBK    0
INFY          0
IOC           0
ITC           0
JSWSTEEL      0
KOTAKBANK     0
LT            0
MARUTI        0
M&M           0
NESTLEIND     0
NTPC          0
ONGC          0
POWERGRID     0
RELIANCE      0
SBIN          0
SHREECEM      0
SUNPHARMA     0
TATAMOTORS    0
TATASTEEL     0
TCS           0
TECHM         0
TITAN         0
ULTRACEMCO    0
UPL           0
VEDL          0
WIPRO         0
ZEEL          0
dtype: int64


In [27]:
# Two-level (binary) discretisation: 0 when a price is below its column's median, 1 otherwise.
# Done with one vectorised comparison instead of a per-element lambda whose
# parameter shadowed the loop variable.
# NOTE(review): the medians are taken over the full history, which includes the
# dates later used as test data.
# NOTE: closingPrices is overwritten, so re-running this cell without reloading the
# CSV would discretise the already-binary values.
column_medians = closingPrices.median()
is_below_median = closingPrices.lt(column_medians, axis="columns")
# `~(<)` rather than `>=` keeps the original handling of a missing value (it maps to 1)
closingPrices = (~is_below_median).astype("int64")


### TRAIN/TEST SPLIT

In [28]:
# Chronological split on the date index (label slices include both end points):
# 2011-2019 is used for training, 2020 for testing.
train_period = slice('2011-01-01', '2019-12-31')
test_period = slice('2020-01-01', '2020-12-31')

train_data = closingPrices.loc[train_period]
test_data = closingPrices.loc[test_period]

In [29]:
# Preview the first five rows of the discretised training window
train_data.head()

Unnamed: 0_level_0,ADANIPORTS,ASIANPAINT,AXISBANK,BAJAJ-AUTO,BAJAJFINSV,BAJFINANCE,BHARTIARTL,BPCL,BRITANNIA,CIPLA,...,TATAMOTORS,TATASTEEL,TCS,TECHM,TITAN,ULTRACEMCO,UPL,VEDL,WIPRO,ZEEL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-03,0,1,1,0,0,0,1,1,0,0,...,1,1,0,1,1,0,0,1,1,0
2011-01-04,0,1,1,0,0,0,1,1,0,0,...,1,1,0,1,1,0,0,1,1,0
2011-01-05,0,1,1,0,0,0,0,1,0,0,...,1,1,0,1,1,0,0,1,1,0
2011-01-06,0,1,1,0,0,0,0,1,0,0,...,1,1,0,1,1,0,0,1,1,0
2011-01-07,0,1,1,0,0,0,0,1,0,0,...,1,1,0,1,1,0,0,1,1,0


In [30]:
# Preview the first five rows of the discretised test window
test_data.head()

Unnamed: 0_level_0,ADANIPORTS,ASIANPAINT,AXISBANK,BAJAJ-AUTO,BAJAJFINSV,BAJFINANCE,BHARTIARTL,BPCL,BRITANNIA,CIPLA,...,TATAMOTORS,TATASTEEL,TCS,TECHM,TITAN,ULTRACEMCO,UPL,VEDL,WIPRO,ZEEL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,1,1,1,1,1,1,1,0,1,0,...,0,1,0,1,1,1,1,0,0,1
2020-01-02,1,1,1,1,1,1,1,0,1,0,...,0,1,0,1,1,1,1,0,0,1
2020-01-03,1,1,1,1,1,1,1,0,1,0,...,0,1,1,1,1,1,1,0,0,0
2020-01-06,1,1,1,1,1,1,1,0,1,0,...,0,1,1,1,1,1,1,0,0,0
2020-01-07,1,1,1,1,1,1,1,0,1,0,...,0,1,1,1,1,1,1,0,0,0


### LEARNING

In [31]:
# Fit the model on the training window.
# NOTE(review): BayesianLearning comes from the project-local `model` module;
# what train() learns or stores is not visible from this notebook.
model = BayesianLearning()
model.train(train_data)

### TESTING

In [32]:
# Run the trained model over the test window.
# NOTE(review): presumably returns a 0/1 frame with the same columns as test_data —
# the cells below index it by column name and compare it with test_data; confirm against model.predict.
predicted_data = model.predict(test_data)

In [33]:
# Per stock, count the 1s in the training data, the predictions and the test data.
# A large gap between the last two numbers means the prediction deviates from the truth.
# All three counts are looked up by the same column name, so a different column
# order in predicted_data and test_data cannot pair a stock with the wrong counts.
print( 'Stock, Training, Prediction, Testing')
for stock in predicted_data.columns:
    print(stock, train_data[stock].sum(), predicted_data[stock].sum(), test_data[stock].sum())

Stock, Training, Prediction, Testing
ADANIPORTS 1001 14 219
ASIANPAINT 928 252 252
AXISBANK 1137 173 48
BAJAJ-AUTO 1003 63 217
BAJAJFINSV 968 231 252
BAJFINANCE 972 0 248
BHARTIARTL 967 252 252
BPCL 1258 79 1
BRITANNIA 984 86 236
CIPLA 1036 0 184
COALINDIA 1261 105 0
DRREDDY 969 0 251
EICHERMOT 1182 0 117
GAIL 1259 183 0
GRASIM 1282 9 0
HCLTECH 1191 0 29
HDFC 968 131 252
HDFCBANK 1051 176 129
HEROMOTOCO 1092 0 128
HINDALCO 1019 222 163
HINDUNILVR 968 136 252
ICICIBANK 1083 241 97
INDUSINDBK 1155 0 74
INFY 1176 212 5
IOC 1259 131 0
ITC 1299 0 0
JSWSTEEL 1258 55 0
KOTAKBANK 968 252 252
LT 1212 132 0
MARUTI 969 2 251
M&M 1289 0 0
NESTLEIND 968 90 252
NTPC 1263 123 0
ONGC 1259 113 0
POWERGRID 969 206 251
RELIANCE 934 209 248
SBIN 1161 252 39
SHREECEM 968 90 252
SUNPHARMA 1210 252 4
TATAMOTORS 1255 32 0
TATASTEEL 1098 189 82
TCS 1079 0 141
TECHM 1060 252 148
TITAN 928 252 252
ULTRACEMCO 972 152 248
UPL 1077 21 143
VEDL 1212 184 0
WIPRO 1195 82 0
ZEEL 1291 252 3


### EVALUATION

In [34]:
# Frobenius (entry-wise Euclidean) norm of each 2-D matrix: predictions first,
# then the actual test data. The two norms are printed separately, not their difference.
for matrix in (predicted_data, test_data):
    print(linalg.norm(matrix))

76.7333043730035
77.27871634544663


In [35]:
# Euclidean norm of each stock's series: actual test data vs. prediction
print("Stocks, Norm of original data, Norm of predicted data")
for stock in test_data.columns:
    actual_norm = linalg.norm(test_data[stock])
    predicted_norm = linalg.norm(predicted_data[stock])
    print(stock, actual_norm, predicted_norm)

Stocks, Norm of original data, Norm of predicted data
ADANIPORTS 14.798648586948742 3.7416573867739413
ASIANPAINT 15.874507866387544 15.874507866387544
AXISBANK 6.928203230275509 13.152946437965905
BAJAJ-AUTO 14.730919862656235 7.937253933193772
BAJAJFINSV 15.874507866387544 15.198684153570664
BAJFINANCE 15.748015748023622 0.0
BHARTIARTL 15.874507866387544 15.874507866387544
BPCL 1.0 8.888194417315589
BRITANNIA 15.362291495737216 9.273618495495704
CIPLA 13.564659966250536 0.0
COALINDIA 0.0 10.246950765959598
DRREDDY 15.84297951775486 0.0
EICHERMOT 10.816653826391969 0.0
GAIL 0.0 13.527749258468683
GRASIM 0.0 3.0
HCLTECH 5.385164807134504 0.0
HDFC 15.874507866387544 11.445523142259598
HDFCBANK 11.357816691600547 13.2664991614216
HEROMOTOCO 11.313708498984761 0.0
HINDALCO 12.767145334803704 14.89966442575134
HINDUNILVR 15.874507866387544 11.661903789690601
ICICIBANK 9.848857801796104 15.524174696260024
INDUSINDBK 8.602325267042627 0.0
INFY 2.23606797749979 14.560219778561036
IOC 0.0 11.4

In [36]:
# Overall F1 score, pooling every (day, stock) cell into one long vector
actual_flat = test_data.to_numpy().ravel()
predicted_flat = predicted_data.to_numpy().ravel()
print(metrics.f1_score(actual_flat, predicted_flat))

0.49106239460371


In [37]:
# F1 score for each stock
print("Stocks, F1 score")
for stock in test_data.columns:
    # zero_division=0: a stock with no 1s in either the test data or the prediction
    # has an undefined F1; report 0.0 explicitly instead of raising an
    # UndefinedMetricWarning for each such stock.
    stock_f1 = metrics.f1_score(test_data[stock], predicted_data[stock], zero_division=0)
    print(stock, stock_f1)

Stocks, F1 score
ADANIPORTS 0.12017167381974247
ASIANPAINT 1.0
AXISBANK 0.0
BAJAJ-AUTO 0.35714285714285715
BAJAJFINSV 0.9565217391304348
BAJFINANCE 0.0
BHARTIARTL 1.0
BPCL 0.0
BRITANNIA 0.43478260869565216
CIPLA 0.0
COALINDIA 0.0
DRREDDY 0.0
EICHERMOT 0.0
GAIL 0.0
GRASIM 0.0
HCLTECH 0.0
HDFC 0.6840731070496084
HDFCBANK 0.43934426229508206
HEROMOTOCO 0.0
HINDALCO 0.6909090909090908
HINDUNILVR 0.7010309278350516
ICICIBANK 0.5088757396449703
INDUSINDBK 0.0
INFY 0.04608294930875576
IOC 0.0
ITC 0.0
JSWSTEEL 0.0
KOTAKBANK 1.0
LT 0.0
MARUTI 0.015810276679841896
M&M 0.0
NESTLEIND 0.5263157894736842
NTPC 0.0
ONGC 0.0
POWERGRID 0.8971553610503282
RELIANCE 0.8971553610503283
SBIN 0.2680412371134021
SHREECEM 0.5263157894736842
SUNPHARMA 0.03125
TATAMOTORS 0.0
TATASTEEL 0.19188191881918817
TCS 0.0
TECHM 0.74
TITAN 1.0
ULTRACEMCO 0.74
UPL 0.25609756097560976
VEDL 0.0
WIPRO 0.0
ZEEL 0.023529411764705882


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


### Probabilities obtained

In [38]:
# Persist the model's dependency matrix; the graph section below reads it back
# from output_folder + 'dependency_matrix.csv'.
# NOTE(review): presumably writes a square stock-by-stock CSV — confirm against model.save_dependencies.
model.save_dependencies("dependency_matrix.csv", output_folder)

### GRAPH GENERATION

In [39]:
# Read the saved dependency matrix back; the first CSV column is used as the row index.
dependency_csv = output_folder + 'dependency_matrix.csv'
input_data = pd.read_csv(dependency_csv, index_col=0)

# Directed graph built from the raw matrix values (nodes are integer positions).
G = nx.DiGraph(input_data.to_numpy())

In [40]:
# Scale the dependency values to integer percentages and subtract 100 on the diagonal.
# The identity matrix is sized from the data instead of a hard-coded 49, so the cell
# keeps working when the number of stocks changes.
# NOTE: dtype=int truncates toward zero, it does not round.
n_stocks = len(input_data.columns)
m = np.array(input_data*100 - np.identity(n_stocks)*100, dtype=int)
labels = input_data.columns

In [41]:
# Overview graph of the whole (percentage-scaled) dependency matrix, saved as probabilities.png.
A2 = pd.DataFrame(m, index=labels, columns=labels)
overview_graph = nx.from_pandas_adjacency(A2)
# Fixed seed: draw_networkx would otherwise compute a fresh random spring layout on
# every run, so the saved figure could not be reproduced.
overview_pos = nx.spring_layout(overview_graph, seed=0)
nx.draw_networkx(overview_graph, pos=overview_pos)
plt.savefig(output_folder+"probabilities.png")
# plt.show() removed: the notebook selects the non-GUI `agg` backend, under which
# show() cannot display anything and only emits a warning.

  plt.show()


In [42]:
# Draw one dependency graph per stock and save it as <output_folder>dependencies/<stock>.png.
# For stock i only column i of the dependency matrix is used: an edge joins stock i
# and stock j whenever input_data.iloc[j, i] is non-zero.
LAYOUT_SEED = 0  # fixed seed so the spring layouts are reproducible across runs

dependency_folder = output_folder + "dependencies/"
os.makedirs(dependency_folder, exist_ok=True)  # savefig does not create missing folders

array = np.array(input_data.copy())
array[array != 0] = 1  # binarise: 1 wherever a dependency value is present
plt.clf()  # start from an empty figure

for i, stock in enumerate(labels):
    # Adjacency matrix that keeps only column i of the binarised matrix
    new_array = np.zeros_like(array)
    new_array[:, i] = array[:, i]

    A = pd.DataFrame(new_array.T, index=labels, columns=labels)
    G = nx.from_pandas_adjacency(A)
    isolated_nodes = list(nx.isolates(G))
    G.remove_nodes_from(isolated_nodes)

    pos = nx.spring_layout(G, seed=LAYOUT_SEED)

    # G is undirected, so an edge tuple (u, v) may list the current stock first or
    # second. Always label the edge with the matrix entry that created it:
    # row = the other stock, column = the current stock.
    edge_labels = {}
    for u, v in G.edges():
        other = v if u == stock else u
        edge_labels[(u, v)] = f"{input_data.loc[other, stock]:.2f}"

    nx.draw_networkx(G, pos, with_labels=True, font_weight='bold')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    plt.savefig(dependency_folder + stock + ".png")
    plt.clf()  # reset the figure before drawing the next stock


