## Initialize and Import Libraries

In [None]:
import warnings
import pandas as pd 
import numpy as np
import tensorflow as tf

import time, sys
from IPython.display import clear_output

from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Flatten
from tensorflow.keras.utils import normalize
from keras.utils.np_utils import to_categorical

from google.colab import drive
import os 
import pprint
from sklearn import metrics

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
#np.random.seed(7)
# Mount Google Drive; the CSV paths read and written below live under /content/gdrive.
drive.mount('/content/gdrive')

In [None]:
def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    `progress` is treated as a fraction: ints are converted to float, any
    other non-float value counts as 0, and the value is clamped to [0, 1]
    before rendering. The previous cell output is cleared first.
    """
    bar_length = 20

    # Coerce the argument to something we can scale.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Clamp to the valid range.
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

## Load Data from Drive


In [None]:
# Monthly extract stored on the mounted Drive.
chass_monthly_file_location = "/content/gdrive/My Drive/1977monthly.csv"
# Read the CSV and convert the 'datadate' column to datetimes in one pass.
data_extract_monthly = pd.read_csv(chass_monthly_file_location).assign(
    datadate=lambda frame: pd.to_datetime(frame['datadate'])
)

#### Constituent Mapping

In [None]:
# Index-membership history stored on the mounted Drive.
tsx_constituents_file_location = "/content/gdrive/My Drive/constituents_tsx.csv"
tsx_constituents = pd.read_csv(tsx_constituents_file_location)
# Both interval bounds are stored as month/day/year text.
for date_column in ('from', 'thru'):
    tsx_constituents[date_column] = pd.to_datetime(tsx_constituents[date_column], format='%m/%d/%Y')
# Keep only the rows that describe the composite index.
is_composite = tsx_constituents['conm'] == 'S&P/TSX Composite Index'
tsx_constituents = tsx_constituents[is_composite]

## Transform Data

### Map S&P TSX Composite Constituents and Create Time Series

#### Monthly Data 

In [None]:
# Rows of the monthly extract that match a constituent by CUSIP or by ticker.
cusip_match = data_extract_monthly['cusip'].isin(tsx_constituents['co_cusip'])
ticker_match = data_extract_monthly['ticker'].isin(tsx_constituents['co_tic'])
temp1 = data_extract_monthly[cusip_match]
temp2 = data_extract_monthly[ticker_match]

# Stack both selections, drop rows picked up by both criteria, renumber.
temp3 = pd.concat([temp1, temp2])
temp3 = temp3.drop_duplicates()
temp3 = temp3.reset_index()
# A missing monthly return is treated as 0.
temp3['prc_mret'] = temp3['prc_mret'].fillna(0)

# Per-ticker return series ordered by date and indexed by date.
return_columns = ['ticker', 'datadate', 'prc_mret']
temp = temp3[return_columns].sort_values(return_columns).set_index('datadate')

In [None]:
# Length of the rolling window, in rows of the monthly series.
J = 12
temp['logret'] = np.log(1 + temp['prc_mret'])
# Per-ticker rolling sum of log returns; windows with fewer than J rows give NaN.
rolling_logret = temp.groupby(['ticker'])['logret'].rolling(window=J, min_periods=J)
umd = rolling_logret.sum().reset_index()

In [None]:
# Convert the summed log return back to a simple cumulative return.
umd['cumret'] = np.exp(umd['logret']) - 1
# Drop rows without a full window, renumber, then keep 1982 onwards.
umd = umd.dropna(axis=0, subset=['cumret']).reset_index()
umd = umd.loc[umd['datadate'] >= '1982-01-01']

In [None]:
# Spot-check: display the rows computed for ticker 'BCE'.
umd[umd['ticker'] == 'BCE']

In [None]:
# Wide layout: one row per date, one column per ticker, cumulative return as
# the cell value; cells with no observation become 0.
temp_monthly = (
    umd.pivot_table(values='cumret', index='datadate', columns='ticker')
    .fillna(0)
)
stock_list_monthly = temp_monthly.columns

In [None]:
# Display how many ticker columns the pivoted table has.
len(stock_list_monthly)

### Create Time Series Constituents Matrix

#### Monthly

In [None]:
# Build a 0/1 matrix with the same date index as `temp_monthly` and one column
# per ticker: 1 where the date falls inside any of that ticker's from/thru
# membership intervals, 0 otherwise.
#
# The previous version reassigned the whole column for every interval, so a
# ticker that left and re-entered the index kept only its LAST interval. The
# intervals are now OR-ed together.
k = 0
total = len(temp_monthly.columns)
constituents_monthly = pd.DataFrame(index=temp_monthly.index)
month_index = temp_monthly.index

for stock in stock_list_monthly:
    stock_cusip = temp3[temp3['ticker'] == stock]['cusip'].unique()[0]

    # Look the membership rows up by CUSIP first, falling back to the ticker.
    membership_rows = tsx_constituents[tsx_constituents['co_cusip'] == stock_cusip]
    if len(membership_rows) == 0:
        membership_rows = tsx_constituents[tsx_constituents['co_tic'] == stock]

    # Compare at day resolution; an open-ended interval runs through today.
    stock_dates_from = membership_rows['from'].dt.normalize()
    stock_dates_thru = membership_rows['thru'].fillna(pd.to_datetime('today')).dt.normalize()

    # As before, a ticker with no membership rows gets no column at all.
    if len(stock_dates_from) > 0:
        is_member = np.zeros(len(month_index), dtype=bool)
        for interval_start, interval_end in zip(stock_dates_from, stock_dates_thru):
            is_member |= (month_index >= interval_start) & (month_index <= interval_end)
        constituents_monthly[stock] = is_member.astype(int)

    k = k + 1
    update_progress(k / total)

update_progress(1)

### Calculate Mean And Create Training Set


#### Monthly

In [None]:
# Work on a deep copy so the label columns added below do not touch temp_monthly.
dataset_monthly = temp_monthly.copy(deep=True)

In [None]:
# Per-date (row-wise) median, mean and standard deviation over index members.
#
# Multiplying by the 0/1 membership matrix turned every non-member cell into
# a 0.0 return that still took part in the statistics. Non-member cells are
# masked to NaN instead, which pandas reductions skip. Columns absent from
# `constituents_monthly` are masked as well, matching the NaN columns the
# product used to produce for them.
# NOTE(review): dates with no member at all now yield NaN rather than 0 —
# confirm none fall inside the study range.
member_returns = dataset_monthly.where(
    constituents_monthly.reindex_like(dataset_monthly) == 1
)
median = member_returns.median(axis=1)
mean = member_returns.mean(axis=1)
std = member_returns.std(axis=1)

In [None]:
# Add one '<ticker>_out' label column per ticker.
k = 0
stock_count = len(stock_list_monthly)
for k, c in enumerate(stock_list_monthly, start=1):
    # 0 where the value is at or above that row's median, 1 otherwise.
    below_median = np.where(dataset_monthly[c] >= median, 0, 1)
    # Move every label one row up so a row carries the following row's label;
    # the final row has no successor and is padded with 0.
    dataset_monthly[c + '_out'] = np.append(below_median[1:], [0])
    update_progress(k / stock_count)

# Fill gaps first, then attach the row-wise statistics as extra columns.
dataset_monthly = dataset_monthly.fillna(0).assign(median=median, mean=mean, std=std)
update_progress(1)

## Train Data

### Parameters

In [None]:
# Fraction of each study window used for fitting; the remaining rows form the test slice.
train_size = .9
# Number of rows the study window advances between iterations.
month_offset = 12
# Only used to derive study_period below.
num_of_years = 20
# Leading rows of each test slice that receive no prediction; also the
# sliding-window length in the monthly+daily data_prep.
look_back = 12
# Number of daily rows attached to each sample in the monthly+daily data_prep.
daily_length = 20
# Size of the trailing axis of the sample array in the monthly+daily data_prep.
step = 1
# Rows per study window.
study_period = look_back * num_of_years
num_of_rows, num_of_columns = dataset_monthly.shape
# NOTE(review): the training cell assigns num_of_iterations a fixed value before using it.
num_of_iterations = int(num_of_rows/6) - look_back - month_offset

### Input/Output Preparation Methods

#### Monthly Only Preparation

In [None]:
def data_prep(look_back,step,column_subset,trainset):
    """Flatten the selected stocks into one feature vector and one label vector.

    For every name in `column_subset`, the values of `trainset[name]` and
    `trainset[name + '_out']` are appended, stock after stock, to the feature
    and label outputs respectively. `look_back` and `step` are accepted but
    not used here.

    Returns a tuple `(X_in, y_in)` of 1-D arrays: float32 features and int32
    labels.

    NOTE(review): a later cell defines another function with this same name;
    whichever definition ran last is the one in effect.
    """
    # Seed each list with an empty float array so an empty `column_subset`
    # still yields empty 1-D outputs.
    feature_parts = [np.empty((0,))]
    label_parts = [np.empty((0))]

    for stock in column_subset:
        features = np.asarray(trainset[stock], dtype=np.float32)
        labels = np.asarray(trainset[stock + '_out'])
        # Report any stock that has more feature rows than label rows.
        if len(features) - len(labels) > 0:
            print(stock)
        feature_parts.append(features)
        label_parts.append(labels)

    X_in = np.concatenate(feature_parts, axis=0).astype(np.float32)
    y_in = np.concatenate(label_parts, axis=0).astype(np.int32)
    return (X_in, y_in)

#### Merge Monthly with Daily

In [None]:
def data_prep(look_back,step,column_subset,trainset):
  """Build samples that join a monthly look-back window with daily rows.

  For each stock in `column_subset`, `daily_length - 1` samples are built.
  Sample `i` stacks the monthly window `trainset[stock][i:i + look_back]` on
  top of the `daily_length` rows of `dataset_daily[stock]` that precede the
  date found at position `i + look_back + 1` of `trainset`.

  Reads the module-level names `daily_length`, `dataset_daily`,
  `to_categorical` and `normalize`.

  Returns `(normalize(X), y)`: X is float32 with shape
  (samples, look_back + daily_length, 1); y is the int32 one-hot encoding of
  `trainset[stock + '_out']` from row `look_back` up to, not including, the
  last row.

  NOTE(review): `dataset_daily` is not defined anywhere in the visible part
  of this file — confirm it is loaded before this function is called.
  NOTE(review): each stock adds `daily_length - 1` samples to X but
  `len(trainset) - look_back - 1` label rows to y; the two only agree when
  `len(trainset) == look_back + daily_length` — confirm intended usage.
  NOTE(review): this definition reuses the name of the monthly-only
  `data_prep` defined earlier; whichever definition ran last is in effect.
  """
  X_s = np.empty((0, look_back+daily_length, step))
  y_s = np.empty((0, 2))
  X_out = np.empty(shape=(daily_length-1,look_back+daily_length,1))

  for stock in column_subset:

    # Start a fresh daily buffer for every stock. It used to be created once
    # before the loop, so it kept growing and the slices taken below always
    # came from the first stock's data.
    x_daily = np.array([])

    #normalize trainset data 
    #trainset[stock] = (trainset[stock] - trainset[stock][:int(len(trainset))].mean())/trainset[stock][:int(len(trainset))].std()

    for i in range(daily_length-1):
      # Position in the daily index of the monthly date for sample i.
      month_end_index = dataset_daily.index.get_loc(dataset_daily[dataset_daily.index == trainset[stock][(i+look_back+1):(i+1+look_back+1)].index[0].strftime('%Y-%m-%d')].index[0])
      lookback_index = month_end_index - daily_length
      x_daily = np.append(x_daily,np.asarray(dataset_daily.iloc[lookback_index:month_end_index][stock],dtype=np.float32))

    timeseries = np.asarray(trainset[stock],dtype=np.float32)
    x_daily = np.atleast_2d(x_daily)
    timeseries = np.atleast_2d(timeseries)

    # Turn the single row into a column so windows slice along axis 0.
    if timeseries.shape[0] == 1:
      timeseries = timeseries.T

    X = np.atleast_3d(np.array([timeseries[start:start + look_back] for start in range(0, timeseries.shape[0] - look_back)]))

    if x_daily.shape[0] == 1:
      x_daily = x_daily.T

    y_series = np.asarray(trainset[stock + '_out'])
    y = y_series[look_back:len(y_series)-1]
    y = to_categorical(y)

    for i in range(daily_length-1):
      # Monthly window first, then that sample's block of daily rows.
      X_out[i] = np.concatenate((X[i],x_daily[(i*daily_length):((i*daily_length)+daily_length)]), axis=0)

    # np.append copies, so reusing the X_out buffer across stocks is safe.
    X_s = np.append(X_s, X_out, axis=0)
    y_s = np.append(y_s, y, axis=0)

  X_in = X_s.astype(np.float32)
  y_in = y_s.astype(np.int32)
  # NOTE(review): Keras `normalize` defaults to axis=-1, and the last axis
  # here has length 1 — confirm that is the intended axis.
  return(normalize(X_in),y_in)

### Train and Test

In [None]:
from sklearn.linear_model import LogisticRegression

# Walk-forward evaluation. Each iteration takes a `study_period`-row window
# that starts `month_offset` rows after the previous one, fits a logistic
# regression on the first `train_size` share of it, scores the remaining rows
# and writes the scored test slice to "<i>results.csv".
j = 0
i = 0
num_of_iterations = 18

# Create the output folder up front so a missing directory cannot fail the
# run after the model has already been fitted.
results_dir = "/content/gdrive/My Drive/temp_out_logit1/"
os.makedirs(results_dir, exist_ok=True)

while i < num_of_iterations:

    print("On Iteration " + str(i) + " of " + str(num_of_iterations))

    datasubset = dataset_monthly[i*month_offset:(i*month_offset)+study_period].copy(deep=True)
    split_row = int(len(datasubset) * train_size)
    trainset = datasubset[:split_row].copy(deep=True)
    testset = datasubset[split_row:].copy(deep=True)

    # Stocks flagged as index members on the date 13 rows before the end of
    # the test slice.
    column_subset = constituents_monthly.loc[testset.index[len(testset.index)-13]]
    column_subset = column_subset[column_subset == 1].index

    X_in,y_in = data_prep(look_back,step,column_subset,trainset)
    clf = LogisticRegression(solver='lbfgs').fit(X_in.reshape(-1,1), y_in)

    # Only rows from position `look_back` onwards receive a prediction.
    scored_rows = testset.index[look_back:]

    s = 0
    for c in column_subset:
        testset[c + '_dn'] = 0.0000000
        testset[c + '_up'] = 0.0000000

        if len(scored_rows) > 0:
            # Score the whole column once and give every row ITS OWN
            # probabilities. The previous code re-ran predict_proba for each
            # row and always stored row 0's result (yp[0]).
            features = np.asarray(testset[c]).reshape(-1,1)
            probabilities = clf.predict_proba(features)
            testset.loc[scored_rows, c + '_up'] = probabilities[look_back:, 0]
            testset.loc[scored_rows, c + '_dn'] = probabilities[look_back:, 1]
            testset[c + '_out_pred'] = np.where(testset[c +'_up'] >= testset[c + '_dn'], 0, 1)

        s = s + 1
        update_progress(s/len(column_subset))

    result_file_name = results_dir + str(i) + "results.csv"
    testset.to_csv(result_file_name)

    i = i + 1
    update_progress(i / num_of_iterations)
    del datasubset

update_progress(1)

## Verification Code (corrected)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from matplotlib import pyplot as plt
# Empty placeholder frame; the only later reference to this global is commented out.
results_extract = pd.DataFrame()

In [None]:
def prepare_output(id_vars, results_extract,dataset_monthly,i):
  """Score one iteration's predictions and draw its ROC curve.

  Parameters:
    id_vars: stock names to evaluate.
    results_extract: long-format frame with columns 'variable' (stock name),
      'datadate', 'pred' (predicted label) and 'dn' (score for label 1).
    dataset_monthly: frame holding the actual '<stock>_out' labels.
    i: 1-based position of the plot inside a 4x5 subplot grid.

  Returns:
    (year, ratio): `year` is the text before the first '-' of the earliest
    date of the last stock that had predictions. `ratio` is tn / (tn + fn)
    from the confusion matrix, i.e. the share of rows predicted 0 whose
    actual label is 0. The caller stores it under the name 'accuracy'.

  Raises:
    ValueError: if none of `id_vars` has any prediction rows.
  """
  # Collect per-stock pieces in lists. The previous np.array(0) seeds were
  # flattened by np.append into a real leading 0, which added one bogus
  # sample to the confusion matrix and to the ROC curve.
  actual_parts = []
  pred_parts = []
  score_parts = []
  min_date = None

  for stock in id_vars:
    test = results_extract[results_extract['variable'] == stock].set_index('datadate').sort_index()
    if len(test) == 0:
      continue
    min_date = min(test.index)
    # Actual labels from the first predicted date on, one per prediction row.
    test2 = dataset_monthly[[stock +'_out']]
    test2 = test2[test2.index >= min_date]
    test2 = test2.iloc[:len(test)]
    # Keep the three vectors the same length even if labels run out early.
    rows = min(len(test), len(test2))
    actual_parts.append(np.asarray(test2[stock +'_out'])[:rows])
    pred_parts.append(np.asarray(test['pred'])[:rows])
    #the dn variable ended up being the positive predictor
    score_parts.append(np.asarray(test['dn'])[:rows])

  if min_date is None:
    raise ValueError("no prediction rows found for any of the requested stocks")

  y_actual = np.concatenate(actual_parts).astype(int)
  y_pred = np.concatenate(pred_parts).astype(int)
  y_scores = np.concatenate(score_parts).astype(float)

  # Fixing the label set keeps the matrix 2x2 even when one class is absent.
  tn, fp, fn, tp = confusion_matrix(y_actual, y_pred, labels=[0, 1]).ravel()
  #since classes are reverse here with 0 being positive and 1 being negative
  fpr, tpr, thresholds = metrics.roc_curve(y_actual, y_scores,pos_label=1)

  # str() lets the year extraction work for text dates and Timestamps alike.
  year = str(min_date).split(sep='-')[0]

  plt.subplot(4, 5, i)
  plt.plot([0, 1], [0, 1], 'k--')
  plt.plot(fpr, tpr, label='LR')
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.title('ROC curve for ' + year)
  plt.legend(loc='best')
  return (year,tn/(tn+fn))

In [None]:
# For every saved iteration: reload its results CSV, reshape the per-stock
# prediction columns into long format, score them with prepare_output and
# collect one (year, accuracy) row per iteration in accuracy_df.
k = 0
i = 0
iterations = 18
j = 0
time_frame = 12
offset = 12
plt.figure(figsize=(20,20))
accuracy_df = pd.DataFrame()

# Columns whose name contains any of these markers are not stock columns.
excluded_markers = ('_ismember', '_out', '_dn', '_up', 'mean', 'std', 'median', 'datadate')

for i in range(j,iterations):
  csv_name = "/content/gdrive/My Drive/temp_out_logit1/" + str(i) + "results.csv"
  df = pd.read_csv(csv_name)
  # Skip the leading rows of the test slice.
  df = df[offset:]
  print(df['datadate'])
  id_vars = [x for x in df.columns if not any(marker in x for marker in excluded_markers)]
  value_vars = [x for x in df.columns if ('_out' in x) or ('_dn' in x) ]

  # Melt stock by stock and concatenate once at the end rather than growing
  # a frame inside the loop.
  melted_frames = []
  for stock in id_vars:
    prediction_columns = [x for x in value_vars if x in (stock+'_dn', stock+'_out_pred')]
    if prediction_columns:
      temp = pd.melt(df, id_vars=['datadate'] + prediction_columns, value_vars=[stock])
      temp.rename(columns={stock+'_dn':'dn', stock+'_out_pred':'pred'}, inplace=True)
      melted_frames.append(temp)
  result = pd.concat(melted_frames) if melted_frames else pd.DataFrame()

  accuracy = prepare_output(id_vars,result,dataset_monthly,i+1)
  print(accuracy)
  temp_df = pd.DataFrame([[accuracy[0],accuracy[1]]],columns=['year','accuracy'])
  # DataFrame.append was removed in pandas 2.0; pd.concat gives the same frame.
  accuracy_df = pd.concat([accuracy_df, temp_df])

  k = k+1
  update_progress(k/iterations)
  #results_extract = pd.concat([results_extract,result])
  #accuracy = prepare_output(id_vars,result,dataset_monthly)

In [None]:
# Display the per-iteration (year, accuracy) table built by the loop above.
accuracy_df