In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
import pickle
import datetime
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Reading features by importance

In [2]:
# Feature names taken from the first column of a header-less CSV. Assumed to be
# ordered most-important first, since the helpers below keep only the first n entries.
features_by_importance = list(pd.read_csv('feature_counts_10d.csv',header=None)[0])
# Full dataset. It must contain a 'date' column (selected by badf_n_top_features)
# alongside the feature and target columns.
badf = pd.read_csv('data/badf.csv')

In [3]:
# Column holding the raw target that gets converted to 0/1 labels.
TARGET_COL = 'target_10d'
# NOTE(review): DATE_COL is not referenced in the cells shown here; the helper
# functions use the literal 'date' instead.
DATE_COL = 'date'
# Targets strictly above this value are labelled 1; values at or below it are labelled 0.
TARGET_THRESHOLD = 0.001

In [4]:
def badf_n_top_features(badf,features_by_importance,n,target_col):
    """Select the date, the first `n` ranked features and the target, indexed by date.

    Parameters
    ----------
    badf : DataFrame containing a 'date' column, the feature columns and `target_col`.
    features_by_importance : sequence of feature column names; only the first `n` are kept.
    n : number of leading entries of `features_by_importance` to keep.
    target_col : name of the target column, placed after the selected features.

    Returns
    -------
    A new DataFrame with columns [first n features..., target_col] and 'date' as its index.
    """
    wanted = ['date', *features_by_importance[:n], target_col]
    return badf[wanted].set_index('date')

def convert_target_labels(df,threshold,target_col):
    """Binarise `target_col` in place: 1 where value > threshold, 0 where value <= threshold.

    Both masks are computed from the raw values before anything is written. Building
    the second mask after the first assignment would re-test the freshly written 1s
    against the threshold and, for any threshold >= 1, reset them all to 0.

    Parameters
    ----------
    df : DataFrame to label. It is modified in place.
    threshold : cut-off separating the two classes.
    target_col : name of the column to binarise.

    Returns
    -------
    The same DataFrame object that was passed in. Rows whose target is NaN match
    neither mask and are left unchanged.
    """
    raw_values = df[target_col]
    is_positive = raw_values > threshold
    is_negative = raw_values <= threshold
    df.loc[is_positive,target_col] = 1
    df.loc[is_negative,target_col] = 0
    return df

# Turn the raw target_10d values into 0/1 labels (1 where value > TARGET_THRESHOLD).
# Feature selection is not done here; prep_datasets_holdout calls badf_n_top_features itself.
badf = convert_target_labels(badf,TARGET_THRESHOLD,TARGET_COL)

In [5]:
def prep_datasets_holdout(df,no_of_features,ignore_after,holdout_set_after,test_set_rows,train_set_rows,target,features_by_importance,cross_val_days):
    """Build contiguous train/test windows counted back from the end of the data.

    Parameters
    ----------
    df : full DataFrame with a 'date' column, feature columns and the `target` column.
    no_of_features : number of top-ranked features to keep.
    ignore_after : rows whose date index is >= this value are dropped.
    holdout_set_after : not used by this function; kept so existing calls still work.
    test_set_rows : number of rows in the test window.
    train_set_rows : nominal training window size (see Notes).
    target : name of the label column; it is cast to int.
    features_by_importance : ranked feature names, most important first.
    cross_val_days : number of rows to shift both windows back from the end.

    Returns
    -------
    (df, train_set_X, test_set_X, train_set_y, test_set_y), where `df` is the
    filtered, NaN-filled frame that the windows are sliced from.

    Notes
    -----
    * The test window ends one row before the end of the (shifted) data, so the
      last row is never part of a window, even with cross_val_days=0.
    * The training window ends where the test window starts, so it holds
      train_set_rows - 1 rows when enough data is available.
    * NaNs are filled with the column minimum computed over the whole filtered frame.
    """
    df = badf_n_top_features(df,features_by_importance,no_of_features,target)
    # Take an explicit copy so the column assignments below act on an independent
    # frame rather than on a selection of the frame returned above.
    df = df[df.index < ignore_after].copy()
    df[target] = df[target].astype('int')

    for column in df.columns:
        if df[column].isna().any():
            # Assign the result back: calling fillna(inplace=True) on df[column]
            # only updates a temporary when pandas Copy-on-Write is active.
            df[column] = df[column].fillna(df[column].min())

    # Window bounds as negative offsets from the end of the frame.
    test_end = -cross_val_days-1
    test_start = test_end-test_set_rows
    train_start = -test_set_rows-train_set_rows-cross_val_days

    train_set = df[train_start:test_start]
    test_set = df[test_start:test_end]
    train_set_X = train_set.drop(columns=target)
    train_set_y = train_set[target]
    test_set_X = test_set.drop(columns=target)
    test_set_y = test_set[target]

    return df,train_set_X,test_set_X,train_set_y,test_set_y

# Application to 2019 data (150 Days Method)

## Polynomial

In [6]:
# Settings for the polynomial-kernel run of the 150-day method.
C = 0.1  # passed to SVC as its C argument
kernel = 'poly'  # SVC kernel for this section
no_of_features = 5  # number of top-ranked features passed to prep_datasets_holdout

In [7]:
# 150-row test window at the end of the pre-2020 data, preceded by the training window.
df, train_set_X, test_set_X, train_set_y, test_set_y = prep_datasets_holdout(
    badf,
    no_of_features,
    ignore_after='2020-01-01',
    holdout_set_after='2019-01-01',
    test_set_rows=150,
    train_set_rows=1000,
    target='target_10d',
    features_by_importance=features_by_importance,
    cross_val_days=0,
)

In [8]:
# Read the kernel from the `kernel` setting instead of repeating the literal, so
# editing the settings cell actually changes the model that is built here.
# probability=True enables predict_proba; random_state seeds that probability estimation.
model = SVC(C=C,kernel=kernel,probability=True,random_state=42)

In [9]:
# Fit on the training window, then score the held-out window.
model.fit(train_set_X,train_set_y)
y_pred = model.predict(test_set_X)
# Per-class probabilities; available because the model was built with probability=True.
y_pred_proba = model.predict_proba(test_set_X)

In [10]:
# Rows are actual classes, columns are predicted classes (class 0 first).
confusion_matrix(test_set_y,y_pred)

array([[ 0, 51],
       [ 0, 99]])

In [11]:
# Score with the last predict_proba column, i.e. the class listed last in
# model.classes_ (class 1, assuming both labels were present in training).
roc_auc_score(test_set_y,y_pred_proba[:,-1])

0.704297880768469

## Linear

In [12]:
# Settings for the linear-kernel run of the 150-day method.
C = 0.1  # passed to SVC as its C argument
kernel = 'linear'  # passed to SVC as its kernel argument
no_of_features = 5  # number of top-ranked features passed to prep_datasets_holdout

In [13]:
# Same windows as the polynomial run: 150 test rows, no shift from the end.
df, train_set_X, test_set_X, train_set_y, test_set_y = prep_datasets_holdout(
    badf,
    no_of_features,
    ignore_after='2020-01-01',
    holdout_set_after='2019-01-01',
    test_set_rows=150,
    train_set_rows=1000,
    target='target_10d',
    features_by_importance=features_by_importance,
    cross_val_days=0,
)

In [14]:
# probability=True enables predict_proba; random_state seeds that probability estimation.
model = SVC(C=C,kernel=kernel,probability=True,random_state=42)

In [15]:
# Train on the training window and predict labels for the test window.
model.fit(train_set_X,train_set_y)
y_pred = model.predict(test_set_X)
# Class probabilities for the same test rows (one column per class).
y_pred_proba = model.predict_proba(test_set_X)

In [16]:
# Rows are actual classes, columns are predicted classes (class 0 first).
confusion_matrix(test_set_y,y_pred)

array([[ 0, 51],
       [ 0, 99]])

In [17]:
# AUC computed from the last probability column (the class listed last in model.classes_).
roc_auc_score(test_set_y,y_pred_proba[:,-1])

0.7050901168548226

# Application to 2019 data (Single Day prediction Method)

## Linear

In [18]:
C = 0.1
kernel = 'linear'
no_of_features = 5

pred = []

# Walk-forward evaluation with a one-row test window. Step i shifts the window back
# by i rows (cross_val_days=i) and retrains on the rows immediately before it, so
# results are collected newest-first.
# The step count is read from the `df` left by an earlier cell. range() evaluates it
# once, so rebinding `df` inside the loop does not change the number of iterations.
for i in range(len(df[df.index>'2019-01-01'])-1):
    df,train_set_X,test_set_X,train_set_y,test_set_y = prep_datasets_holdout(badf,no_of_features,'2020-01-01','2019-01-01',1,1000,'target_10d',features_by_importance,i)
    model = SVC(C=C,kernel=kernel,probability=True,random_state=42)
    model.fit(train_set_X,train_set_y)
    y_pred = model.predict(test_set_X)
    y_pred_proba = model.predict_proba(test_set_X)
    # test_set_y is indexed by date, so take the single label by position with
    # .iloc[0]; a bare [0] relies on the deprecated positional fallback of Series.
    pred.append({'y_pred':y_pred[0],'Pred Prob':y_pred_proba[0,-1],'Y Actual':test_set_y.iloc[0]})
predictions_holdout = pd.DataFrame(pred)

In [19]:
# Confusion matrix over all single-day predictions (rows actual, columns predicted).
confusion_matrix(predictions_holdout['Y Actual'],predictions_holdout['y_pred'])

array([[  2,  81],
       [ 21, 147]])

In [20]:
# AUC from the per-day probabilities. Each probability comes from a separately fitted model.
roc_auc_score(predictions_holdout['Y Actual'],predictions_holdout['Pred Prob'])

0.45510613884107864

In [21]:
# Share of single-day predictions whose label matches the actual label.
accuracy_score(predictions_holdout['Y Actual'],predictions_holdout['y_pred'])

0.5936254980079682

## Polynomial

In [22]:
C = 0.1
kernel = 'poly'
no_of_features = 5

pred = []

# Walk-forward evaluation with a one-row test window. Step i shifts the window back
# by i rows (cross_val_days=i) and retrains on the rows immediately before it, so
# results are collected newest-first.
# The step count is read from the `df` left by an earlier cell. range() evaluates it
# once, so rebinding `df` inside the loop does not change the number of iterations.
for i in range(len(df[df.index>'2019-01-01'])-1):
    df,train_set_X,test_set_X,train_set_y,test_set_y = prep_datasets_holdout(badf,no_of_features,'2020-01-01','2019-01-01',1,1000,'target_10d',features_by_importance,i)
    model = SVC(C=C,kernel=kernel,probability=True,random_state=42)
    model.fit(train_set_X,train_set_y)
    y_pred = model.predict(test_set_X)
    y_pred_proba = model.predict_proba(test_set_X)
    # test_set_y is indexed by date, so take the single label by position with
    # .iloc[0]; a bare [0] relies on the deprecated positional fallback of Series.
    pred.append({'y_pred':y_pred[0],'Pred Prob':y_pred_proba[0,-1],'Y Actual':test_set_y.iloc[0]})
predictions_holdout = pd.DataFrame(pred)

In [23]:
# Confusion matrix over all single-day predictions (rows actual, columns predicted).
confusion_matrix(predictions_holdout['Y Actual'],predictions_holdout['y_pred'])

array([[  2,  81],
       [ 14, 154]])

In [24]:
# AUC from the per-day probabilities. Each probability comes from a separately fitted model.
roc_auc_score(predictions_holdout['Y Actual'],predictions_holdout['Pred Prob'])

0.4675487664945496

In [25]:
# Share of single-day predictions whose label matches the actual label.
accuracy_score(predictions_holdout['Y Actual'],predictions_holdout['y_pred'])

0.6215139442231076

# Application to 2019 data (all data in a single shot)

## Linear

In [26]:
C = 0.1
kernel = 'linear'
no_of_features = 5

# One test window sized from the rows dated after 2019-01-01 (minus one), so a single
# fit is scored on that whole period. The row count is read from the `df` left by an
# earlier cell and is evaluated before `df` is rebound by this call.
df, train_set_X, test_set_X, train_set_y, test_set_y = prep_datasets_holdout(
    badf,
    no_of_features,
    ignore_after='2020-01-01',
    holdout_set_after='2019-01-01',
    test_set_rows=len(df[df.index>'2019-01-01'])-1,
    train_set_rows=1000,
    target='target_10d',
    features_by_importance=features_by_importance,
    cross_val_days=0,
)

In [27]:
# probability=True enables predict_proba; random_state seeds that probability estimation.
model = SVC(C=C,kernel=kernel,probability=True,random_state=42)

In [28]:
# Single fit on the training window, scored on the whole test window at once.
model.fit(train_set_X,train_set_y)
y_pred = model.predict(test_set_X)
# Class probabilities for the same test rows (one column per class).
y_pred_proba = model.predict_proba(test_set_X)

In [29]:
# Rows are actual classes, columns are predicted classes (class 0 first).
confusion_matrix(test_set_y,y_pred)

array([[46, 37],
       [84, 84]])

In [30]:
# AUC computed from the last probability column (the class listed last in model.classes_).
roc_auc_score(test_set_y,y_pred_proba[:,-1])

0.5384394721744119

In [31]:
# Share of test rows whose predicted label matches the actual label.
accuracy_score(test_set_y,y_pred)

0.5179282868525896

## Polynomial

In [32]:
C = 0.1
kernel = 'poly'
no_of_features = 5

# One test window sized from the rows dated after 2019-01-01 (minus one), so a single
# fit is scored on that whole period. The row count is read from the `df` left by an
# earlier cell and is evaluated before `df` is rebound by this call.
df, train_set_X, test_set_X, train_set_y, test_set_y = prep_datasets_holdout(
    badf,
    no_of_features,
    ignore_after='2020-01-01',
    holdout_set_after='2019-01-01',
    test_set_rows=len(df[df.index>'2019-01-01'])-1,
    train_set_rows=1000,
    target='target_10d',
    features_by_importance=features_by_importance,
    cross_val_days=0,
)

In [33]:
# probability=True enables predict_proba; random_state seeds that probability estimation.
model = SVC(C=C,kernel=kernel,probability=True,random_state=42)

In [34]:
# Single fit on the training window, scored on the whole test window at once.
model.fit(train_set_X,train_set_y)
y_pred = model.predict(test_set_X)
# Class probabilities for the same test rows (one column per class).
y_pred_proba = model.predict_proba(test_set_X)

In [35]:
# Rows are actual classes, columns are predicted classes (class 0 first).
confusion_matrix(test_set_y,y_pred)

array([[54, 29],
       [90, 78]])

In [36]:
# AUC computed from the last probability column (the class listed last in model.classes_).
roc_auc_score(test_set_y,y_pred_proba[:,-1])

0.541810097532989

In [37]:
# Share of test rows whose predicted label matches the actual label.
accuracy_score(test_set_y,y_pred)

0.5258964143426295