In [1]:
import pandas as pd
from pandas.core import datetools
import os
import glob
import ntpath
from datetime import datetime
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
import scipy
from statsmodels import regression
import matplotlib.pyplot as plt
import seaborn as sns

  from ipykernel import kernelapp as app


In [2]:
excelfile = "C:/Users/ROHAN/Documents/GitProjects/Cubist/SignalData.xls"

In [3]:
pwd = os.getcwd()
os.chdir(os.path.dirname(excelfile))
data = pd.read_excel(os.path.basename(excelfile))
os.chdir(pwd)

In [4]:
data.head(10)

Unnamed: 0,symbol,industryId,stockReturn,sig1,sig2,sig3,sig4,sig5,sig6
0,sym0001,13,-0.01028,1.206965,8.95987,6.081876,4.857028,5.099576,37.3113
1,sym0002,54,-0.007853,0.91452,6.41128,4.343364,11.39166,5.095543,54.0172
2,sym0003,24,-0.060575,1.242627,8.69657,6.252786,-0.784681,5.099284,30.6382
3,sym0004,39,0.038937,1.560169,6.40386,4.305594,0.214442,5.098728,72.8453
4,sym0005,49,0.024219,,14.2074,8.145358,5.360521,5.099693,6.4709
5,sym0006,40,-0.030876,1.379096,9.19632,6.55253,0.786777,5.099841,47.561
6,sym0007,53,0.038601,1.389349,6.4,6.501426,0.024944,5.099608,3.6323
7,sym0008,43,0.090956,1.15653,8.90483,5.404435,15.18298,5.097408,1.3083
8,sym0009,21,0.050642,1.350138,13.3568,7.878196,3.84515,5.098654,13.8658
9,sym0010,53,-0.014633,1.069054,6.4,4.3,4.073453,5.099232,32.3387


In [5]:
#Check if there are any nan values
data[np.isnan(data.stockReturn)]

Unnamed: 0,symbol,industryId,stockReturn,sig1,sig2,sig3,sig4,sig5,sig6


In [6]:
#Convert sig2 into float
data.sig2=data.sig2.astype(dtype=float)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 9 columns):
symbol         1509 non-null object
industryId     1509 non-null int64
stockReturn    1509 non-null float64
sig1           1383 non-null float64
sig2           1504 non-null float64
sig3           1507 non-null float64
sig4           1501 non-null float64
sig5           1509 non-null float64
sig6           1509 non-null float64
dtypes: float64(7), int64(1), object(1)
memory usage: 106.2+ KB


In [7]:
data[data['industryId'] == 49]['sig1'].mean()

1.150180684210526

In [8]:
data.shape

(1509, 9)

In [9]:
#Fill NA in signals with industry means
data[['stockReturn','sig1','sig2','sig3','sig4','sig5','sig6']] = data.groupby("industryId").transform(lambda x: x.fillna(x.mean()))

In [10]:
data.head()


Unnamed: 0,symbol,industryId,stockReturn,sig1,sig2,sig3,sig4,sig5,sig6
0,sym0001,13,-0.01028,1.206965,8.959871,6.081876,4.857028,5.099576,37.3113
1,sym0002,54,-0.007853,0.91452,6.411277,4.343364,11.39166,5.095543,54.0172
2,sym0003,24,-0.060575,1.242627,8.696568,6.252786,-0.784681,5.099284,30.6382
3,sym0004,39,0.038937,1.560169,6.403864,4.305594,0.214442,5.098728,72.8453
4,sym0005,49,0.024219,1.150181,14.207439,8.145358,5.360521,5.099693,6.4709


In [11]:
def winsorize_series(s):
    q = s.quantile([0.01, 0.99])
    if isinstance(q, pd.Series) and len(q) == 2:
        s[s < q.iloc[0]] = q.iloc[0]
        s[s > q.iloc[1]] = q.iloc[1]
    return s

def winsorize_df(df):
    return df.apply(winsorize_series)

In [12]:
#winsorize_df(data[['sig1','sig2','sig3','sig4','sig5','sig6']])

Demean Returns and signals within industry group

In [13]:
data['stockReturnN'] = data['stockReturn'] - data.groupby('industryId')['stockReturn'].transform('mean')
data['sig1n'] = data['sig1'] - data.groupby('industryId')['sig1'].transform('mean')
data['sig2n'] = data['sig2'] - data.groupby('industryId')['sig2'].transform('mean')
data['sig3n'] = data['sig3'] - data.groupby('industryId')['sig3'].transform('mean')
data['sig4n'] = data['sig4'] - data.groupby('industryId')['sig4'].transform('mean')
data['sig5n'] = data['sig5'] - data.groupby('industryId')['sig5'].transform('mean')
data['sig6n'] = data['sig6'] - data.groupby('industryId')['sig6'].transform('mean')

In [14]:
data.head()

Unnamed: 0,symbol,industryId,stockReturn,sig1,sig2,sig3,sig4,sig5,sig6,stockReturnN,sig1n,sig2n,sig3n,sig4n,sig5n,sig6n
0,sym0001,13,-0.01028,1.206965,8.959871,6.081876,4.857028,5.099576,37.3113,-0.055509,-0.09072229,-4.941177e+19,-0.822164,-0.200482,0.000361,-11.782571
1,sym0002,54,-0.007853,0.91452,6.411277,4.343364,11.39166,5.095543,54.0172,-0.002768,-0.220626,-2e+19,-1.62858,7.274115,-0.002466,14.858119
2,sym0003,24,-0.060575,1.242627,8.696568,6.252786,-0.784681,5.099284,30.6382,-0.059701,0.003021474,-2.436765,-0.113541,0.181222,0.000188,-16.1622
3,sym0004,39,0.038937,1.560169,6.403864,4.305594,0.214442,5.098728,72.8453,0.019779,0.2437621,-8.4e+19,-2.834517,-1.972511,-0.000627,-3.37535
4,sym0005,49,0.024219,1.150181,14.207439,8.145358,5.360521,5.099693,6.4709,0.029995,2.220446e-16,-3.962264e+18,0.386173,96.328437,0.000227,-47.032376


<b>Standardize signals</b>

In [15]:
from sklearn import preprocessing,decomposition, linear_model, pipeline, metrics
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import statsmodels.api as sms

df_normalized = data[['symbol','industryId','stockReturnN','sig1n','sig2n','sig3n','sig4n','sig5n','sig6n']]

mapper_df = DataFrameMapper([
    (['symbol'],None),
    (['industryId'],  None),
    (['stockReturnN'], preprocessing.StandardScaler(),{'alias': 'normstockReturn'}),
    (['sig1n'], preprocessing.StandardScaler(),{'alias': 'normsig1'}),
    (['sig2n'], preprocessing.StandardScaler(),{'alias': 'normsig2'}),
    (['sig3n'], preprocessing.StandardScaler(),{'alias': 'normsig3'}),
    (['sig4n'], preprocessing.StandardScaler(),{'alias': 'normsig4'}),
    (['sig5n'], preprocessing.StandardScaler(),{'alias': 'normsig5'}),
    (['sig6n'], preprocessing.StandardScaler(),{'alias': 'normsig6'})
], df_out=True, default=None) #df_out returns dataframe and default None lets pass unmapped columns unchanged

df_normalized_scaled = mapper_df.fit_transform(df_normalized.copy()) 



In [16]:
df_normalized_scaled.head()

Unnamed: 0,symbol,industryId,normstockReturn,normsig1,normsig2,normsig3,normsig4,normsig5,normsig6
0,sym0001,13,-0.955093,-0.399479,-0.619733,-0.50258,-0.000754382,0.271068,-0.418239
1,sym0002,54,-0.0476233,-0.971487,-0.250844,-0.995533,0.0273713,-1.85015,0.52741
2,sym0003,24,-1.02721,0.0133045,-3.05624e-20,-0.0694067,0.000681907,0.140833,-0.5737
3,sym0004,39,0.34032,1.07336,-1.05355,-1.73271,-0.00742223,-0.470357,-0.119813
4,sym0005,49,0.516102,8.11862e-16,-0.0496956,0.236063,0.362468,0.170329,-1.66948


In [17]:
## Robust Linear Model RLM
#Complete_Data_array
Complete_Data = df_normalized_scaled[['normsig1','normsig2','normsig3','normsig4','normsig5','normsig6']]
Complete_Data_array=Complete_Data.as_matrix()
Complete_Data_array=sms.add_constant(Complete_Data_array)
#Dependent_Variable_array
Dependent_Variable_array = df_normalized_scaled[['normstockReturn']].as_matrix()

In [18]:
model_RLM = sms.RLM(Dependent_Variable_array.astype(np.float64), Complete_Data_array.astype(np.float64), M=sms.robust.norms.HuberT())
result_RLM=model_RLM.fit()
print(result_RLM.summary())

                    Robust linear Model Regression Results                    
Dep. Variable:                      y   No. Observations:                 1509
Model:                            RLM   Df Residuals:                     1502
Method:                          IRLS   Df Model:                            6
Norm:                          HuberT                                         
Scale Est.:                       mad                                         
Cov Type:                          H1                                         
Date:                Thu, 03 May 2018                                         
Time:                        12:12:35                                         
No. Iterations:                    24                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0003      0.022      0.012      0.9

In [19]:
model_OLS  = sms.OLS(Dependent_Variable_array.astype(np.float64), Complete_Data_array.astype(np.float64))
result_OLS = model_OLS.fit()
print(result_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     4.496
Date:                Thu, 03 May 2018   Prob (F-statistic):           0.000160
Time:                        12:12:35   Log-Likelihood:                -2127.7
No. Observations:                1509   AIC:                             4269.
Df Residuals:                    1502   BIC:                             4307.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.589e-19      0.026   2.97e-17      1.0

In [20]:
df_normalized_scaled['OLS_Sig'] = result_OLS.params[1]*df_normalized_scaled["normsig1"] + result_OLS.params[2]*df_normalized_scaled["normsig2"] + result_OLS.params[3]*df_normalized_scaled["normsig3"] + result_OLS.params[4]*df_normalized_scaled["normsig4"] + result_OLS.params[5]*df_normalized_scaled["normsig5"]+ result_OLS.params[6]*df_normalized_scaled["normsig6"]
df_normalized_scaled['RLM_Sig'] = result_RLM.params[1]*df_normalized_scaled["normsig1"] + result_RLM.params[2]*df_normalized_scaled["normsig2"] + result_RLM.params[3]*df_normalized_scaled["normsig3"] + result_RLM.params[4]*df_normalized_scaled["normsig4"] + result_RLM.params[5]*df_normalized_scaled["normsig5"]+ result_RLM.params[6]*df_normalized_scaled["normsig6"]

In [21]:
print((data["stockReturn"] * data["sig1"]).sum() / data["sig1"].sum())
print((data["stockReturn"] * data["sig2"]).sum() / data["sig2"].sum())
print((data["stockReturn"] * data["sig3"]).sum() / data["sig3"].sum())
print((data["stockReturn"] * data["sig4"]).sum() / data["sig4"].sum())
print((data["stockReturn"] * data["sig5"]).sum() / data["sig5"].sum())
print((data["stockReturn"] * data["sig6"]).sum() / data["sig6"].sum())

0.02095695204167168
0.03351914509744065
0.019826081143893007
-0.42003386536009657
0.01907458202776958
0.018738705762017013


In [22]:
print((data["stockReturn"] * df_normalized_scaled["normsig1"]).sum() *2.0 / df_normalized_scaled["normsig1"].abs().sum())
print((data["stockReturn"] * df_normalized_scaled["normsig2"]).sum() *2.0 / df_normalized_scaled["normsig2"].abs().sum())
print((data["stockReturn"] * df_normalized_scaled["normsig3"]).sum() *2.0 / df_normalized_scaled["normsig3"].abs().sum())
print((data["stockReturn"] * df_normalized_scaled["normsig4"]).sum() *2.0 / df_normalized_scaled["normsig4"].abs().sum())
print((data["stockReturn"] * df_normalized_scaled["normsig5"]).sum() *2.0 / df_normalized_scaled["normsig5"].abs().sum())
print((data["stockReturn"] * df_normalized_scaled["normsig6"]).sum() *2.0 / df_normalized_scaled["normsig6"].abs().sum())

0.021457852424749194
-0.00039127371095992497
0.00575808793921613
0.005423038908588497
-0.015473973097633653
-0.0006414894607192867


In [23]:
print((data["stockReturn"] * df_normalized_scaled['RLM_Sig']).sum() *2.0 / df_normalized_scaled['RLM_Sig'].abs().sum())
print((data["stockReturn"] * df_normalized_scaled['OLS_Sig']).sum() *2.0 / df_normalized_scaled['OLS_Sig'].abs().sum())

0.020732167509222625
0.02378974360625781


In [24]:
%matplotlib notebook

import sklearn
#import csv
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import itertools
from sklearn import preprocessing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

plt.ioff()
pd.set_option('display.max_columns', None)
X = np.array(df_normalized_scaled[["normsig1","normsig2","normsig3","normsig4","normsig5","normsig6"]])
Y = np.array(df_normalized_scaled["normstockReturn"])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/4., random_state=0)
print(X_train.shape)
print(Y_train.shape)

(1131, 6)
(1131,)


In [25]:
classifiers = [
    svm.SVR(kernel='rbf', C=1.0),
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()]

for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train, Y_train)
    
    Y_train_pred = clf.predict(X_train)
    Y_test_pred = clf.predict(X_test)

    train_mse = sklearn.metrics.mean_squared_error(Y_train, Y_train_pred)
    test_mse = sklearn.metrics.mean_squared_error(Y_test, Y_test_pred)
    print("Train MSE {}".format(train_mse))
    print("Test MSE {}".format(test_mse))
    
    if hasattr(clf, 'kernel') and clf.kernel == 'linear':
        coefficients = pd.DataFrame({"Coefficients":np.transpose(clf.coef_)})
        print(coefficients)
   

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
Train MSE 0.9056435045279508
Test MSE 1.0086264593122933
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)
Train MSE 0.9889994535063158
Test MSE 0.9997178989763371
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
Train MSE 0.9891786200891407
Test MSE 0.9797243368830316
LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose

In [26]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
clf = AdaBoostRegressor() #base_estimator='DecisionTreeRegressor',n_estimators=300, learning_rate=1.0, loss='linear', random_state=None
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)
 