In [1]:
"""Author: Sameer Vajjala Kesava

Predicting direction of stock market.

Fitting Logistic Regression and deriving statistics.
Splitting the data into training set (data from years 2001 to 2004) and test set (2005 data). 
No Cross Validation employed (yet).

Problem statement from Chapter 4
Book: An Introduction to Statistical Learning with Applications in R, 
       Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani

Dataset: Smarket (downloaded from https://www-bcf.usc.edu/~gareth/ISL/ and converted to csv)

Packages used: numpy, pandas, sklearn and scipy"""

'Author: Sameer Vajjala Kesava\n\nChapter 4:Lab Problem - Predicting direction of stock market.\n\nFitting Logistic Regression and deriving statistics.\nSplitting the data into training set (data from years 2001 to 2004) and test set (2005 data). \nNo Cross Validation employed (yet).\n\nBook: An Introduction to Statistical Learning with Applications in R, \n       Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani\n\nDataset: Smarket (downloaded from https://www-bcf.usc.edu/~gareth/ISL/ and converted to csv)\n\nPackages used: numpy, pandas, sklearn and scipy'

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.options.display.max_columns = 100

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
data_df = pd.read_csv("Smarket")
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1,2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2,2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,3,2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,4,2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,5,2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up
5,6,2001.0,0.213,0.614,-0.623,1.032,0.959,1.3491,1.392,Up
6,7,2001.0,1.392,0.213,0.614,-0.623,1.032,1.445,-0.403,Down
7,8,2001.0,-0.403,1.392,0.213,0.614,-0.623,1.4078,0.027,Up
8,9,2001.0,0.027,-0.403,1.392,0.213,0.614,1.164,1.303,Up
9,10,2001.0,1.303,0.027,-0.403,1.392,0.213,1.2326,0.287,Up


In [5]:
data_df.drop(columns='Unnamed: 0', inplace=True)
data_df.head(5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [6]:
data_df.shape

(1250, 9)

In [7]:
data_df.describe()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,2003.016,0.003834,0.003919,0.001716,0.001636,0.00561,1.478305,0.003138
std,1.409018,1.136299,1.13628,1.138703,1.138774,1.14755,0.360357,1.136334
min,2001.0,-4.922,-4.922,-4.922,-4.922,-4.922,0.35607,-4.922
25%,2002.0,-0.6395,-0.6395,-0.64,-0.64,-0.64,1.2574,-0.6395
50%,2003.0,0.039,0.039,0.0385,0.0385,0.0385,1.42295,0.0385
75%,2004.0,0.59675,0.59675,0.59675,0.59675,0.597,1.641675,0.59675
max,2005.0,5.733,5.733,5.733,5.733,5.733,3.15247,5.733


In [8]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 9 columns):
Year         1250 non-null float64
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(8), object(1)
memory usage: 88.0+ KB


In [9]:
#Converting the Direction column to Category
data_df['Direction'] = data_df['Direction'].astype('category')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 9 columns):
Year         1250 non-null float64
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null category
dtypes: category(1), float64(8)
memory usage: 79.5 KB


In [10]:
#pandas itself has plotting functions. Can use matplotlib or seaborn for statistical plotting
data_df['Volume'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f8fbfce70f0>

#### Checking correlations between different variables

In [11]:
#different methods available.
#automatically ignores non-numeric variables
data_df.corr(method='pearson')

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


#### Logistic Regression to predict stock market direction

In [12]:
#models
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

#### Splitting data into train (2001-2004) and test (2005) sets

In [13]:
#Leaving out 2005 year data out of training set
data_df.set_index('Year').head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [14]:
train_data = data_df.set_index('Year')
train_data.drop(index = 2005, inplace=True)

In [15]:
train_data.tail()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004.0,0.046,0.342,0.904,0.038,-0.749,0.9561,-0.431,Down
2004.0,-0.431,0.046,0.342,0.904,0.038,0.922,0.715,Up
2004.0,0.715,-0.431,0.046,0.342,0.904,0.983,-0.007,Down
2004.0,-0.007,0.715,-0.431,0.046,0.342,0.9259,0.008,Up
2004.0,0.008,-0.007,0.715,-0.431,0.046,0.8298,-0.134,Down


In [16]:
train_data.shape

(998, 8)

In [17]:
#Final form of training data
train_data_fit = train_data.drop(columns=['Today'])
train_data_fit.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,Up
2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,Up
2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,Down
2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,Up
2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,Up


In [18]:
train_data_fit.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 998 entries, 2001.0 to 2004.0
Data columns (total 7 columns):
Lag1         998 non-null float64
Lag2         998 non-null float64
Lag3         998 non-null float64
Lag4         998 non-null float64
Lag5         998 non-null float64
Volume       998 non-null float64
Direction    998 non-null category
dtypes: category(1), float64(6)
memory usage: 55.6 KB


In [19]:
#Encoding Direction variable
label_encoder = LabelEncoder()
train_data_fit['Direction'] = label_encoder.fit_transform(train_data_fit['Direction'])
train_data_fit.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913,1
2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1
2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112,0
2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276,1
2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057,1


In [20]:
label_encoder.classes_

array(['Down', 'Up'], dtype=object)

In [21]:
train_data_fit.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 998 entries, 2001.0 to 2004.0
Data columns (total 7 columns):
Lag1         998 non-null float64
Lag2         998 non-null float64
Lag3         998 non-null float64
Lag4         998 non-null float64
Lag5         998 non-null float64
Volume       998 non-null float64
Direction    998 non-null int64
dtypes: float64(6), int64(1)
memory usage: 62.4 KB


In [22]:
#Converting Direction dtype to category
train_data_fit['Direction'] = train_data_fit['Direction'].astype('category')

In [23]:
train_data_fit.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 998 entries, 2001.0 to 2004.0
Data columns (total 7 columns):
Lag1         998 non-null float64
Lag2         998 non-null float64
Lag3         998 non-null float64
Lag4         998 non-null float64
Lag5         998 non-null float64
Volume       998 non-null float64
Direction    998 non-null category
dtypes: category(1), float64(6)
memory usage: 55.6 KB


In [24]:
#Setting X_train for fitting
x_true_2 = train_data_fit.drop(['Direction'], axis = 1)
x_true_2.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913
2001.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965
2001.0,1.032,0.959,0.381,-0.192,-2.624,1.4112
2001.0,-0.623,1.032,0.959,0.381,-0.192,1.276
2001.0,0.614,-0.623,1.032,0.959,0.381,1.2057


In [25]:
#Settng Y_train for fitting
y_true_2 = train_data_fit[['Direction']]
y_true_2.head()

Unnamed: 0_level_0,Direction
Year,Unnamed: 1_level_1
2001.0,1
2001.0,1
2001.0,0
2001.0,1
2001.0,1


In [26]:
#Logistic Regression on the training data
logreg =  LogisticRegression()
fit_data_2 = logreg.fit(X = x_true_2, y = y_true_2['Direction'])



In [27]:
print('Parameter coefficients: ', fit_data_2.coef_)
print('Intercept: ', fit_data_2.intercept_)
print('Classes: ', fit_data_2.classes_)

Parameter coefficients:  [[-0.05423894 -0.04542453  0.00745773  0.00675954 -0.00405469 -0.09769463]]
Intercept:  [0.16516961]
Classes:  [0 1]


#### Calculating statistics of the fit

In [28]:
newX_2 = pd.DataFrame(np.ones(len(x_true_2)), index = x_true_2.index, columns=['Constant'])
newX_2.head()

Unnamed: 0_level_0,Constant
Year,Unnamed: 1_level_1
2001.0,1.0
2001.0,1.0
2001.0,1.0
2001.0,1.0
2001.0,1.0


In [29]:
newX_2 = pd.concat([newX_2, x_true_2], axis = 1)
newX_2.head()

Unnamed: 0_level_0,Constant,Lag1,Lag2,Lag3,Lag4,Lag5,Volume
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001.0,1.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913
2001.0,1.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965
2001.0,1.0,1.032,0.959,0.381,-0.192,-2.624,1.4112
2001.0,1.0,-0.623,1.032,0.959,0.381,-0.192,1.276
2001.0,1.0,0.614,-0.623,1.032,0.959,0.381,1.2057


In [30]:
newX_2.shape

(998, 7)

In [31]:
#Predicted probabilities: 0: Down, 1: Up
pred_prob_2 = fit_data_2.predict_proba(x_true_2)
pred_prob_2

array([[0.50254295, 0.49745705],
       [0.51142416, 0.48857584],
       [0.51500817, 0.48499183],
       ...,
       [0.48777186, 0.51222814],
       [0.49041986, 0.50958014],
       [0.47845858, 0.52154142]])

In [32]:
#Calculating variance matrix
Variance_2 = np.diagflat(np.prod(pred_prob_2, axis = 1), k = 0)
Variance_2

array([[0.24999353, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.24986949, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.24977475, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.24985047, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.24990822,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.24953597]])

In [33]:
Variance_2.shape

(998, 998)

In [34]:
#Calculating Covariance Matrix of the fit: (X$_{T}$VX)-1
Covariance_matrix_2 = np.linalg.inv(np.dot(np.dot(newX_2.values.transpose(), Variance_2), newX_2.values))
print('Convariance Matrix: \n', Covariance_matrix_2)

Convariance Matrix: 
 [[ 1.11314749e-01  8.81395879e-04 -1.08383518e-03 -1.15484558e-03
  -1.41602966e-03 -6.66332224e-04 -7.84786560e-02]
 [ 8.81395879e-04  2.68140254e-03  5.53805846e-05  1.81469065e-05
  -4.85643701e-06  4.18847333e-06 -6.47964216e-04]
 [-1.08383518e-03  5.53805846e-05  2.68244741e-03  6.74681132e-05
   4.27065432e-05  1.67465771e-05  7.90067282e-04]
 [-1.15484558e-03  1.81469065e-05  6.74681132e-05  2.66684404e-03
   6.86124030e-05  5.92889246e-05  8.47890159e-04]
 [-1.41602966e-03 -4.85643701e-06  4.27065432e-05  6.86124030e-05
   2.67317321e-03  6.84430098e-05  1.04294974e-03]
 [-6.66332224e-04  4.18847333e-06  1.67465771e-05  5.92889246e-05
   6.84430098e-05  2.61480651e-03  4.83384763e-04]
 [-7.84786560e-02 -6.47964216e-04  7.90067282e-04  8.47890159e-04
   1.04294974e-03  4.83384763e-04  5.74003144e-02]]


In [35]:
print('Dimensions: ', Covariance_matrix_2.shape)

Dimensions:  (7, 7)


In [36]:
std_err_2 = np.sqrt(np.diagonal(Covariance_matrix_2))
print('Standard Error of the coefficients: ', std_err_2)

Standard Error of the coefficients:  [0.33363865 0.05178226 0.05179235 0.0516415  0.05170274 0.05113518
 0.23958363]


In [37]:
z_value_2 = np.append(fit_data_2.intercept_, fit_data_2.coef_)/std_err_2
print('Z scores of the fit: ' , z_value_2)

Z scores of the fit:  [ 0.49505538 -1.04744248 -0.87705101  0.14441342  0.13073857 -0.07929349
 -0.40776839]


In [38]:
#assuming t-distribution, probably the right to use. Have to pass absolute values to the cdf function
import scipy.stats as st

p_values_t_distr_2 = 2*(1-st.t.cdf(np.abs(z_value_2),len(newX_2.values)-1))

print('p-values using cdf from t-distribution: ', p_values_t_distr_2)

p-values using cdf from t-distribution:  [0.62067011 0.2951494  0.38067021 0.8852032  0.89600849 0.9368151
 0.68353126]


In [39]:
#Compiling all the derived statistics as a Dataframe
coeff_stats_2 = pd.DataFrame(data=np.transpose([np.append(fit_data_2.intercept_, fit_data_2.coef_),std_err_2, z_value_2, p_values_t_distr_2]),\
                          index = ['Intercept'] + list(x_true_2.columns), columns = ['Estimate', 'Std. Error', 'z value', 'Pr(>|z|)'])

print(coeff_stats_2)

           Estimate  Std. Error   z value  Pr(>|z|)
Intercept  0.165170    0.333639  0.495055  0.620670
Lag1      -0.054239    0.051782 -1.047442  0.295149
Lag2      -0.045425    0.051792 -0.877051  0.380670
Lag3       0.007458    0.051641  0.144413  0.885203
Lag4       0.006760    0.051703  0.130739  0.896008
Lag5      -0.004055    0.051135 -0.079293  0.936815
Volume    -0.097695    0.239584 -0.407768  0.683531


In [40]:
y_true_2.nunique()

Direction    2
dtype: int64

In [41]:
from sklearn.metrics import confusion_matrix

cm_2 =  confusion_matrix(y_true_2['Direction'].values, fit_data_2.predict(x_true_2))

cmdf_2 = pd.DataFrame(cm_2, index = [list(y_true_2.columns)*y_true_2.nunique()[0], label_encoder.classes_],\
                    columns = [['Predicted']*y_true_2.nunique()[0], label_encoder.classes_])
print('Confusion Matrix: \n', cmdf_2)

Confusion Matrix: 
                Predicted     
                    Down   Up
Direction Down       175  316
          Up         155  352


In [42]:
print('Training error rate is {0:.2%}'.format(1 - fit_data_2.score(x_true_2,y_true_2)))

Training error rate is 47.19%


In [43]:
##### Fitting to the test data (2005 data)

In [44]:
test_data = data_df.set_index('Year').loc[2005]

In [45]:
test_data.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005.0,-0.134,0.008,-0.007,0.715,-0.431,0.7869,-0.812,Down
2005.0,-0.812,-0.134,0.008,-0.007,0.715,1.5108,-1.167,Down
2005.0,-1.167,-0.812,-0.134,0.008,-0.007,1.721,-0.363,Down
2005.0,-0.363,-1.167,-0.812,-0.134,0.008,1.7389,0.351,Up
2005.0,0.351,-0.363,-1.167,-0.812,-0.134,1.5691,-0.143,Down


In [46]:
test_data_fit = test_data.drop(columns = ['Today'])
test_data_fit.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005.0,-0.134,0.008,-0.007,0.715,-0.431,0.7869,Down
2005.0,-0.812,-0.134,0.008,-0.007,0.715,1.5108,Down
2005.0,-1.167,-0.812,-0.134,0.008,-0.007,1.721,Down
2005.0,-0.363,-1.167,-0.812,-0.134,0.008,1.7389,Up
2005.0,0.351,-0.363,-1.167,-0.812,-0.134,1.5691,Down


In [47]:
#Encoding Direction. Not really necessary for a binary system
test_data_fit['Direction'] =  label_encoder.fit_transform(test_data_fit['Direction'])
test_data_fit.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005.0,-0.134,0.008,-0.007,0.715,-0.431,0.7869,0
2005.0,-0.812,-0.134,0.008,-0.007,0.715,1.5108,0
2005.0,-1.167,-0.812,-0.134,0.008,-0.007,1.721,0
2005.0,-0.363,-1.167,-0.812,-0.134,0.008,1.7389,1
2005.0,0.351,-0.363,-1.167,-0.812,-0.134,1.5691,0


In [48]:
test_data_fit.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 252 entries, 2005.0 to 2005.0
Data columns (total 7 columns):
Lag1         252 non-null float64
Lag2         252 non-null float64
Lag3         252 non-null float64
Lag4         252 non-null float64
Lag5         252 non-null float64
Volume       252 non-null float64
Direction    252 non-null int64
dtypes: float64(6), int64(1)
memory usage: 15.8 KB


In [49]:
#Final form of test data. Converting Direction to categorical type
test_data_fit['Direction'] = test_data_fit['Direction'].astype('category')
test_data_fit.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 252 entries, 2005.0 to 2005.0
Data columns (total 7 columns):
Lag1         252 non-null float64
Lag2         252 non-null float64
Lag3         252 non-null float64
Lag4         252 non-null float64
Lag5         252 non-null float64
Volume       252 non-null float64
Direction    252 non-null category
dtypes: category(1), float64(6)
memory usage: 14.1 KB


In [50]:
x_test_2 = test_data_fit.drop('Direction', axis = 1)
x_test_2.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2005.0,-0.134,0.008,-0.007,0.715,-0.431,0.7869
2005.0,-0.812,-0.134,0.008,-0.007,0.715,1.5108
2005.0,-1.167,-0.812,-0.134,0.008,-0.007,1.721
2005.0,-0.363,-1.167,-0.812,-0.134,0.008,1.7389
2005.0,0.351,-0.363,-1.167,-0.812,-0.134,1.5691


In [51]:
y_test_2 = test_data_fit[['Direction']]
y_test_2.head()

Unnamed: 0_level_0,Direction
Year,Unnamed: 1_level_1
2005.0,0
2005.0,0
2005.0,0
2005.0,1
2005.0,0


In [52]:
test_cm_2 =  confusion_matrix(y_test_2.values, fit_data_2.predict(x_test_2))

test_cmdf_2 = pd.DataFrame(test_cm_2, index = [list(y_test_2.columns)*y_test_2.nunique()[0], label_encoder.classes_],\
                    columns = [['Predicted']*y_test_2.nunique()[0], label_encoder.classes_])
print('Confusion Matrix: \n', test_cmdf_2)

Confusion Matrix: 
                Predicted    
                    Down  Up
Direction Down        71  40
          Up          89  52


In [53]:
print('Test error rate is {0:.2%}'.format(1 - fit_data_2.score(x_test_2,y_test_2)))
#worse than random guessing in predicting Up.

Test error rate is 51.19%
