In [90]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix,classification_report

In [69]:
d = pd.read_csv("/home/sandeep/Documents/datasets-master/Smarket.csv", parse_dates = True)

In [70]:
d.dtypes

Year           int64
Lag1         float64
Lag2         float64
Lag3         float64
Lag4         float64
Lag5         float64
Volume       float64
Today        float64
Direction     object
dtype: object

In [71]:
d.head(5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [72]:
d.tail(5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.8885,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.13,Up
1248,2005,0.13,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down
1249,2005,-0.298,0.13,-0.955,0.043,0.422,1.38254,-0.489,Down


In [15]:
# Let's train the data upto 2004  and test it on 2005.
# For that we need to give that index value (upto where 2004 ends) for train and test (where 2005 begins).
# In the last stage we will find the accuracy of the test by creating a confusion matrix.

In [93]:
d[:1000].tail(6)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
994,2004,-0.431,0.046,0.342,0.904,0.038,0.922,0.715,Up
995,2004,0.715,-0.431,0.046,0.342,0.904,0.983,-0.007,Down
996,2004,-0.007,0.715,-0.431,0.046,0.342,0.9259,0.008,Up
997,2004,0.008,-0.007,0.715,-0.431,0.046,0.8298,-0.134,Down
998,2005,-0.134,0.008,-0.007,0.715,-0.431,0.7869,-0.812,Down
999,2005,-0.812,-0.134,0.008,-0.007,0.715,1.5108,-1.167,Down


In [94]:
# splitting training data

x_train = d[:998][['Lag1','Lag2','Lag3','Lag4','Lag5', 'Volume']]
y_train = d[:998]['Direction']     # dependent variable

In [95]:
print(x_train.head(3))
print(x_train.tail(3))
print("\n")
print(y_train.head(3))

    Lag1   Lag2   Lag3   Lag4   Lag5  Volume
0  0.381 -0.192 -2.624 -1.055  5.010  1.1913
1  0.959  0.381 -0.192 -2.624 -1.055  1.2965
2  1.032  0.959  0.381 -0.192 -2.624  1.4112
      Lag1   Lag2   Lag3   Lag4   Lag5  Volume
995  0.715 -0.431  0.046  0.342  0.904  0.9830
996 -0.007  0.715 -0.431  0.046  0.342  0.9259
997  0.008 -0.007  0.715 -0.431  0.046  0.8298


0      Up
1      Up
2    Down
Name: Direction, dtype: object


In [96]:
# splitting testing data

x_test = d[998:][['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']]
y_test = d[998:]['Direction']      # dependent variable

In [109]:
print(x_test.head(3))
print("\n")
print(y_test.head(3))

       Lag1   Lag2   Lag3   Lag4   Lag5  Volume
998  -0.134  0.008 -0.007  0.715 -0.431  0.7869
999  -0.812 -0.134  0.008 -0.007  0.715  1.5108
1000 -1.167 -0.812 -0.134  0.008 -0.007  1.7210


998     Down
999     Down
1000    Down
Name: Direction, dtype: object


In [98]:
 # Fitting the model using LinearDiscriminantAnalysis() function.
model = LinearDiscriminantAnalysis().fit(x_train, y_train)
print(model)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)


In [99]:
# printing the priors
print(model.priors_)

[0.49198397 0.50801603]


In [30]:
# This indicates that in 49.19% case the market trend is 'Down' and the rest 'Up'.

In [100]:
# Then we will see how the prediction works.

In [101]:
lda_pred = model.predict(x_test)              
print(np.unique(lda_pred, return_counts=True))      #return the number of times each unique item appears in lda_pred.

(array(['Down', 'Up'], dtype='<U4'), array([174,  78]))


In [102]:
# We can see that the model has predicted 67 'Down' and 185 'Up' direction values.

In [103]:
# For detailed classification report.

print(confusion_matrix(lda_pred, y_test))
print(classification_report(y_test, lda_pred))

[[77 97]
 [34 44]]
              precision    recall  f1-score   support

        Down       0.44      0.69      0.54       111
          Up       0.56      0.31      0.40       141

    accuracy                           0.48       252
   macro avg       0.50      0.50      0.47       252
weighted avg       0.51      0.48      0.46       252



In [104]:
# Hence we have analyzed the stock market trend over the years 2001 to 2005.