In [1]:
import pandas as pd
from pathlib import Path
import matplotlib as plot

# Reload the variables previously persisted with IPython's %store magic into this kernel.
%store -r

### Run PrepareData to get all data sets

In [2]:
# Execute the PrepareData notebook; per the recorded output it stores the tweet,
# stock-value, technical-indicator and two sentiment DataFrames used below.
%run PrepareData.ipynb

Stored 'tsla_tweets_df' (DataFrame)
Stored 'tsla_stock_values_df' (DataFrame)
Stored 'ta_df' (DataFrame)
Stored 'tsla_sentiments_df' (DataFrame)
Stored 'tsla_sentiments_df_textblob' (DataFrame)


## Create train and test sets

In [3]:
# Preview the first five rows of the technical-indicator frame
ta_df.head(5)

Unnamed: 0_level_0,close_value,Adj Close,price_direction,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-20,14.474,14.474,1,0.168103,-0.077667,0.245769,3.588046,58.709798,-1538395000.0
2015-02-23,13.822667,13.822667,0,0.139959,-0.084648,0.224607,5.724902,47.638976,-2112134000.0
2015-02-24,13.607333,13.607333,0,0.099137,-0.100376,0.199513,2.738717,44.641846,-2266444000.0
2015-02-25,13.584,13.584,0,0.064163,-0.108281,0.172443,2.237926,44.316488,-2276500000.0
2015-02-26,13.812667,13.812667,1,0.054271,-0.094538,0.148809,4.281102,48.293631,-2113056000.0


In [4]:
# Build the modelling frame: every feature column is shifted down one row so that
# each date carries the previous row's indicators, while the label stays on its own date.
lagged_features = ta_df.drop(columns='price_direction').shift(1)

# Re-attach the unshifted target, then discard the rows left incomplete by the shift
# (the first row has no predecessor to take values from).
df = lagged_features.assign(price_direction=ta_df['price_direction']).dropna()

df.head()

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,price_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-23,14.474,14.474,0.168103,-0.077667,0.245769,3.588046,58.709798,-1538395000.0,0
2015-02-24,13.822667,13.822667,0.139959,-0.084648,0.224607,5.724902,47.638976,-2112134000.0,0
2015-02-25,13.607333,13.607333,0.099137,-0.100376,0.199513,2.738717,44.641846,-2266444000.0,0
2015-02-26,13.584,13.584,0.064163,-0.108281,0.172443,2.237926,44.316488,-2276500000.0,1
2015-02-27,13.812667,13.812667,0.054271,-0.094538,0.148809,4.281102,48.293631,-2113056000.0,0


In [5]:
# Split the lagged frame into model inputs and target

# Features (X): every column except the label
X = df.drop(columns='price_direction')

# Labels (y): the price-direction column
y = df['price_direction']

In [6]:
# First five labels
y.head(5)

Date
2015-02-23    0
2015-02-24    0
2015-02-25    0
2015-02-26    1
2015-02-27    0
Name: price_direction, dtype: int64

In [7]:
# First five feature rows
X.head(5)

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-23,14.474,14.474,0.168103,-0.077667,0.245769,3.588046,58.709798,-1538395000.0
2015-02-24,13.822667,13.822667,0.139959,-0.084648,0.224607,5.724902,47.638976,-2112134000.0
2015-02-25,13.607333,13.607333,0.099137,-0.100376,0.199513,2.738717,44.641846,-2266444000.0
2015-02-26,13.584,13.584,0.064163,-0.108281,0.172443,2.237926,44.316488,-2276500000.0
2015-02-27,13.812667,13.812667,0.054271,-0.094538,0.148809,4.281102,48.293631,-2113056000.0


In [8]:
# Class balance: number of rows per label, most frequent first
y.value_counts(sort=True, ascending=False)

1    676
0    651
Name: price_direction, dtype: int64

In [9]:
from pandas.tseries.offsets import DateOffset

# The training window opens at the earliest date present in the feature index
training_begin = X.index.min()

# Show the training start date
print(str(training_begin))

2015-02-23 00:00:00


In [10]:
# The training window closes three calendar months after the earliest date
train_span = DateOffset(months=3)
training_end = X.index.min() + train_span

# Show the training end date
print(str(training_end))

2015-05-23 00:00:00


In [11]:
# Training partition: label-based slice from training_begin to training_end
# (.loc slices include both endpoints when they are present in the index)
train_window = slice(training_begin, training_end)
X_train = X.loc[train_window]
y_train = y.loc[train_window]

# Preview the training features
X_train.head()

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-23,14.474,14.474,0.168103,-0.077667,0.245769,3.588046,58.709798,-1538395000.0
2015-02-24,13.822667,13.822667,0.139959,-0.084648,0.224607,5.724902,47.638976,-2112134000.0
2015-02-25,13.607333,13.607333,0.099137,-0.100376,0.199513,2.738717,44.641846,-2266444000.0
2015-02-26,13.584,13.584,0.064163,-0.108281,0.172443,2.237926,44.316488,-2276500000.0
2015-02-27,13.812667,13.812667,0.054271,-0.094538,0.148809,4.281102,48.293631,-2113056000.0


In [12]:
# Test partition: everything after the training window. The start is nudged one
# hour past training_end so the inclusive .loc slice cannot pick up a row
# stamped exactly at training_end.
test_start = training_end + DateOffset(hours=1)
X_test = X.loc[test_start:]
y_test = y.loc[test_start:]

# Preview the test features
X_test.head()

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-05-26,16.515333,16.515333,0.597889,-0.016814,0.614703,1.449163,70.612853,765126100.0
2015-05-27,16.497334,16.497334,0.578869,-0.028667,0.607536,2.222582,70.076266,759406400.0
2015-05-28,16.495333,16.495333,0.557211,-0.04026,0.597471,1.596405,70.012578,758786400.0
2015-05-29,16.763332,16.763332,0.555272,-0.033759,0.589031,2.684435,73.488255,847672900.0
2015-06-01,16.719999,16.719999,0.543968,-0.036051,0.580019,1.371603,72.034436,832980000.0


## Using ML model to predict price movement

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise the features using statistics learned from the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transform to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Baseline model: a 500-tree random forest with a fixed seed, trained on the
# scaled training features
rf_model = RandomForestClassifier(n_estimators=500, random_state=1).fit(
    X_train_scaled, y_train
)

# Predicted labels for the test partition
y_pred = rf_model.predict(X_test_scaled)

In [15]:
from sklearn.metrics import (
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)

# Balanced accuracy of the test-set predictions
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.5085007902461052

In [16]:
# Confusion matrix: rows are the actual labels, columns the predicted labels
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[457, 161],
       [466, 179]], dtype=int64)

In [17]:
# Per-class precision, recall and F1 for the baseline model
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.50      0.74      0.59       618
           1       0.53      0.28      0.36       645

    accuracy                           0.50      1263
   macro avg       0.51      0.51      0.48      1263
weighted avg       0.51      0.50      0.48      1263



### Model Evaluation
* The Random Forest model performs poorly: balanced accuracy is about 51% and overall accuracy is 50%, essentially no better than chance.
* The only upside is its higher recall for price falls (0.74), meaning it correctly identifies most of the days on which the price fell.
* However, its recall for price increases is low (0.28).


## Adding the VADER Sentiment Score and Rerunning the Model

In [18]:
# First five rows of the daily polarity scores
tsla_sentiments_df.head(5)

Unnamed: 0_level_0,polarityScore
Date,Unnamed: 1_level_1
2015-01-01,-0.2732
2015-01-02,0.10803
2015-01-03,0.113978
2015-01-04,0.119152
2015-01-05,0.1789


In [19]:
# Lag the sentiment frame by one row so each date carries the preceding row's polarityScore.
# NOTE: the result is bound back to the same name, so running this cell more than
# once lags the data by an additional row each time.
tsla_sentiments_df = tsla_sentiments_df.shift(periods=1)
tsla_sentiments_df.head()

Unnamed: 0_level_0,polarityScore
Date,Unnamed: 1_level_1
2015-01-01,
2015-01-02,-0.2732
2015-01-03,0.10803
2015-01-04,0.113978
2015-01-05,0.119152


In [20]:
# Align the lagged sentiment score with the lagged indicator frame by date.
# concat(axis=1) keeps the union of both indexes; dropna() then keeps only the
# dates on which every column has a value.
df2 = pd.concat([df, tsla_sentiments_df], axis=1).dropna()

# Labels (y): the union-of-dates concat introduces NaN and thereby upcasts
# price_direction to float; cast it back so the classes read 0/1 exactly as in
# the baseline model.
y = df2['price_direction'].astype(int)

# Features (X): every column except the label
X = df2.drop(columns='price_direction')

X.head()

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,polarityScore
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-23,14.474,14.474,0.168103,-0.077667,0.245769,3.588046,58.709798,-1538395000.0,0.068764
2015-02-24,13.822667,13.822667,0.139959,-0.084648,0.224607,5.724902,47.638976,-2112134000.0,0.110673
2015-02-25,13.607333,13.607333,0.099137,-0.100376,0.199513,2.738717,44.641846,-2266444000.0,0.027055
2015-02-26,13.584,13.584,0.064163,-0.108281,0.172443,2.237926,44.316488,-2276500000.0,0.129564
2015-02-27,13.812667,13.812667,0.054271,-0.094538,0.148809,4.281102,48.293631,-2113056000.0,0.024365


In [21]:
# Rerun the random forest on the feature set that now includes the sentiment column

# Chronological split: the first three months train the model, later rows test it
training_begin = X.index.min()
training_end = training_begin + DateOffset(months=3)

X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# .loc slices include both endpoints, so the test window must start just after
# training_end (the same one-hour nudge the baseline split uses); otherwise a row
# dated exactly training_end would land in both the training and the test set.
test_start = training_end + DateOffset(hours=1)
X_test = X.loc[test_start:]
y_test = y.loc[test_start:]

# Standardise features using statistics learned from the training rows only
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a 500-tree random forest (fixed seed) and predict the test partition
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model = rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

In [22]:
# Per-class precision, recall and F1 for the model with the sentiment feature
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.49      0.73      0.59       572
         1.0       0.51      0.28      0.36       588

    accuracy                           0.50      1160
   macro avg       0.50      0.50      0.47      1160
weighted avg       0.50      0.50      0.47      1160



### Model Evaluation

* We tried to improve the model by incorporating the VADER sentiment score of daily tweets about Tesla stock on Twitter, used as a proxy for market sentiment toward the stock.
* The model does not improve; it performs slightly worse overall.


## Adding TextBlob Sentiment Scores and Rerunning the Model

In [23]:
# First five rows of the daily Subjectivity and Polarity scores
tsla_sentiments_df_textblob.head(5)

Unnamed: 0_level_0,Subjectivity,Polarity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,0.4,-0.15
2015-01-02,0.406539,0.180581
2015-01-03,0.210845,0.046556
2015-01-04,0.201275,0.076732
2015-01-05,0.078016,0.031528


In [24]:
# Lag the Subjectivity and Polarity scores by one row so each date carries the
# preceding row's values. The lagged copy gets its own name, leaving
# tsla_sentiments_df_textblob untouched, so re-running this cell gives the same
# result instead of shifting the data one more row each time.
textblob_lagged_df = tsla_sentiments_df_textblob.shift(1)

# Align the lagged scores with the lagged indicator frame by date; dropna() keeps
# only the dates on which every column has a value.
df3 = pd.concat([df, textblob_lagged_df], axis=1).dropna()

# Labels (y): the concat introduces NaN and thereby upcasts price_direction to
# float; cast it back so the classes read 0/1 exactly as in the baseline model.
y = df3['price_direction'].astype(int)

# Features (X): every column except the label
X = df3.drop(columns='price_direction')

X.tail()

Unnamed: 0_level_0,close_value,Adj Close,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,Subjectivity,Polarity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-12-24,27.948,27.948,1.43091,0.395042,1.035868,2.864852,83.326865,9964267000.0,0.378831,0.092167
2019-12-26,28.35,28.35,1.562356,0.421191,1.141165,3.005295,84.352601,10138050000.0,0.380445,0.111968
2019-12-27,28.729334,28.729334,1.677797,0.429305,1.248492,1.654526,85.273271,10351480000.0,0.394294,0.111125
2019-12-30,28.691999,28.691999,1.746143,0.398121,1.348022,2.137644,84.744767,10332090000.0,0.382562,0.072515
2019-12-31,27.646667,27.646667,1.696403,0.278705,1.417698,4.760067,71.401258,9644257000.0,0.392603,0.080258


In [25]:
# Rerun the random forest on the feature set that now includes the Subjectivity
# and Polarity columns

# Chronological split: the first three months train the model, later rows test it
training_begin = X.index.min()
training_end = training_begin + DateOffset(months=3)

X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# .loc slices include both endpoints, so the test window must start just after
# training_end (the same one-hour nudge the baseline split uses); otherwise a row
# dated exactly training_end would land in both the training and the test set.
test_start = training_end + DateOffset(hours=1)
X_test = X.loc[test_start:]
y_test = y.loc[test_start:]

# Standardise features using statistics learned from the training rows only
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit a 500-tree random forest (fixed seed) and predict the test partition
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model = rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

In [26]:
# Per-class precision, recall and F1 for the model with the Subjectivity/Polarity features
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.51      0.42      0.46       572
         1.0       0.52      0.60      0.56       588

    accuracy                           0.51      1160
   macro avg       0.51      0.51      0.51      1160
weighted avg       0.51      0.51      0.51      1160



### Model Evaluation

* We then used another sentiment analysis tool, TextBlob, and incorporated both its subjectivity and polarity scores into the feature data to predict price direction.
* We achieve only a slight improvement in overall recall.
* Prediction of price increases now has higher recall, at the expense of recall for price falls.


## Using a Gradient Boosting Classifier with TextBlob Sentiment Scores

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier: 200 depth-1 trees with a learning rate of 1.0
# and a fixed seed, trained on the current scaled training features
gb_model = GradientBoostingClassifier(
    n_estimators=200, learning_rate=1.0, max_depth=1, random_state=1
).fit(X_train_scaled, y_train)

# Predicted labels for the test partition
y_pred_gb = gb_model.predict(X_test_scaled)

In [28]:
# Per-class precision, recall and F1 for the gradient boosting model
print(classification_report(y_true=y_test, y_pred=y_pred_gb))

              precision    recall  f1-score   support

         0.0       0.50      0.71      0.59       572
         1.0       0.53      0.31      0.39       588

    accuracy                           0.51      1160
   macro avg       0.51      0.51      0.49      1160
weighted avg       0.51      0.51      0.49      1160



### Model Evaluation

* We changed the ML algorithm to Gradient Boosting, on the same data set with TextBlob sentiment scores. However, this did not significantly improve model performance.
* It improves the recall of price fall predictions, yet reduces the recall of price increase predictions.
* The trade-off in recall and the lack of any significant improvement in precision for either price direction suggest the limitations of tree-ensemble models (both bagging, as in Random Forest, and boosting, as in Gradient Boosting) on this feature set.