In [1]:
import pandas as pd
from pathlib import Path
import matplotlib as plot

%store -r

### Run PrepareData to get all data sets

In [2]:
%run PrepareData2.ipynb

Stored 'tsla_tweets_df' (DataFrame)
Stored 'tsla_stock_values_df' (DataFrame)
Stored 'ta_df' (DataFrame)
Stored 'tsla_sentiments_df' (DataFrame)
Stored 'tsla_sentiments_df_textblob' (DataFrame)


## Create train and test sets

In [3]:
ta_df.head()

Unnamed: 0_level_0,close_value,price_direction,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-03,218.36,1,0.589223,2.354853,-1.765629,4.16743,68.180272,-54044970.0
2015-02-04,218.55,1,1.54253,2.646528,-1.103997,2.140929,68.350684,-53757440.0
2015-02-05,220.99,1,2.466488,2.856388,-0.3899,2.643559,70.533202,-49829420.0
2015-02-06,217.36,0,2.872706,2.610085,0.262621,3.174411,63.515795,-55144660.0
2015-02-07,217.36,1,3.158231,2.316488,0.841743,3.174411,63.515795,-55144660.0


In [4]:
# Creating previous day's technical indicators by shifting collumns down:
df= ta_df.loc[:, ta_df.columns != 'price_direction'].shift(1)
df['price_direction'] = ta_df['price_direction']

df.dropna(inplace= True)

df.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,price_direction
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-04,218.36,0.589223,2.354853,-1.765629,4.16743,68.180272,-54044970.0,1
2015-02-05,218.55,1.54253,2.646528,-1.103997,2.140929,68.350684,-53757440.0,1
2015-02-06,220.99,2.466488,2.856388,-0.3899,2.643559,70.533202,-49829420.0,0
2015-02-07,217.36,2.872706,2.610085,0.262621,3.174411,63.515795,-55144660.0,1
2015-02-08,217.36,3.158231,2.316488,0.841743,3.174411,63.515795,-55144660.0,1


In [5]:
# Separate the data into labels and features

# Separate the y variable, the labels
y= df['price_direction']

# Separate the X variable, the features
X= df.drop('price_direction', axis=1)

In [6]:
y.head()

day_date
2015-02-04    1
2015-02-05    1
2015-02-06    0
2015-02-07    1
2015-02-08    1
Name: price_direction, dtype: int64

In [7]:
X.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-04,218.36,0.589223,2.354853,-1.765629,4.16743,68.180272,-54044970.0
2015-02-05,218.55,1.54253,2.646528,-1.103997,2.140929,68.350684,-53757440.0
2015-02-06,220.99,2.466488,2.856388,-0.3899,2.643559,70.533202,-49829420.0
2015-02-07,217.36,2.872706,2.610085,0.262621,3.174411,63.515795,-55144660.0
2015-02-08,217.36,3.158231,2.316488,0.841743,3.174411,63.515795,-55144660.0


In [8]:
# Check the balance of our target values
y.value_counts()

1    1240
0     655
Name: price_direction, dtype: int64

In [9]:
from pandas.tseries.offsets import DateOffset
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2015-02-04 00:00:00


In [10]:
# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)

# Display the training end date
print(training_end)

2015-05-04 00:00:00


In [11]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Review the X_train DataFrame
X_train.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-04,218.36,0.589223,2.354853,-1.765629,4.16743,68.180272,-54044970.0
2015-02-05,218.55,1.54253,2.646528,-1.103997,2.140929,68.350684,-53757440.0
2015-02-06,220.99,2.466488,2.856388,-0.3899,2.643559,70.533202,-49829420.0
2015-02-07,217.36,2.872706,2.610085,0.262621,3.174411,63.515795,-55144660.0
2015-02-08,217.36,3.158231,2.316488,0.841743,3.174411,63.515795,-55144660.0


In [12]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end+DateOffset(hours=1):]
y_test = y.loc[training_end+DateOffset(hours=1):]

# Review the X_test DataFrame
X_test.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-05-05,230.51,6.799641,0.323449,6.476192,3.305713,71.735993,-4345468.0
2015-05-06,232.95,6.98401,0.406254,6.577756,4.451599,73.813015,1775683.0
2015-05-07,230.43,6.847842,0.216069,6.631773,2.721,68.235829,-3723830.0
2015-05-08,236.8,7.171269,0.431597,6.739672,7.276182,73.654701,22371810.0
2015-05-09,236.61,7.327785,0.47049,6.857295,1.990575,73.253282,21997470.0


## Using ML model to predict price movement

In [13]:
from sklearn.preprocessing import StandardScaler

# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
y_pred = rf_model.predict(X_test_scaled)

In [15]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.5047608773529604

In [16]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, y_pred)

array([[333, 291],
       [619, 562]], dtype=int64)

In [17]:
# Print the classification report for the model
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.35      0.53      0.42       624
           1       0.66      0.48      0.55      1181

    accuracy                           0.50      1805
   macro avg       0.50      0.50      0.49      1805
weighted avg       0.55      0.50      0.51      1805



### Model Evaluation
* Random Forest model has average performance with only 50% overall accuracy.  
* It does not predict price fall well ('0') with only 35% precision. 
* The model performs better when predicting price increases. However, this model has low recall, suggesting it only picks up about half of the correct instances.


## Adding Vader Sentimental Score and run ML again

In [18]:
tsla_sentiments_df.head()

Unnamed: 0_level_0,polarityScore
Date,Unnamed: 1_level_1
2015-01-01,-0.2732
2015-01-02,0.10803
2015-01-03,0.113978
2015-01-04,0.119152
2015-01-05,0.1789


In [19]:
# Shift polarityScore down 1 row to make polarityScore from the previous day
tsla_sentiments_df= tsla_sentiments_df.shift(1)
tsla_sentiments_df.head()

Unnamed: 0_level_0,polarityScore
Date,Unnamed: 1_level_1
2015-01-01,
2015-01-02,-0.2732
2015-01-03,0.10803
2015-01-04,0.113978
2015-01-05,0.119152


In [20]:
#Concated tsla_sentiments_df in to the main df
df2= pd.concat([df,tsla_sentiments_df], axis=1)
df2= df2.dropna()
#Separate the y variable (the labels), and X variable (the features)
y= df2['price_direction']
X= df2.drop('price_direction', axis=1)

X.tail()

Unnamed: 0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,polarityScore
2019-12-27,430.94,23.0892,4.646278,18.442922,1.654523,90.031012,683669900.0,0.135693
2019-12-28,430.38,23.460335,4.013931,19.446405,2.137646,89.245957,682376100.0,0.120802
2019-12-29,430.38,23.483757,3.229882,20.253875,2.137646,89.245957,682376100.0,0.113058
2019-12-30,430.38,23.234486,2.384488,20.849997,2.137646,89.245957,682376100.0,0.103
2019-12-31,414.7,21.523581,0.538867,20.984714,4.760333,68.390651,636465900.0,0.055549


In [21]:
# Rerun ML randomforest model on the new data set

# Create train and test sets
training_begin = X.index.min()
training_end = X.index.min() + DateOffset(months=3)

X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

X_test = X.loc[training_end:]
y_test = y.loc[training_end:]

# Scaling data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model = rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

In [22]:
# Print the classification report for the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.32      0.44      0.37       578
         1.0       0.65      0.52      0.58      1125

    accuracy                           0.50      1703
   macro avg       0.49      0.48      0.48      1703
weighted avg       0.54      0.50      0.51      1703



### Model Evaluation

* We tried to improve the model by incorporating the Vader Sentimental score from daily tweets about Tesla stock from social media platform Twitter, which represents a proxy of market sentiment toward the stock.
* The model does not improve, but performs slightly worse overall.
* The only upside is that it has a higher recall score when predicting price increase, meaning it picks up more correct price increase instances.


## Adding Textblob Sentimental Scores and run ML again

In [23]:
tsla_sentiments_df_textblob.head()

Unnamed: 0_level_0,Subjectivity,Polarity
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,0.4,-0.15
2015-01-02,0.406539,0.180581
2015-01-03,0.210845,0.046556
2015-01-04,0.201275,0.076732
2015-01-05,0.078016,0.031528


In [24]:
# Shift Polarity and Subjectivity Scores down 1 row to make polarityScore from the previous day
tsla_sentiments_df_textblob= tsla_sentiments_df_textblob.shift(1)

#Concated tsla_sentiments_df_textblob into the main df
df3= pd.concat([df,tsla_sentiments_df_textblob], axis=1)
df3= df3.dropna()

#Separate the y variable (the labels), and X variable (the features)
y= df3['price_direction']
X= df3.drop('price_direction', axis=1)

X.tail()

Unnamed: 0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,HL_PCT,RSI,PVT,Subjectivity,Polarity
2019-12-27,430.94,23.0892,4.646278,18.442922,1.654523,90.031012,683669900.0,0.394294,0.111125
2019-12-28,430.38,23.460335,4.013931,19.446405,2.137646,89.245957,682376100.0,0.38415,0.107523
2019-12-29,430.38,23.483757,3.229882,20.253875,2.137646,89.245957,682376100.0,0.386012,0.07987
2019-12-30,430.38,23.234486,2.384488,20.849997,2.137646,89.245957,682376100.0,0.382562,0.072515
2019-12-31,414.7,21.523581,0.538867,20.984714,4.760333,68.390651,636465900.0,0.392603,0.080258


In [25]:
# Rerun ML randomforest model on the new data set

# Create train and test sets
training_begin = X.index.min()
training_end = X.index.min() + DateOffset(months=3)

X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

X_test = X.loc[training_end:]
y_test = y.loc[training_end:]

# Scaling data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model = rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

In [26]:
# Print the classification report for the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.35      0.15      0.21       578
         1.0       0.66      0.86      0.75      1125

    accuracy                           0.62      1703
   macro avg       0.51      0.50      0.48      1703
weighted avg       0.56      0.62      0.57      1703



### Model Evaluation

* We then used another sentimental analysis tool textblob, and incorporated both sentiment and polarity score in the features data to predict price direction.
* While we only achieve slight improvement overall and have a very low recall score to predicting price fall, this significantly increases recall in predicting price increase, picking up 86% of the actual price increase instances.
* Overall, this model is good for fund managers with a long-only mandate, who are not going to short the stock.


## Using Gradient Boosting Classifier with Textblob Sentimental Scores

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a random forest classifier
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=1)
gb_model = gb_model.fit(X_train_scaled, y_train)
y_pred_gb = gb_model.predict(X_test_scaled)

In [29]:
# Print the classification report for the model
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

         0.0       0.35      0.26      0.30       578
         1.0       0.67      0.75      0.71      1125

    accuracy                           0.59      1703
   macro avg       0.51      0.51      0.50      1703
weighted avg       0.56      0.59      0.57      1703



### Model Evaluation

* We changed the ML algorithm to Gradient Boost, on the same data set with Textblob sentiment scores.
* However, this did not significantly improve model performance. 
* The recall of price fall improved, but the recall for price increase suffered. This trade-off probably suggests the limitation of the bagging method and the data used.
