In [1]:
import pandas as pd
from pathlib import Path
import matplotlib as plot

In [2]:
# Load the daily price table and use `day_date` as a parsed DatetimeIndex.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (where it has no effect) and was only a parsing-speed hint before.
df = pd.read_csv(
    Path('./Resources/CompanyValues.csv'),
    index_col='day_date',
    parse_dates=True,
)
df.head()

Unnamed: 0_level_0,ticker_symbol,close_value,volume,open_value,high_value,low_value
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-29,AAPL,317.94,38399530,319.25,321.15,316.47
2020-05-28,AAPL,318.25,33449100,316.77,323.44,315.63
2020-05-27,AAPL,318.11,28236270,316.14,318.71,313.09
2020-05-26,AAPL,316.73,31380450,323.5,324.24,316.5
2020-05-22,AAPL,318.89,20450750,315.77,319.23,315.35


In [3]:
# Restrict the table to Tesla rows, order them chronologically (the file is
# stored newest-first) and keep everything from 2015 onward.
# The trailing .copy() makes `df` an independent frame, so the column
# assignments in the following cells never write into a slice of the full
# multi-ticker table.
df = (
    df[df['ticker_symbol'] == 'TSLA']
    .sort_index()
    .loc['2015-01-01':]
    .copy()
)

In [4]:
# Add 'daily_return': the fractional change of the close versus the previous row.
# The first row has no predecessor, so its return is NaN.
df = df.assign(daily_return=df['close_value'].pct_change())

In [5]:
# Binary target 'price_direction': 1 when 'daily_return' >= 0, otherwise 0.
# A flat day (return exactly 0) therefore counts as 1, while the first row,
# whose return is NaN, falls through to 0.
# NOTE(review): rows that merely repeat the previous close (e.g. 2015-01-03 and
# 2015-01-04 in the printed frame) are labelled 1 as well - confirm this is intended.
df['price_direction'] = (df['daily_return'] >= 0).astype('int64')

## Adding technical indicators to TSLA dataframe

In [6]:
# Importing pandas_ta is what makes the `df.ta` accessor used below available.
import pandas_ta as ta

# Calculate MACD values using the pandas_ta library.
# Uses the close price with fast/slow/signal windows of 12/26/9; append=True
# writes the result straight into `df` as the columns MACD_12_26_9,
# MACDh_12_26_9 and MACDs_12_26_9. The earliest rows stay NaN until enough
# history has accumulated.
df.ta.macd(close='close_value', fast=12, slow=26, signal=9, append=True)
# View result
pd.set_option("display.max_columns", None)  # show all columns
print(df)

           ticker_symbol  close_value    volume  open_value  high_value  \
day_date                                                                  
2015-01-01          TSLA       222.41   2392947     223.090      225.68   
2015-01-02          TSLA       219.31   4753239     222.870      223.25   
2015-01-03          TSLA       219.31   4753239     222.870      223.25   
2015-01-04          TSLA       219.31   4753239     222.870      223.25   
2015-01-05          TSLA       210.09   5355485     214.550      216.50   
...                  ...          ...       ...         ...         ...   
2020-05-22          TSLA       816.88   9987475     822.174      831.78   
2020-05-26          TSLA       818.87   8089736     834.500      834.60   
2020-05-27          TSLA       820.23  11549530     820.860      827.71   
2020-05-28          TSLA       805.81   7275774     813.510      824.75   
2020-05-29          TSLA       835.00  11812490     808.750      835.00   

            low_value  d

In [7]:
# Calculate RSI values using the pandas_ta library.
# 14-row window over the close price with scalar=100; the returned series is
# stored in the new 'RSI' column (no append=True here, the assignment does it).
df['RSI'] = df.ta.rsi(close= 'close_value', length= 14, scalar= 100)
# View result
pd.set_option("display.max_columns", None)  # show all columns
print(df)

           ticker_symbol  close_value    volume  open_value  high_value  \
day_date                                                                  
2015-01-01          TSLA       222.41   2392947     223.090      225.68   
2015-01-02          TSLA       219.31   4753239     222.870      223.25   
2015-01-03          TSLA       219.31   4753239     222.870      223.25   
2015-01-04          TSLA       219.31   4753239     222.870      223.25   
2015-01-05          TSLA       210.09   5355485     214.550      216.50   
...                  ...          ...       ...         ...         ...   
2020-05-22          TSLA       816.88   9987475     822.174      831.78   
2020-05-26          TSLA       818.87   8089736     834.500      834.60   
2020-05-27          TSLA       820.23  11549530     820.860      827.71   
2020-05-28          TSLA       805.81   7275774     813.510      824.75   
2020-05-29          TSLA       835.00  11812490     808.750      835.00   

            low_value  d

In [8]:
# Calculate Price-Volume Trend (PVT) values using the pandas_ta library.
# Built from the close price and the traded volume with drift=1; the returned
# series is stored in the new 'PVT' column.
df['PVT'] = df.ta.pvt(close= 'close_value',volume= 'volume',drift= 1)
# View result
pd.set_option("display.max_columns", None)  # show all columns
print(df)

           ticker_symbol  close_value    volume  open_value  high_value  \
day_date                                                                  
2015-01-01          TSLA       222.41   2392947     223.090      225.68   
2015-01-02          TSLA       219.31   4753239     222.870      223.25   
2015-01-03          TSLA       219.31   4753239     222.870      223.25   
2015-01-04          TSLA       219.31   4753239     222.870      223.25   
2015-01-05          TSLA       210.09   5355485     214.550      216.50   
...                  ...          ...       ...         ...         ...   
2020-05-22          TSLA       816.88   9987475     822.174      831.78   
2020-05-26          TSLA       818.87   8089736     834.500      834.60   
2020-05-27          TSLA       820.23  11549530     820.860      827.71   
2020-05-28          TSLA       805.81   7275774     813.510      824.75   
2020-05-29          TSLA       835.00  11812490     808.750      835.00   

            low_value  d

In [9]:
# Keep only the close price, the target and the indicator columns; the ticker
# label, the intermediate return and the remaining raw price/volume fields are
# not used from here on.
df = df.drop(
    columns=[
        'ticker_symbol',
        'daily_return',
        'volume',
        'open_value',
        'high_value',
        'low_value',
    ]
)
df.head()

Unnamed: 0_level_0,close_value,price_direction,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01,222.41,0,,,,,
2015-01-02,219.31,0,,,,,-6625170.0
2015-01-03,219.31,1,,,,,-6625170.0
2015-01-04,219.31,1,,,,,-6625170.0
2015-01-05,210.09,0,,,,,-29140140.0


In [10]:
# Discard every row that still has a missing value (the indicator warm-up
# period at the start of the series), then inspect the most recent rows.
df = df.dropna(how='any')
df.tail(5)

Unnamed: 0_level_0,close_value,price_direction,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-22,816.88,0,37.268723,-3.962823,41.231546,60.098929,3501023000.0
2020-05-26,818.87,1,35.775647,-4.364719,40.140366,60.395145,3502994000.0
2020-05-27,820.23,1,34.306648,-4.666974,38.973623,60.610362,3504912000.0
2020-05-28,805.81,0,31.614451,-5.887337,37.501788,57.069241,3492121000.0
2020-05-29,835.0,1,31.473448,-4.822672,36.29612,61.919353,3534911000.0


In [11]:
# Rows with missing values were already removed in the previous cell, so
# dropping them again would be a no-op. Verify that invariant explicitly and
# look at the first rows that survived the indicator warm-up.
assert not df.isna().any().any(), "unexpected NaN values remain in df"
df.head()

Unnamed: 0_level_0,close_value,price_direction,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-03,218.36,1,0.589223,2.354853,-1.765629,68.180272,-54044970.0
2015-02-04,218.55,1,1.54253,2.646528,-1.103997,68.350684,-53757440.0
2015-02-05,220.99,1,2.466488,2.856388,-0.3899,70.533202,-49829420.0
2015-02-06,217.36,0,2.872706,2.610085,0.262621,63.515795,-55144660.0
2015-02-07,217.36,1,3.158231,2.316488,0.841743,63.515795,-55144660.0


## Create train and test sets

In [12]:
# Split the frame into the feature matrix X and the label vector y.
# NOTE(review): 'price_direction' is derived from the same row's close (via
# daily_return), and X still contains that row's close_value plus indicators
# computed from it - confirm whether the label should instead be the next
# row's direction to avoid look-ahead.

# Features: every column except the target
X = df.drop(columns='price_direction')

# Labels: the binary up/down target
y = df['price_direction']

In [13]:
# Preview the first labels to confirm the target is a 0/1 integer series.
y.head()

day_date
2015-02-03    1
2015-02-04    1
2015-02-05    1
2015-02-06    0
2015-02-07    1
Name: price_direction, dtype: int64

In [14]:
# Preview the first feature rows; 'price_direction' must no longer be present.
X.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-03,218.36,0.589223,2.354853,-1.765629,68.180272,-54044970.0
2015-02-04,218.55,1.54253,2.646528,-1.103997,68.350684,-53757440.0
2015-02-05,220.99,2.466488,2.856388,-0.3899,70.533202,-49829420.0
2015-02-06,217.36,2.872706,2.610085,0.262621,63.515795,-55144660.0
2015-02-07,217.36,3.158231,2.316488,0.841743,63.515795,-55144660.0


In [15]:
# Check the balance of our target values.
# Class 1 (non-negative return) outnumbers class 0 roughly two to one, so plain
# accuracy is a misleading yardstick for the classifier trained below.
y.value_counts()

1    1241
0     655
Name: price_direction, dtype: int64

In [16]:
from pandas.tseries.offsets import DateOffset
# Select the start of the training period:
# the earliest date left in X after the NaN rows were dropped.
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2015-02-03 00:00:00


In [17]:
# Close the training window three calendar months after it opens
# (training_begin is the earliest date in X).
training_end = training_begin + DateOffset(months=3)

# Display the training end date
print(training_end)

2015-05-03 00:00:00


In [18]:
# Build the training split. Label slicing with .loc includes both end points,
# so rows dated exactly training_begin and training_end are part of it.
train_window = slice(training_begin, training_end)
X_train = X.loc[train_window]
y_train = y.loc[train_window]

# Review the X_train DataFrame
X_train.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-03,218.36,0.589223,2.354853,-1.765629,68.180272,-54044970.0
2015-02-04,218.55,1.54253,2.646528,-1.103997,68.350684,-53757440.0
2015-02-05,220.99,2.466488,2.856388,-0.3899,70.533202,-49829420.0
2015-02-06,217.36,2.872706,2.610085,0.262621,63.515795,-55144660.0
2015-02-07,217.36,3.158231,2.316488,0.841743,63.515795,-55144660.0


In [19]:
# Build the test split from everything after the training window.
# .loc slices include their start label, so the start is nudged one hour past
# training_end to keep a row dated exactly training_end out of the test set.
test_begin = training_end + DateOffset(hours=1)
X_test = X.loc[test_begin:]
y_test = y.loc[test_begin:]

# Review the X_test DataFrame
X_test.head()

Unnamed: 0_level_0,close_value,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,RSI,PVT
day_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-05-04,230.51,6.799641,0.323449,6.476192,71.735993,-4345468.0
2015-05-05,232.95,6.98401,0.406254,6.577756,73.813015,1775683.0
2015-05-06,230.43,6.847842,0.216069,6.631773,68.235829,-3723830.0
2015-05-07,236.8,7.171269,0.431597,6.739672,73.654701,22371810.0
2015-05-08,236.61,7.327785,0.47049,6.857295,73.253282,21997470.0


## Using ML model to predict price movement

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training window only
# and then applied unchanged to both splits, so no statistics from the test
# period leak into training.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [26]:
# Peek at the first five scaled training rows (a plain NumPy array, one column per feature).
X_train_scaled[:5]

array([[ 1.12113331,  0.02989709,  1.59493482, -0.67397056,  1.1096895 ,
         1.30469163],
       [ 1.13722465,  0.3160484 ,  1.83156128, -0.43504752,  1.12219506,
         1.30926226],
       [ 1.34387129,  0.59339016,  2.00181464, -0.17717868,  1.28235739,
         1.37170394],
       [ 1.03644206,  0.7153234 ,  1.80199668,  0.05845441,  0.76739053,
         1.28721033],
       [ 1.03644206,  0.80102846,  1.56381057,  0.26758213,  0.76739053,
         1.28721033]])

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Build a 500-tree random forest and train it on the scaled training window;
# random_state=1 pins the randomness so repeated runs give the same model.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1).fit(
    X_train_scaled, y_train
)

# Predict the direction for every row of the scaled test window
y_pred = rf_model.predict(X_test_scaled)

In [22]:
from sklearn.metrics import (
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)

# Balanced accuracy averages the recall of both classes, so the majority
# class cannot dominate the score.
balanced_accuracy_score(y_test, y_pred)

0.5312581348431602

In [23]:
# Generate a confusion matrix for the model.
# Rows are the true classes (0, 1) and columns the predicted classes, so the
# off-diagonal entries are the misclassified test rows.
confusion_matrix(y_test, y_pred)

array([[ 105,  519],
       [ 125, 1057]], dtype=int64)

In [24]:
# Print the classification report for the model:
# per-class precision, recall and F1 on the test split, plus overall accuracy.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.46      0.17      0.25       624
           1       0.67      0.89      0.77      1182

    accuracy                           0.64      1806
   macro avg       0.56      0.53      0.51      1806
weighted avg       0.60      0.64      0.59      1806

