In [1]:
import pandas as pd
from sklearn import tree

In [2]:
five_year=pd.read_csv('../five_year.csv')

In [3]:
# Extract Apple stock
five_year=five_year.loc[five_year['ticker'] == 'AAPL']
five_year.head()
#apple_five_year.shape

Unnamed: 0,name,ticker,open,close,adj_close,low,high,volume,market_signal
2,APPLE INC.,AAPL,79.117142,78.432854,56.11887,77.375717,79.285713,140129500,sell
403,APPLE INC.,AAPL,78.26857,77.442856,55.41053,77.285713,78.524284,88241300,sell
803,APPLE INC.,AAPL,76.709999,75.285713,53.867073,75.118568,76.947144,148583400,sell
1202,APPLE INC.,AAPL,74.571426,74.842857,53.550209,73.599998,75.614288,121039100,buy
1601,APPLE INC.,AAPL,75.601425,75.044289,53.694336,74.464287,75.984283,114676800,sell


In [4]:
# Set features. This will be used as X values.
feature=five_year[["open","close","adj_close","low","high","volume"]]
feature_names=feature.columns

# Target values will come from "market_signal" column
target=five_year["market_signal"]


print(feature.shape,target.shape)

(1423, 6) (1423,)


In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(feature,target,random_state=10,stratify=target)

In [6]:
# Scale the data by using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [7]:
# Decision Tree 
clf=tree.DecisionTreeClassifier()
clf=clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.7303370786516854

In [8]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=200)
rf=rf.fit(X_train,y_train)
rf.score(X_test,y_test)

# Random Forest model is more precise than Decision Tree model. Hyperparameter tuning can be advantageous 
# in creating a model that is better at classification. In the case of a random forest, it may not be 
# necessary, as random forests are already very good at classification. 

0.7668539325842697

In [9]:
# Feature importance, we can see that open price plays the biggest row comparing to other features
sorted(zip(rf.feature_importances_,feature_names),reverse=True)

[(0.21909293979426753, 'open'),
 (0.1859495235221611, 'close'),
 (0.17081534821118843, 'adj_close'),
 (0.14319394188918386, 'low'),
 (0.14295060872658608, 'high'),
 (0.13799763785661298, 'volume')]

In [11]:
# Save the random forest model 
import joblib
filename = '../saved models/DecisionTree_and_RandonForest_5.sav'
joblib.dump(rf, filename)

['../saved models/DecisionTree_and_RandonForest_5.sav']