Use case: Predicting the close prices of stocks using the Open, High, Low and Volume of each stock.
Dataset source: https://www.kaggle.com/mysarahmadbhat/stock-prices

In [76]:
#Import necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [77]:
# Read original data and set symbol column as the index
data = pd.read_csv('stock_prices.csv', index_col='symbol')
data.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

In [78]:
#define score tester function to test different models of Random Forest Regressor
def score_tester(model, X_train, X_valid, y_train,y_valid):
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    mae_score = mean_absolute_error(y_valid, preds)
    return mae_score

In [79]:
#Split the original dataset into training and testing data
training_data = data.iloc[:246116]
y = training_data.close
test_data = data.iloc[246117:]
#drop 'close' column because that is our target value and drop 'date' column because it is not a continous value to use in predicting
drop_features = ['date', 'close']

In [80]:
#Check which values in training data contain null/NA values
missing_values_in_training_data = training_data.isnull().sum()
print(missing_values_in_training_data)

date      0
open      6
high      3
low       3
close     0
volume    0
dtype: int64


In [81]:
#Check for NA/null values in testing data 
missing_values_in_test_data = test_data.isnull().sum()
print(missing_values_in_test_data)

date      0
open      5
high      5
low       5
close     0
volume    0
dtype: int64


In [82]:
#Drop features/columns not needed/helpful for prediction
training_data = training_data.drop(drop_features, axis=1)

In [83]:
#Split training_data into train data for training model and valid data to check the accuracy of the model
X_train, X_valid, y_train, y_valid = train_test_split(training_data,y, train_size = 0.8, test_size=0.2, random_state=0)

In [84]:
#Use SimpleImputer method to fill in all empty/NA values with mean values
my_imputer = SimpleImputer()
imputated_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputated_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

imputated_X_train.columns = X_train.columns
imputated_X_valid.columns = X_valid.columns

In [85]:
#Creating different RandomForestRegressor models with different parameters. The goal is to see which model is the best to use for prediction.
rfr_model_1 = RandomForestRegressor(n_estimators = 150, random_state = 0)
rfr_model_2 = RandomForestRegressor(max_depth = 15, random_state = 0, criterion='squared_error')
rfr_model_3 = RandomForestRegressor(min_impurity_decrease = 0.6, random_state = 0, criterion='poisson')
rfr_model_4 = RandomForestRegressor(max_features = 'sqrt', random_state = 0)
rfr_model_5 = RandomForestRegressor(n_estimators = 400, random_state = 0)

rfr_models = [rfr_model_1, rfr_model_2, rfr_model_3, rfr_model_4, rfr_model_5]

In [86]:
#Pass each model to the score_tester function with the necessary train and valid data to check the accuracy of each model
count = 1
for model in rfr_models:
    score = score_tester(model, imputated_X_train, imputated_X_valid, y_train, y_valid)
    print("Model ",count,"\t Mean Absolute Error score:", score)
    count +=1

Model  1 	 Mean Absolute Error score: 0.3698874203775933
Model  2 	 Mean Absolute Error score: 0.36608309243757786
Model  3 	 Mean Absolute Error score: 44.0757026459739
Model  4 	 Mean Absolute Error score: 0.3726542569275146
Model  5 	 Mean Absolute Error score: 0.36931374382415216


In [87]:
#Separate the actual 'close' values we will like to predict from the testing data
actual_values = test_data.close
test_data = test_data.drop(drop_features, axis=1)

In [88]:
#Replace all NA/null values in test data with mean values
my_imputer = SimpleImputer()
imputated_test_data = pd.DataFrame(my_imputer.fit_transform(test_data))

imputated_test_data.columns = test_data.columns

In [89]:
#Predict the 'close' values using the test data with replaced NA values
predictions = rfr_model_2.predict(imputated_test_data)

In [90]:
combined = {'Actual values': actual_values, 'predictions': predictions}
output_df = pd.DataFrame(combined)
output_df.to_csv('output.csv', encoding='utf-8')
print(output_df)

        Actual values  predictions
symbol                            
AAPL           105.35   104.103881
AAP            152.24   152.143892
ABBV            57.61    57.000162
ABC            101.87   102.015560
ABT             42.93    42.948886
...               ...          ...
XYL             68.20    68.344624
YUM             81.61    82.060490
ZBH            120.67   121.243403
ZION            50.83    51.124386
ZTS             72.04    72.371621

[251355 rows x 2 columns]


In [91]:
#import pickle library to serialize model object
import pickle

In [92]:
#save the best model to a pickle file
with open('stock_price_model', 'wb') as f:
    pickle.dump(rfr_model_2, f)

In [93]:
#Reload the pickle file as a model object
with open('stock_price_model', 'rb') as f:
    mp = pickle.load(f)

In [94]:
#Test if model makes same prediction after being serialized
mp.predict(imputated_test_data)

array([104.10388105, 152.14389185,  57.00016194, ..., 121.24340284,
        51.12438582,  72.37162085])