In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [20]:
combo_df = pd.read_csv('combo_df.csv')
stock_list = pd.read_csv('list_of_stocks.csv')

# clean up the main combo_df index
combo_df = combo_df.rename(columns={'Unnamed: 0':'Date'})
combo_df = combo_df.set_index('Date')

# clean up stock_list index (each row has a unique ticker)
stock_list = stock_list.set_index('Ticker')

# pull 'sector' into the combo_df
combo_df['Sector'] = [stock_list.loc[ticker]['Sector'] for ticker in combo_df['Ticker']]

In [21]:
# dummify the tickers
# combo_df = pd.get_dummies(combo_df, columns=['Ticker'])
# combo_df.head()

In [22]:
combo_df.index

Index(['2021-03-31', '2020-12-31', '2020-09-30', '2020-06-30', '2020-03-31',
       '2019-12-31', '2019-09-30', '2019-06-30', '2019-03-31', '2018-12-31',
       ...
       '2014-03-31', '2013-12-31', '2013-09-30', '2013-06-30', '2013-03-31',
       '2012-12-31', '2012-09-30', '2012-06-30', '2012-03-31', '2011-12-31'],
      dtype='object', name='Date', length=104423)

In [26]:
combo_df['Ticker'].nunique()

2889

In [23]:
start_time = time.time()
# how close can I 'predict' revenues?

X = combo_df.drop(columns=['Sector','Ticker'])
y = combo_df['Sector']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.5)

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

print(f"This code took {round(time.time() - start_time)} seconds to run")

This code took 0 seconds to run


In [24]:
start_time = time.time()

#instantiate and fit the Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train_sc,y_train)

print(f"Random Forest train score: {rf.score(X_train_sc,y_train)}")
print(f"Random Forest test score: {rf.score(X_test_sc,y_test)}")

print(f"This code took {round(time.time() - start_time)} seconds to run")

Random Forest train score: 0.9891210664419375
Random Forest test score: 0.9122423963839731
This code took 23 seconds to run


In [27]:
y_preds = rf.predict(X_test_sc)

In [28]:
y_train_preds =rf.predict(X_train_sc)

In [29]:
pd.DataFrame([y_train_preds,y_train]).T

Unnamed: 0,0,1
0,Industrials,Industrials
1,Financials,Financials
2,Health Care,Health Care
3,Materials,Materials
4,Industrials,Industrials
...,...,...
52206,Consumer Discretionary,Consumer Discretionary
52207,Information Technology,Information Technology
52208,Consumer Staples,Consumer Staples
52209,Financials,Financials


In [30]:
y_experiment = pd.DataFrame([y_preds,y_test]).T
y_experiment

Unnamed: 0,0,1
0,Industrials,Industrials
1,Health Care,Health Care
2,Materials,Materials
3,Utilities,Utilities
4,Consumer Staples,Consumer Staples
...,...,...
52207,Financials,Financials
52208,Financials,Financials
52209,Industrials,Materials
52210,Utilities,Utilities


In [32]:
y_experiment[y_experiment[0]!=y_experiment[1]]

Unnamed: 0,0,1
9,Information Technology,Communication
13,Health Care,Information Technology
25,Health Care,Information Technology
47,Health Care,Real Estate
68,Industrials,Materials
...,...,...
52186,Consumer Discretionary,Energy
52189,Information Technology,Health Care
52190,Industrials,Health Care
52203,Financials,Consumer Staples


In [33]:
y_experiment[y_experiment[0]==y_experiment[1]]

Unnamed: 0,0,1
0,Industrials,Industrials
1,Health Care,Health Care
2,Materials,Materials
3,Utilities,Utilities
4,Consumer Staples,Consumer Staples
...,...,...
52206,Real Estate,Real Estate
52207,Financials,Financials
52208,Financials,Financials
52210,Utilities,Utilities


In [34]:
combo_df['Sector'].value_counts()

Financials                19571
Industrials               16352
Information Technology    15257
Health Care               14775
Consumer Discretionary    13026
Materials                  4959
Real Estate                4724
Energy                     4671
Communication              4382
Consumer Staples           4196
Utilities                  2510
Name: Sector, dtype: int64

In [10]:
start_time = time.time()

#instantiate and fit the Random Forest model
rf = RandomForestRegressor()
rf.fit(X_train_sc,y_train)

print(f"Random Forest train score: {rf.score(X_train_sc,y_train)}")
print(f"Random Forest test score: {rf.score(X_test_sc,y_test)}")

print(f"This code took {round(time.time() - start_time)} seconds to run")

Random Forest train score: 0.9988245366646041
Random Forest test score: 0.9921332119420159
This code took 1058 seconds to run


In [11]:
start_time = time.time()

y_preds = rf.predict(X_test_sc)
print(f"RMSE of {round(np.sqrt(mean_squared_error(y_test,y_preds))/1000000000, 2)} billion")

print(f"This code took {round(time.time() - start_time)} seconds to run")

RMSE of 1.69 billion
This code took 1 seconds to run
