Project: Stock Prediction

Author: Ryder Davidson

### RANDOM FOREST MODEL

***
***
***

In [183]:
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

#### Historical Ticker Trends
*considering S&P 500, between 02-28-2020 & 03-01-2022*

##### tdf dataframe:
- remove all columns where the number of NaN values is >= 10% total values
- format 'Date' column to be datetime object of the form '%Y/%m/%d'

##### tdf_shift dataframe:
- originally implemented to analyze daily change in stock values
- utilized in COVID-19 analysis (the use of which has since been abandoned)

##### resources:
- utilized yfinance API to produce ticker data
- https://www.datacamp.com/tutorial/random-forests-classifier-python (model)

In [192]:
tdf = pd.read_csv('ticker_data.csv')
tdf = tdf.loc[:, tdf.isna().sum() <= .1 * len(tdf)]

tdf_shift = (tdf.loc[:, tdf.columns != 'Date'] - tdf.loc[:, tdf.columns != 'Date'].shift(1)).fillna(0)
tdf_shift.drop(tdf_shift[(tdf_shift == 0).all(axis=1)].index, inplace=True)
tdf_shift.insert(loc=0, column='Date', value=tdf.loc[1:, 'Date'])
tdf_shift['Date'] = pd.to_datetime(tdf_shift['Date'], format='%Y/%m/%d')
tdf['Date'] = pd.to_datetime(tdf['Date'], format='%Y/%m/%d')
tdf.drop(tdf.tail(1).index, inplace=True)

drop remaining values that contain NaN values for Random Forest model.

produce resultant list of Ticker symbols, saved in variable `tickers`.

In [195]:
tdf.drop(tdf.loc[:, tdf.isna().sum() > 0].columns, inplace=True, axis=1)
tickers = tdf.loc[:, ~tdf.columns.isin(['Date'])].columns
list(tickers)

['A',
 'AAL',
 'AAP',
 'AAPL',
 'ABBV',
 'ABC',
 'ABMD',
 'ABT',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AFL',
 'AIG',
 'AIZ',
 'AJG',
 'AKAM',
 'ALB',
 'ALGN',
 'ALK',
 'ALL',
 'ALLE',
 'AMAT',
 'AMCR',
 'AMD',
 'AME',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'ANET',
 'ANSS',
 'ANTM',
 'AON',
 'AOS',
 'APA',
 'APD',
 'APH',
 'APTV',
 'ARE',
 'ATO',
 'ATVI',
 'AVB',
 'AVGO',
 'AVY',
 'AWK',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BAX',
 'BBWI',
 'BBY',
 'BDX',
 'BEN',
 'BIIB',
 'BIO',
 'BK',
 'BKNG',
 'BKR',
 'BLK',
 'BLL',
 'BMY',
 'BR',
 'BRO',
 'BSX',
 'BWA',
 'BXP',
 'C',
 'CAG',
 'CAH',
 'CAT',
 'CB',
 'CBOE',
 'CBRE',
 'CCI',
 'CCL',
 'CDAY',
 'CDNS',
 'CDW',
 'CE',
 'CERN',
 'CF',
 'CFG',
 'CHD',
 'CHRW',
 'CHTR',
 'CI',
 'CINF',
 'CL',
 'CLX',
 'CMA',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNC',
 'CNP',
 'COF',
 'COO',
 'COP',
 'COST',
 'CPB',
 'CPRT',
 'CRL',
 'CRM',
 'CSCO',
 'CSX',
 'CTAS',
 'CTLT',
 'CTRA',
 'CTSH',
 'CTVA',
 'CTXS',
 'CVS',
 'CV

In [140]:
tdf

Unnamed: 0,Date,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
0,2020-02-28,75.919319,19.049999,128.992371,67.413651,77.082108,81.689293,150.259995,74.436974,175.267776,...,107.980003,58.653034,45.021351,48.611370,75.592781,86.107697,130.237473,210.970001,37.621601,131.724075
1,2020-03-02,80.746162,18.860001,129.002075,73.689919,79.762123,84.440697,155.100006,78.862808,183.963730,...,107.940002,61.994141,47.156895,49.016140,80.430962,89.272202,133.202866,221.710007,38.892921,137.339890
2,2020-03-03,79.120789,17.850000,126.421844,71.349579,78.754868,83.190933,149.179993,75.364662,176.849747,...,102.440002,61.843555,44.898827,46.607288,78.075401,88.095161,128.046936,209.309998,36.632793,135.688736
3,2020-03-04,83.090637,18.530001,128.914764,74.659103,82.514107,87.560219,154.229996,79.326653,183.294052,...,105.660004,65.128204,45.879066,47.229244,83.392509,91.298286,132.074142,216.039993,36.237267,141.996643
4,2020-03-05,79.258705,16.040001,123.114075,72.237381,81.488846,84.256615,154.509995,77.809502,177.179733,...,95.580002,65.175270,43.857315,44.741417,80.597115,87.718887,123.579735,203.759995,33.995987,137.982513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2022-02-23,125.802490,16.639999,195.377487,160.070007,145.523178,139.470001,290.679993,116.784531,312.230286,...,87.190002,64.479729,76.769997,55.419998,88.150002,121.589996,119.009712,399.290009,69.709999,187.059998
502,2022-02-24,127.949112,16.959999,199.358505,162.740005,144.045746,137.860001,306.920013,118.248695,315.231384,...,86.480003,64.380424,75.800003,56.279999,87.699997,121.360001,119.883492,415.329987,66.970001,190.949997
503,2022-02-25,133.520370,17.459999,204.084106,164.850006,148.279755,141.919998,312.299988,121.924049,321.552551,...,87.769997,66.763779,77.839996,58.689999,90.930000,123.720001,123.009712,418.070007,70.989998,194.710007
504,2022-02-28,130.155655,17.250000,203.001968,165.119995,146.524689,142.529999,310.739990,120.141151,315.081787,...,86.519997,66.863083,78.419998,54.139999,88.949997,122.580002,123.485435,413.339996,70.889999,193.649994


### General Algorithmic Approach
FOR EACH ticker IN tickers:

* create Random Forest where: 
  * target = boolean indicateing whether ticker's next-day value increased,
  * features = all tickers' current value,
  * test train split is 30:70,
  * 100 estimators in RandomForestClassifier
* determine predictive accuracy of each model
* sort tickers WRT accuracy metric

The resultant list ranks tickers by the degree to which their next-day increase is a function of the feature set *viz.* the current day's values for all tickers. If there is a tight correlation between a ticker's next-day increase and the feature set, then that particular model will possess a higher predictive accuracy score.

In [157]:
accuracy_dict = {}
top_features_dict = {}
for ticker in tickers:
    X = tdf.loc[:, ~tdf.columns.isin(['Date'])]
    Y = tdf[ticker] < tdf[ticker].shift(-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    accuracy_dict[ticker] = metrics.accuracy_score(Y_test, Y_pred)

In [158]:
feat_imp_dict

{'A': 0.5986842105263158,
 'AAL': 0.5,
 'AAP': 0.3815789473684211,
 'AAPL': 0.5131578947368421,
 'ABBV': 0.4407894736842105,
 'ABC': 0.4605263157894737,
 'ABMD': 0.46710526315789475,
 'ABT': 0.4276315789473684,
 'ACN': 0.5394736842105263,
 'ADBE': 0.46710526315789475,
 'ADI': 0.5394736842105263,
 'ADM': 0.5526315789473685,
 'ADP': 0.5460526315789473,
 'ADSK': 0.5263157894736842,
 'AEE': 0.5263157894736842,
 'AEP': 0.5921052631578947,
 'AES': 0.5,
 'AFL': 0.5,
 'AIG': 0.5328947368421053,
 'AIZ': 0.506578947368421,
 'AJG': 0.4473684210526316,
 'AKAM': 0.4934210526315789,
 'ALB': 0.4473684210526316,
 'ALGN': 0.5131578947368421,
 'ALK': 0.4934210526315789,
 'ALL': 0.506578947368421,
 'ALLE': 0.5328947368421053,
 'AMAT': 0.5592105263157895,
 'AMCR': 0.48026315789473684,
 'AMD': 0.4934210526315789,
 'AME': 0.5131578947368421,
 'AMGN': 0.5394736842105263,
 'AMP': 0.5131578947368421,
 'AMT': 0.506578947368421,
 'AMZN': 0.5131578947368421,
 'ANET': 0.5,
 'ANSS': 0.5197368421052632,
 'ANTM': 0.5

In [159]:
sorted_accuracy = sorted(feat_imp_dict.items(), key=lambda x: x[1], reverse=True)
sorted_accuracy

[('MA', 0.618421052631579),
 ('AXP', 0.6118421052631579),
 ('TJX', 0.6052631578947368),
 ('A', 0.5986842105263158),
 ('BIO', 0.5986842105263158),
 ('COST', 0.5986842105263158),
 ('XOM', 0.5986842105263158),
 ('AEP', 0.5921052631578947),
 ('GNRC', 0.5855263157894737),
 ('PTC', 0.5855263157894737),
 ('SNA', 0.5855263157894737),
 ('TAP', 0.5855263157894737),
 ('BAC', 0.5789473684210527),
 ('GOOG', 0.5789473684210527),
 ('IP', 0.5789473684210527),
 ('JPM', 0.5789473684210527),
 ('NEM', 0.5789473684210527),
 ('NOW', 0.5789473684210527),
 ('NTRS', 0.5789473684210527),
 ('NXPI', 0.5789473684210527),
 ('PFE', 0.5789473684210527),
 ('PGR', 0.5789473684210527),
 ('ROL', 0.5789473684210527),
 ('WHR', 0.5789473684210527),
 ('BLK', 0.5723684210526315),
 ('BXP', 0.5723684210526315),
 ('HII', 0.5723684210526315),
 ('JCI', 0.5723684210526315),
 ('LNC', 0.5723684210526315),
 ('MTD', 0.5723684210526315),
 ('NTAP', 0.5723684210526315),
 ('PPG', 0.5723684210526315),
 ('PSX', 0.5723684210526315),
 ('UAA', 

`top_tickers` is the 20 tickers with the highest predictive accuracy scores.

In [160]:
top_tickers = [x[0] for x in sorted_accuracy[:20]]
top_tickers

['MA',
 'AXP',
 'TJX',
 'A',
 'BIO',
 'COST',
 'XOM',
 'AEP',
 'GNRC',
 'PTC',
 'SNA',
 'TAP',
 'BAC',
 'GOOG',
 'IP',
 'JPM',
 'NEM',
 'NOW',
 'NTRS',
 'NXPI']

### General Algorithmic Approach
FOR EACH ticker IN top_tickers:

* create Random Forest, where: 
  * target = boolean indicateing whether ticker's next-day value increased,
  * features = all tickers' current value,
  * test train split is 30:70,
  * 100 estimators in RandomForestClassifier
* use this model to determine `temp_feat`, where:
  * `temp_feat` is a list of which features impacted the target, ranked high to low
* repeat step one of this algorithm using the top 20 of `temp_feat` as the feature set

This approach is similar to the first algorithm, however, for each ticker the algorithm determines which features are most pertinent to the predictive accuracy and then fits a model using only those (top 20) features for each target (*i.e.* each `ticker`'s next-day increase).

This refinement – *viz.* only considering the top 20th most pertinent features for those targets which yielded a high predictive accuracy – yeilded an average of 5% increase in predictive accuracy for each model.

In [175]:
for ticker in top_tickers:
    X = tdf.loc[:, ~tdf.columns.isin(['Date'])]
    Y = tdf[ticker] < tdf[ticker].shift(-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    
    temp_feat = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
    
    X = tdf.loc[:, temp_feat.head(10).index]
    Y = tdf[ticker] < tdf[ticker].shift(-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    top_features_dict[ticker] = metrics.accuracy_score(Y_test, Y_pred)

top_features_dict

{'MA': 0.6447368421052632,
 'AXP': 0.6381578947368421,
 'TJX': 0.5394736842105263,
 'A': 0.6578947368421053,
 'BIO': 0.618421052631579,
 'COST': 0.625,
 'XOM': 0.5855263157894737,
 'AEP': 0.6052631578947368,
 'GNRC': 0.6513157894736842,
 'PTC': 0.5394736842105263,
 'SNA': 0.5526315789473685,
 'TAP': 0.48026315789473684,
 'BAC': 0.5328947368421053,
 'GOOG': 0.6973684210526315,
 'IP': 0.5526315789473685,
 'JPM': 0.5723684210526315,
 'NEM': 0.5526315789473685,
 'NOW': 0.618421052631579,
 'NTRS': 0.5657894736842105,
 'NXPI': 0.6578947368421053}

In [176]:
sorted_accuracy = sorted(top_features_dict.items(), key=lambda x: x[1], reverse=True)
sorted_accuracy

[('GOOG', 0.6973684210526315),
 ('A', 0.6578947368421053),
 ('NXPI', 0.6578947368421053),
 ('GNRC', 0.6513157894736842),
 ('MA', 0.6447368421052632),
 ('AXP', 0.6381578947368421),
 ('COST', 0.625),
 ('BIO', 0.618421052631579),
 ('NOW', 0.618421052631579),
 ('AEP', 0.6052631578947368),
 ('XOM', 0.5855263157894737),
 ('JPM', 0.5723684210526315),
 ('NTRS', 0.5657894736842105),
 ('SNA', 0.5526315789473685),
 ('IP', 0.5526315789473685),
 ('NEM', 0.5526315789473685),
 ('TJX', 0.5394736842105263),
 ('PTC', 0.5394736842105263),
 ('BAC', 0.5328947368421053),
 ('TAP', 0.48026315789473684)]

Given the aforementioned algorithmic refinement, consider those top 5 tickers which yielded the highest predictive accuracy, and base trades on those. The Random Forest models indicated that GOOG, A, NXPI, GNRC, and MA have the greatest predictive accuracy.

In [198]:
top_tickers = [x[0] for x in sorted_accuracy[:5]]
top_tickers

['GOOG', 'A', 'NXPI', 'GNRC', 'MA']

GOOG, A, NXPI, GNRC, MA (5 shares each)

In [196]:
trade1_tickers = yf.download(list(tickers), start="2022-05-13", end="2022-05-14", interval="1d")['Adj Close']

[*********************100%***********************]  496 of 496 completed

2 Failed downloads:
- DISCK: No data found for this date range, symbol may be delisted
- PBCT: No data found for this date range, symbol may be delisted


In [212]:
trade1_tickers
temp_col = trade1_tickers.loc[:, ~trade1_tickers.columns.isin(['Date'])]
temp_col.fillna(0)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-05-13,119.379997,16.58,211.210007,147.110001,153.5,152.889999,254.160004,109.879997,286.769989,405.450012,...,66.269997,74.489998,88.860001,38.23,83.910004,112.830002,115.07,336.859985,53.560001,165.330002


In [217]:
daily_prediction = {}
for ticker in top_tickers:
    X = tdf.loc[:, temp_col.columns]
    Y = tdf[ticker] < tdf[ticker].shift(-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    
    temp_feat = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
    
    X = tdf.loc[:, temp_feat.head(10).index]
    Y = tdf[ticker] < tdf[ticker].shift(-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(trade1_tickers.loc[:, temp_feat.head(10).index])
    daily_prediction[ticker] = Y_pred.mean()

In [218]:
daily_prediction

{'GOOG': 1.0, 'A': 0.0, 'NXPI': 0.0, 'GNRC': 1.0, 'MA': 0.0}

### NEURAL NETWORK MODEL
***
***
***

In [4]:
import datetime as dt
import yfinance as yf

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from datetime import date

In [5]:
ticker_classifications = {}
sptickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sectors = sptickers['GICS Sector'].unique()
for sector in sectors:
    ticker_classifications[sector] = sptickers.loc[sptickers['GICS Sector'] == sector]['Symbol'].unique()
sectors

array(['Industrials', 'Health Care', 'Information Technology',
       'Communication Services', 'Consumer Staples',
       'Consumer Discretionary', 'Utilities', 'Financials', 'Materials',
       'Real Estate', 'Energy'], dtype=object)

In [7]:
end_day = str(date.today()+dt.timedelta(0))
start_day = str(date.today()-dt.timedelta(10))
energy_finance = yf.download(list(ticker_classifications['Energy']), start=start_day, end=end_day, interval='5m')['Open']
materials_finance = yf.download(list(ticker_classifications['Materials']), start=start_day, end=end_day, interval='5m')['Open']

[*********************100%***********************]  21 of 21 completed
[*********************100%***********************]  28 of 28 completed


In [40]:
shift_energy = (energy_finance.shift(-1)-energy_finance)
shift_energy.drop(shift_energy.tail(3).index, inplace=True)
shift_energy = shift_energy.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())))


shift_materials = materials_finance.shift(-2) >= materials_finance.shift(-1)
shift_materials.drop(shift_materials.tail(2).index, inplace=True)


print(shift_materials.head())
print(shift_energy.head())


report = {}
features = shift_energy.columns
target = shift_materials.columns[0]

X = shift_energy[features].values
y = shift_materials[target].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 50)

mlp = MLPClassifier(hidden_layer_sizes=(32, 16, 8, 8), activation='relu', solver='adam', max_iter=1000)
mlp.fit(X_train,y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

report[target] = classification_report(y_test, predict_test, output_dict=True)['weighted avg']
print(classification_report(y_test, predict_test))
print(report[shift_materials.columns[0]])
print(report[shift_materials.columns[3]])

                             ALB   AMCR    APD    AVY  BALL     CE     CF  \
Datetime                                                                    
2022-05-10 09:30:00-04:00  False  False  False  False  True  False   True   
2022-05-10 09:35:00-04:00  False   True   True  False  True   True  False   
2022-05-10 09:40:00-04:00  False  False  False  False  True  False  False   
2022-05-10 09:45:00-04:00   True  False  False  False  True  False   True   
2022-05-10 09:50:00-04:00   True   True  False   True  True  False   True   

                            CTVA     DD    DOW  ...    MLM    MOS    NEM  \
Datetime                                        ...                        
2022-05-10 09:30:00-04:00   True  False  False  ...  False   True   True   
2022-05-10 09:35:00-04:00  False  False   True  ...   True  False   True   
2022-05-10 09:40:00-04:00  False  False  False  ...   True  False   True   
2022-05-10 09:45:00-04:00   True   True  False  ...   True   True  False   
2022

KeyError: 'AVY'

In [72]:
features = shift_energy.columns
sorted_report = {}

for ticker in list(ticker_classifications['Materials']):
    target = ticker

    X = shift_energy[features].values
    y = shift_materials[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 50)

    mlp = MLPClassifier(hidden_layer_sizes=(32, 16, 8, 8), activation='relu', solver='adam', max_iter=1000)
    mlp.fit(X_train,y_train)

    predict_train = mlp.predict(X_train)
    predict_test = mlp.predict(X_test)

    report[target] = classification_report(y_test, predict_test, zero_division=0, output_dict=True)['weighted avg']

In [73]:
report

{'ALB': {'precision': 0.5199576046634871,
  'recall': 0.44919786096256686,
  'f1-score': 0.4009991661862984,
  'support': 187},
 'APD': {'precision': 0.5500559631886581,
  'recall': 0.5614973262032086,
  'f1-score': 0.5463986993398757,
  'support': 187},
 'AMCR': {'precision': 0.5352853511002016,
  'recall': 0.5614973262032086,
  'f1-score': 0.42202203291464563,
  'support': 187},
 'AVY': {'precision': 0.40308455397969467,
  'recall': 0.47593582887700536,
  'f1-score': 0.3195198875834342,
  'support': 187},
 'BALL': {'precision': 0.7517969064458628,
  'recall': 0.5240641711229946,
  'f1-score': 0.36616428676291496,
  'support': 187},
 'CE': {'precision': 0.6121551650963415,
  'recall': 0.5187165775401069,
  'f1-score': 0.38729458604941647,
  'support': 187},
 'CF': {'precision': 0.5187888423182541,
  'recall': 0.5347593582887701,
  'f1-score': 0.38197187374492836,
  'support': 187},
 'CTVA': {'precision': 0.32740427235551484,
  'recall': 0.5721925133689839,
  'f1-score': 0.416493870275

In [75]:
sorted_report_list = sorted(report, key=lambda x:report[x]['precision'], reverse=True)
for ticker in sorted_report_list:
    sorted_report[ticker] = report[ticker]
sorted_report

{'BALL': {'precision': 0.7517969064458628,
  'recall': 0.5240641711229946,
  'f1-score': 0.36616428676291496,
  'support': 187},
 'CE': {'precision': 0.6121551650963415,
  'recall': 0.5187165775401069,
  'f1-score': 0.38729458604941647,
  'support': 187},
 'IFF': {'precision': 0.5641129969774471,
  'recall': 0.42780748663101603,
  'f1-score': 0.2713968314505107,
  'support': 187},
 'LIN': {'precision': 0.5637589836166083,
  'recall': 0.5401069518716578,
  'f1-score': 0.4798230270747146,
  'support': 187},
 'APD': {'precision': 0.5500559631886581,
  'recall': 0.5614973262032086,
  'f1-score': 0.5463986993398757,
  'support': 187},
 'VMC': {'precision': 0.5387973929114311,
  'recall': 0.5026737967914439,
  'f1-score': 0.42440255101158475,
  'support': 187},
 'AMCR': {'precision': 0.5352853511002016,
  'recall': 0.5614973262032086,
  'f1-score': 0.42202203291464563,
  'support': 187},
 'MLM': {'precision': 0.5270744661500965,
  'recall': 0.5133689839572193,
  'f1-score': 0.470233552192695

In [77]:
for ticker in sorted_report_list[:5]:
    target = ticker

    X = shift_energy[features].values
    y = shift_materials[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 50)

    mlp = MLPClassifier(hidden_layer_sizes=(32, 16, 8, 8), activation='relu', solver='adam', max_iter=1000)
    mlp.fit(X_train,y_train)

    predict_train = mlp.predict(X_train)
    predict_test = mlp.predict(X_test)

    print(classification_report(y_test, predict_test, zero_division=0))

              precision    recall  f1-score   support

       False       0.53      0.95      0.68        97
        True       0.64      0.10      0.17        90

    accuracy                           0.54       187
   macro avg       0.59      0.52      0.43       187
weighted avg       0.59      0.54      0.44       187

              precision    recall  f1-score   support

       False       0.53      0.54      0.53        94
        True       0.52      0.51      0.51        93

    accuracy                           0.52       187
   macro avg       0.52      0.52      0.52       187
weighted avg       0.52      0.52      0.52       187

              precision    recall  f1-score   support

       False       0.43      1.00      0.60        79
        True       1.00      0.02      0.04       108

    accuracy                           0.43       187
   macro avg       0.71      0.51      0.32       187
weighted avg       0.76      0.43      0.27       187

              preci