# Train and Evaluate Models

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
from collections import Counter

In [3]:
from sklearn.compose import ColumnTransformer

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import VarianceThreshold

In [6]:
from sklearn.utils import compute_sample_weight
from sklearn.utils import class_weight

In [7]:
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.metrics import classification_report

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

In [9]:
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.metrics import classification_report

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

In [11]:
# Models
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

In [14]:
# Set display and format settings

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

In [15]:
DATA_ROOT_DIR='/mnt/data/projects/MD7'
PROJ_ROOT_DIR='/home/priyesh/projects/MD7'

### Load Data

In [16]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_complete_flat.pkl')
df_complete_flat = pd.read_pickle(filepath)

In [17]:
filepath=os.path.join(PROJ_ROOT_DIR,'pickle','yahoo_complete.pkl')
df_complete = pd.read_pickle(filepath)

In [18]:
# Take a copy so that we can always refer back to original 

df = df_complete_flat.copy()

In [19]:
# Rename final score to label

df.rename(columns={'FinalScore':'label'},inplace=True)

In [20]:
# Drop column company

df.drop('company',axis=1,inplace=True)

In [21]:
df .head()

Unnamed: 0,industry,0_inc_totalRevenue,0_inc_costOfRevenue,0_inc_grossProfit,0_inc_netIncome,0_inc_operatingIncome,0_inc_netIncomeContinuousOperations,0_inc_netInterestIncome,0_inc_interestIncome,0_inc_otherIncomeExpense,0_inc_operatingExpense,0_inc_totalExpenses,0_inc_taxProvision,0_inc_interestExpense,0_inc_SGA,0_inc_researchAndDevelopment,0_inc_ebit,0_inc_dilutedEPS,0_inc_basicEPS,0_bal_cashEquivalent,0_bal_receivables,0_bal_finishedGoods,0_bal_workInProcess,0_bal_rawMaterials,0_bal_otherCurrentAssets,0_bal_inventory,0_bal_currentAssets,0_bal_netPPE,0_bal_otherNonCurrentAssets,0_bal_financialAssets,0_bal_goodwill,0_bal_goodwillAndOtherIntangibleAssets,0_bal_otherIntangibleAssets,0_bal_nonCurrentAccountsReceivable,0_bal_totalNonCurrentAssets,0_bal_currentDebt,0_bal_payablesAndAccruedExpenses,0_bal_otherCurrentLiabilities,0_bal_currentLiabilities,0_bal_longTermDebt,0_bal_otherNonCurrentLiabilities,0_bal_nonCurrentDeferredLiabilities,0_bal_nonCurrentDeferredTaxesLiabilities,0_bal_longTermProvisions,0_bal_totalNonCurrentLiabilities,0_bal_retainedEarnings,0_bal_stockholdersEquity,0_bal_totalDebt,0_bal_totalAssets,0_bal_totalLiabilities,0_bal_workingCapital,0_bal_quickRatio,0_bal_currentRatio,0_bal_debtCapitalRatio,0_bal_debtAssetRatio,0_cas_depreciation,0_cas_stockBasedCompensation,0_cas_cashFlowOperatingActivities,0_cas_capitalExpenditure,0_cas_saleOfBusiness,0_cas_purchaseOfPPE,0_cas_cashFlowInvestingActivities,0_cas_cashDividendsPaid,0_cas_repurchaseOfCapitalStock,0_cas_cashFlowFinancingActivities,0_cas_freeCashFlow,1_inc_totalRevenue,1_inc_costOfRevenue,1_inc_grossProfit,1_inc_netIncome,1_inc_operatingIncome,1_inc_netIncomeContinuousOperations,1_inc_netInterestIncome,1_inc_interestIncome,1_inc_otherIncomeExpense,1_inc_operatingExpense,1_inc_totalExpenses,1_inc_taxProvision,1_inc_interestExpense,1_inc_SGA,1_inc_researchAndDevelopment,1_inc_ebit,1_inc_dilutedEPS,1_inc_basicEPS,1_bal_cashEquivalent,1_bal_receivables,1_bal_finishedGoods,1_bal_workInProcess,1_bal_rawMaterials,1_bal_otherCurrentAssets,1_bal_inventory,1_bal_currentAssets,1_bal_netPPE,1_bal_otherNonCurrentAssets,1_bal_financialAssets,1_bal_goodwill,1_bal_goodwillAndOtherIntangibleAssets,1_bal_otherIntangibleAssets,1_bal_nonCurrentAccountsReceivable,1_bal_totalNonCurrentAssets,1_bal_currentDebt,1_bal_payablesAndAccruedExpenses,1_bal_otherCurrentLiabilities,1_bal_currentLiabilities,1_bal_longTermDebt,1_bal_otherNonCurrentLiabilities,1_bal_nonCurrentDeferredLiabilities,1_bal_nonCurrentDeferredTaxesLiabilities,1_bal_longTermProvisions,1_bal_totalNonCurrentLiabilities,1_bal_retainedEarnings,1_bal_stockholdersEquity,1_bal_totalDebt,1_bal_totalAssets,1_bal_totalLiabilities,1_bal_workingCapital,1_bal_quickRatio,1_bal_currentRatio,1_bal_debtCapitalRatio,1_bal_debtAssetRatio,1_cas_depreciation,1_cas_stockBasedCompensation,1_cas_cashFlowOperatingActivities,1_cas_capitalExpenditure,1_cas_saleOfBusiness,1_cas_purchaseOfPPE,1_cas_cashFlowInvestingActivities,1_cas_cashDividendsPaid,1_cas_repurchaseOfCapitalStock,1_cas_cashFlowFinancingActivities,1_cas_freeCashFlow,2_inc_totalRevenue,2_inc_costOfRevenue,2_inc_grossProfit,2_inc_netIncome,2_inc_operatingIncome,2_inc_netIncomeContinuousOperations,2_inc_netInterestIncome,2_inc_interestIncome,2_inc_otherIncomeExpense,2_inc_operatingExpense,2_inc_totalExpenses,2_inc_taxProvision,2_inc_interestExpense,2_inc_SGA,2_inc_researchAndDevelopment,2_inc_ebit,2_inc_dilutedEPS,2_inc_basicEPS,2_bal_cashEquivalent,2_bal_receivables,2_bal_finishedGoods,2_bal_workInProcess,2_bal_rawMaterials,2_bal_otherCurrentAssets,2_bal_inventory,2_bal_currentAssets,2_bal_netPPE,2_bal_otherNonCurrentAssets,2_bal_financialAssets,2_bal_goodwill,2_bal_goodwillAndOtherIntangibleAssets,2_bal_otherIntangibleAssets,2_bal_nonCurrentAccountsReceivable,2_bal_totalNonCurrentAssets,2_bal_currentDebt,2_bal_payablesAndAccruedExpenses,2_bal_otherCurrentLiabilities,2_bal_currentLiabilities,2_bal_longTermDebt,2_bal_otherNonCurrentLiabilities,2_bal_nonCurrentDeferredLiabilities,2_bal_nonCurrentDeferredTaxesLiabilities,2_bal_longTermProvisions,2_bal_totalNonCurrentLiabilities,2_bal_retainedEarnings,2_bal_stockholdersEquity,2_bal_totalDebt,2_bal_totalAssets,2_bal_totalLiabilities,2_bal_workingCapital,2_bal_quickRatio,2_bal_currentRatio,2_bal_debtCapitalRatio,2_bal_debtAssetRatio,2_cas_depreciation,2_cas_stockBasedCompensation,2_cas_cashFlowOperatingActivities,2_cas_capitalExpenditure,2_cas_saleOfBusiness,2_cas_purchaseOfPPE,2_cas_cashFlowInvestingActivities,2_cas_cashDividendsPaid,2_cas_repurchaseOfCapitalStock,2_cas_cashFlowFinancingActivities,2_cas_freeCashFlow,3_inc_totalRevenue,3_inc_costOfRevenue,3_inc_grossProfit,3_inc_netIncome,3_inc_operatingIncome,3_inc_netIncomeContinuousOperations,3_inc_netInterestIncome,3_inc_interestIncome,3_inc_otherIncomeExpense,3_inc_operatingExpense,3_inc_totalExpenses,3_inc_taxProvision,3_inc_interestExpense,3_inc_SGA,3_inc_researchAndDevelopment,3_inc_ebit,3_inc_dilutedEPS,3_inc_basicEPS,3_bal_cashEquivalent,3_bal_receivables,3_bal_finishedGoods,3_bal_workInProcess,3_bal_rawMaterials,3_bal_otherCurrentAssets,3_bal_inventory,3_bal_currentAssets,3_bal_netPPE,3_bal_otherNonCurrentAssets,3_bal_financialAssets,3_bal_goodwill,3_bal_goodwillAndOtherIntangibleAssets,3_bal_otherIntangibleAssets,3_bal_nonCurrentAccountsReceivable,3_bal_totalNonCurrentAssets,3_bal_currentDebt,3_bal_payablesAndAccruedExpenses,3_bal_otherCurrentLiabilities,3_bal_currentLiabilities,3_bal_longTermDebt,3_bal_otherNonCurrentLiabilities,3_bal_nonCurrentDeferredLiabilities,3_bal_nonCurrentDeferredTaxesLiabilities,3_bal_longTermProvisions,3_bal_totalNonCurrentLiabilities,3_bal_retainedEarnings,3_bal_stockholdersEquity,3_bal_totalDebt,3_bal_totalAssets,3_bal_totalLiabilities,3_bal_workingCapital,3_bal_quickRatio,3_bal_currentRatio,3_bal_debtCapitalRatio,3_bal_debtAssetRatio,3_cas_depreciation,3_cas_stockBasedCompensation,3_cas_cashFlowOperatingActivities,3_cas_capitalExpenditure,3_cas_saleOfBusiness,3_cas_purchaseOfPPE,3_cas_cashFlowInvestingActivities,3_cas_cashDividendsPaid,3_cas_repurchaseOfCapitalStock,3_cas_cashFlowFinancingActivities,3_cas_freeCashFlow,label
0,Diagnostics & Research,5163000000,2358000000,2805000000,1071000000,941000000,1071000000,-38000000,36000000,16000000,1864000000,4222000000,-152000000,74000000,1460000000,404000000,993000000,3,3,1382000000,930000000,416000000,0,263000000,198000000,679000000,3189000000,850000000,611000000,0,4700000000,4700000000,1107000000,0,6263000000,616000000,794000000,0,2080000000,1791000000,473000000,0,0,0,2624000000,-18000000,4748000000,2407000000,9452000000,4704000000,1109000000,1,1,33,25,238000000,72000000,1021000000,156000000,0,155000000,1590000000,206000000,723000000,299000000,865000000,5339000000,2502000000,2837000000,719000000,846000000,719000000,-70000000,8000000,66000000,1991000000,4493000000,123000000,78000000,1496000000,495000000,920000000,2,2,1441000000,1038000000,417000000,0,303000000,216000000,720000000,3415000000,845000000,776000000,0,4433000000,4433000000,831000000,0,6212000000,75000000,639000000,0,1467000000,2284000000,614000000,0,0,0,3287000000,81000000,4873000000,2359000000,9627000000,4754000000,1948000000,1,2,32,24,308000000,83000000,921000000,119000000,0,119000000,147000000,222000000,469000000,717000000,802000000,6319000000,2912000000,3407000000,1210000000,1347000000,1210000000,-79000000,2000000,92000000,2060000000,4972000000,150000000,81000000,1619000000,441000000,1441000000,3,3,1575000000,1172000000,463000000,0,367000000,222000000,830000000,3799000000,945000000,820000000,0,4956000000,4956000000,981000000,0,6906000000,0,774000000,0,1708000000,2729000000,659000000,0,0,0,3608000000,348000000,5389000000,2729000000,10705000000,5316000000,2091000000,1,2,33,25,321000000,110000000,1485000000,189000000,0,188000000,749000000,236000000,788000000,696000000,1296000000,6848000000,3126000000,3722000000,1254000000,1618000000,1254000000,-75000000,9000000,-39000000,2104000000,5230000000,250000000,84000000,1637000000,467000000,1588000000,4,4,1053000000,1405000000,555000000,0,483000000,282000000,1038000000,3778000000,1100000000,670000000,0,4773000000,4773000000,821000000,0,6738000000,36000000,909000000,0,1861000000,2733000000,536000000,0,0,0,3366000000,324000000,5289000000,2769000000,10516000000,5227000000,1917000000,1,2,34,26,317000000,125000000,1312000000,291000000,0,291000000,338000000,250000000,1139000000,1372000000,1021000000,3
1,Aluminum,10433000000,8537000000,1896000000,-1125000000,876000000,-853000000,-121000000,0,-1193000000,1020000000,9557000000,415000000,121000000,280000000,27000000,-317000000,-6,-6,879000000,660000000,305000000,282000000,611000000,288000000,1644000000,3530000000,7916000000,1414000000,18000000,150000000,202000000,52000000,179000000,11110000000,1000000,1588000000,561000000,2563000000,1799000000,371000000,102000000,0,902000000,6221000000,-555000000,4082000000,1800000000,14640000000,8784000000,967000000,0,1,30,12,713000000,30000000,686000000,379000000,0,0,468000000,0,0,444000000,307000000,9286000000,7969000000,1317000000,-170000000,431000000,-14000000,-146000000,0,-112000000,886000000,8855000000,187000000,146000000,206000000,27000000,319000000,0,0,1607000000,556000000,321000000,112000000,553000000,290000000,1398000000,4520000000,7190000000,1444000000,0,145000000,190000000,45000000,134000000,10344000000,2000000,1494000000,870000000,2761000000,2463000000,515000000,101000000,0,918000000,7112000000,-725000000,3287000000,2465000000,14864000000,9873000000,1759000000,1,1,42,16,653000000,25000000,394000000,353000000,0,0,167000000,0,0,-514000000,41000000,12152000000,9153000000,2999000000,429000000,2077000000,570000000,-195000000,0,-683000000,922000000,10075000000,629000000,195000000,227000000,31000000,1394000000,2,2,1814000000,884000000,538000000,85000000,794000000,358000000,1956000000,5026000000,6623000000,1644000000,7000000,144000000,180000000,36000000,215000000,9977000000,1000000,2048000000,791000000,3223000000,1726000000,599000000,90000000,0,887000000,5531000000,-315000000,4638000000,1727000000,15003000000,8754000000,1803000000,0,1,27,11,664000000,39000000,920000000,390000000,0,0,-565000000,19000000,150000000,1158000000,530000000,12451000000,10212000000,2239000000,-102000000,1331000000,38000000,-106000000,0,-523000000,908000000,11120000000,664000000,106000000,204000000,32000000,808000000,0,0,1363000000,909000000,385000000,350000000,1108000000,417000000,2427000000,5250000000,6493000000,1587000000,2000000,145000000,174000000,29000000,366000000,9533000000,1000000,1987000000,681000000,3004000000,1806000000,486000000,65000000,0,937000000,5207000000,-549000000,5058000000,1807000000,14783000000,8211000000,2246000000,0,1,26,12,617000000,40000000,822000000,480000000,0,0,495000000,72000000,500000000,768000000,342000000,3
2,Airlines,45768000000,35379000000,10389000000,1686000000,3700000000,1686000000,-968000000,127000000,-476000000,6689000000,42068000000,570000000,1095000000,1602000000,0,3351000000,3,3,3826000000,1750000000,0,0,0,0,1851000000,8206000000,43732000000,1237000000,0,4091000000,6175000000,2084000000,0,51789000000,4569000000,5741000000,4808000000,18311000000,28875000000,1453000000,5422000000,0,0,41802000000,2264000000,-118000000,33444000000,59995000000,60113000000,-10105000000,0,0,100,55,2318000000,94000000,3815000000,4268000000,0,0,2243000000,178000000,1097000000,1568000000,-453000000,17337000000,24933000000,-7596000000,-8885000000,-11078000000,-8885000000,-1186000000,41000000,811000000,3482000000,28415000000,-2568000000,1227000000,513000000,0,-10226000000,-18,-18,6864000000,1342000000,0,0,0,0,1614000000,11095000000,39738000000,1816000000,0,4091000000,6120000000,2029000000,0,50913000000,4448000000,5331000000,0,16569000000,36573000000,1502000000,7162000000,0,0,52306000000,-6664000000,-6867000000,41021000000,62008000000,68875000000,-5474000000,0,0,120,66,2370000000,91000000,-6543000000,1958000000,0,0,4342000000,43000000,173000000,-10994000000,-8501000000,29882000000,29855000000,27000000,-1993000000,-5065000000,-1993000000,-1782000000,18000000,4299000000,5092000000,34947000000,-555000000,1800000000,1098000000,0,-748000000,-3,-3,12431000000,1505000000,0,0,0,0,1795000000,17336000000,37362000000,2109000000,0,4091000000,6079000000,1988000000,0,49106000000,3995000000,6027000000,0,19005000000,42157000000,1328000000,6239000000,0,0,54777000000,-8638000000,-7340000000,46152000000,66442000000,73782000000,-1669000000,0,0,118,69,2335000000,98000000,704000000,208000000,5000000,204000000,5983000000,0,18000000,-5288000000,496000000,48971000000,39934000000,9037000000,127000000,1763000000,127000000,-1746000000,216000000,169000000,7274000000,47208000000,59000000,1962000000,1815000000,0,2148000000,0,0,8965000000,2138000000,0,0,0,892000000,2279000000,15269000000,38294000000,1904000000,0,4091000000,6150000000,2059000000,0,49447000000,4739000000,6843000000,0,21496000000,38948000000,1258000000,5976000000,0,0,49019000000,-8511000000,-5799000000,43687000000,64716000000,70515000000,-6227000000,0,0,115,67,2298000000,78000000,2173000000,2906000000,0,360000000,-636000000,0,21000000,2631000000,-733000000,1
3,Specialty Retail,9709003000,5454257000,4254746000,486896000,677180000,486896000,-39898000,0,464000,3577566000,9031823000,150850000,39898000,3577566000,0,677644000,6,6,418665000,689469000,0,0,0,155241000,4432168000,5695543000,3798538000,52448000,0,992240000,1701996000,709756000,0,5552982000,0,3957850000,519852000,4477702000,2764479000,123250000,334013000,334013000,0,3221742000,3772848000,3549081000,2764479000,11248525000,7699444000,1217841000,0,1,43,24,238371000,37438000,866909000,471648000,0,270129000,462939000,17185000,498435000,882153000,395261000,10106321000,5624707000,4481614000,493021000,749907000,493021000,-46886000,0,-52006000,3731707000,9356414000,157994000,46886000,3731707000,0,697901000,7,7,834992000,749999000,0,0,0,146811000,4538199000,6270001000,3842589000,52329000,0,993590000,1674717000,681127000,0,5569635000,0,4247443000,496472000,4743915000,3047483000,146281000,342445000,342445000,0,3536209000,4196634000,3559512000,3047483000,11839636000,8280124000,1526086000,0,1,46,25,250081000,45271000,969688000,267806000,0,267576000,266897000,56347000,469691000,285997000,701882000,10997989000,6069241000,4928748000,616108000,838717000,616108000,-37791000,0,4999000,4090031000,10159272000,189817000,37791000,4090031000,0,843716000,9,9,601428000,782785000,0,0,0,232245000,4659018000,6275476000,4200121000,73651000,0,993744000,1644961000,651217000,0,5918733000,0,4699058000,481249000,5180307000,3371971000,103034000,410606000,410606000,0,3885611000,4605791000,3128291000,3371971000,12194209000,9065918000,1095169000,0,1,51,27,259933000,63067000,1112262000,289639000,0,289639000,287314000,160925000,906208000,1064112000,822623000,11154722000,6192622000,4962100000,501872000,714151000,501872000,-51060000,0,-14404000,4247949000,10440571000,146815000,51060000,4247949000,0,699747000,8,8,269282000,698613000,0,0,0,163695000,4915262000,6046852000,4297829000,62429000,0,990471000,1611372000,620901000,0,5971630000,185000000,4757909000,427480000,5370389000,3466601000,87214000,415997000,415997000,0,3969812000,4744624000,2678281000,3651601000,12018482000,9340201000,676463000,0,1,57,30,283800000,50978000,722222000,425961000,0,424061000,424448000,336230000,618480000,620704000,296261000,2
4,Consumer Electronics,260174000000,161782000000,98392000000,55256000000,63930000000,55256000000,1385000000,4961000000,1807000000,34462000000,196244000000,10481000000,3576000000,18245000000,16217000000,63930000000,2,2,100557000000,45804000000,0,0,0,12352000000,4106000000,162819000000,37378000000,32978000000,0,0,0,0,0,175697000000,16240000000,46236000000,37720000000,105718000000,91807000000,50503000000,0,0,0,142310000000,45898000000,90488000000,108047000000,338516000000,248028000000,57101000000,1,1,54,31,12547000000,6068000000,69391000000,10495000000,0,10495000000,-45896000000,14119000000,66897000000,90976000000,58896000000,274515000000,169559000000,104956000000,57411000000,66288000000,57411000000,890000000,3763000000,803000000,38668000000,208227000000,9680000000,2873000000,19916000000,18752000000,66288000000,3,3,90943000000,37445000000,0,0,0,11264000000,4061000000,143713000000,36766000000,42522000000,0,0,0,0,0,180175000000,13769000000,42296000000,42684000000,105392000000,98667000000,54490000000,0,0,0,153157000000,14966000000,65339000000,112436000000,323888000000,258549000000,38321000000,1,1,63,34,11056000000,6829000000,80674000000,7309000000,0,7309000000,4289000000,14081000000,72358000000,86820000000,73365000000,365817000000,212981000000,152836000000,94680000000,108949000000,94680000000,198000000,2843000000,258000000,43887000000,256868000000,14527000000,2645000000,21973000000,21914000000,108949000000,5,5,62639000000,51506000000,0,0,0,14111000000,6580000000,134836000000,39440000000,48849000000,0,0,0,0,0,216166000000,15613000000,54763000000,47493000000,125481000000,109106000000,53325000000,0,0,0,162431000000,5562000000,63090000000,124719000000,351002000000,287912000000,9355000000,1,1,66,35,11284000000,7906000000,104038000000,11085000000,0,11085000000,14545000000,14467000000,85971000000,93353000000,92953000000,394328000000,223546000000,170782000000,99803000000,119437000000,99803000000,-106000000,2825000000,-334000000,51345000000,274891000000,19300000000,2931000000,25094000000,26251000000,119437000000,6,6,48304000000,60932000000,0,0,0,21223000000,4946000000,135405000000,42117000000,54428000000,0,0,0,0,0,217350000000,21110000000,64115000000,60845000000,153982000000,98959000000,49142000000,0,0,0,148101000000,-3068000000,50672000000,120069000000,352755000000,302083000000,-18577000000,0,0,70,34,11104000000,9038000000,122151000000,10708000000,0,10708000000,22354000000,14841000000,89402000000,110749000000,111443000000,3


### Split Data into Independent and Dependent Columns

In [22]:
X = df.drop('label',axis=1)
y = df['label']

In [23]:
print(X.shape,y.shape)

(728, 261) (728,)


### Train Test Split

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=66,shuffle=True,stratify=y)

### Handle Class Imbalance

In [25]:
def balance_dataset(X,y):

  df = pd.concat([X,y],axis=1)

  df_class_0 = df[df['label'] == 0]
  df_class_1 = df[df['label'] == 1]
  df_class_2 = df[df['label'] == 2]
  df_class_3 = df[df['label'] == 3]
  df_class_4 = df[df['label'] == 4]

  df_class_0_over = df_class_0.sample(252, replace=True)
  df_class_1_over = df_class_1.sample(252, replace=True)
  df_class_2_over = df_class_2.sample(252, replace=True)
  df_class_4_over = df_class_4.sample(252, replace=True)

  df_temp = pd.concat([df_class_0_over,df_class_1_over,df_class_2_over,df_class_3,df_class_4_over],axis=0)

  new_X = df_temp.drop('label',axis=1)
  new_y = df_temp['label']

  return new_X, new_y

### Cap Outliers

In [None]:
def cap_ext_outliers(train_X,test_X):

  df_train = train_X.copy()
  df_test = test_X.copy()
    
  cols = df_train.columns
  cols = list(cols)

  for feature in cols:
    IQR = X_train[feature].quantile(0.75)- X_train[feature].quantile(0.25)
    ext_lower_bound = df_train[feature].quantile(0.75) - (IQR*3)
    ext_upper_bound = df_train[feature].quantile(0.75) + (IQR*3) 
      
    df_train[feature] = np.where(df_train[feature] < ext_lower_bound, ext_lower_bound, df_train[feature])
    df_train[feature] = np.where(df_train[feature] > ext_upper_bound, ext_upper_bound, df_train[feature])  

    df_test[feature] = np.where(df_test[feature] < ext_lower_bound, ext_lower_bound, df_test[feature])
    df_test[feature] = np.where(df_test[feature] > ext_upper_bound, ext_upper_bound, df_test[feature])  

  return df_train,df_test


In [46]:
def cap_outliers(train_X,test_X):

  df_train = train_X.copy()
  df_test = test_X.copy()
    
  cols = df_train.columns
  cols = list(cols)

  for feature in cols:
    IQR = X_train[feature].quantile(0.75)- X_train[feature].quantile(0.25)
    ext_lower_bound = df_train[feature].quantile(0.75) - (IQR*1.5)
    ext_upper_bound = df_train[feature].quantile(0.75) + (IQR*1.5) 
      
    df_train[feature] = np.where(df_train[feature] < ext_lower_bound, ext_lower_bound, df_train[feature])
    df_train[feature] = np.where(df_train[feature] > ext_upper_bound, ext_upper_bound, df_train[feature])  

    df_test[feature] = np.where(df_test[feature] < ext_lower_bound, ext_lower_bound, df_test[feature])
    df_test[feature] = np.where(df_test[feature] > ext_upper_bound, ext_upper_bound, df_test[feature])  

  return df_train,df_test

In [27]:
class cap_outlier_ext(BaseEstimator,TransformerMixin):
    
    def __init__(self, factor=3.0):
        self.factor = factor
        
    def outliers_iqr(self, X, y=None):
        X = pd.Series(X).copy()
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (iqr * self.factor)
        upper_bound = q3 + (iqr * self.factor)
        # filter only those rows that are greater than lower_bound and less than upper_bound, 
        #i.e. drop values outside the given interval
        X.loc[X < lower_bound] = lower_bound
        X.loc[X > upper_bound] = upper_bound
        
        return pd.Series(X)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.apply(self.outliers_iqr)


In [48]:
class cap_outlier(BaseEstimator,TransformerMixin):
    
    def __init__(self, factor=1.5):
        self.factor = factor
        
    def outliers_iqr(self, X, y=None):
        X = pd.Series(X).copy()
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (iqr * self.factor)
        upper_bound = q3 + (iqr * self.factor)
        # filter only those rows that are greater than lower_bound and less than upper_bound, 
        #i.e. drop values outside the given interval
        X.loc[X < lower_bound] = lower_bound
        X.loc[X > upper_bound] = upper_bound
        
        return pd.Series(X)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.apply(self.outliers_iqr)


In [50]:
# Cap outliers

l = X_train.shape[1]

ctrf_cap_outliers = ColumnTransformer(
                    [('capOutlier',cap_outlier(), slice(0,l))
                    ],remainder='passthrough') 


### Corelated Feaures

In [None]:
# Select highly correlated features

def correlation(dataset, threshold):
  col_corr = set()
  corr_matrix = dataset.corr()
  for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i,j] > threshold:
          colname = corr_matrix.columns[i]
          col_corr.add(colname)
  return col_corr

In [None]:
col_corr = correlation(X_train, 90)
col_corr

In [28]:
class removeCorrFeatures(BaseEstimator,TransformerMixin):
    
    def __init__(self, threshold):
        self.threshold = threshold
        self.correlated_features = None

    def fit(self, X, y=None):
        col_corr = set() 
        X = pd.DataFrame(X)
        corr_matrix = X.corr()
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if corr_matrix.iloc[i, j] > self.threshold: # we are interested in absolute coeff value
                    colname = corr_matrix.columns[i]  # getting the name of column
                    col_corr.add(colname)
        self.correlated_features = col_corr
        return self

    def transform(self, X, y=None, **kwargs):
        return (pd.DataFrame(X)).drop(self.correlated_features, axis=1)


### Model Selection

In [31]:
def train_eval_model(model,train_X,train_y,test_X,test_y,use_sample_weights=False):

  if use_sample_weights:
    class_weights = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=train_y)
    model.fit(train_X, train_y,sample_weight = class_weights)
  else:
    model.fit(train_X, train_y)  
      
  y_pred = model.predict(test_X)
  y_train_predict = model.predict(train_X)

  train_accuracy = round(accuracy_score(train_y, y_train_predict),3)
  test_accuracy = round(accuracy_score(test_y, y_pred),3)

  train_f1 = round(f1_score(train_y, y_train_predict,zero_division=np.nan, average='weighted'),3)
  test_f1 = round(f1_score(test_y, y_pred,zero_division=np.nan, average='weighted'),3)
    
  print('Train accuracy',train_accuracy)
  print('Test accuracy',test_accuracy) 
  print
    
  print('Train F1 Score',train_f1)
  print('Test F1 Score',test_f1) 
  print("")

  cross_val_accuracy = round(np.mean(cross_val_score(model,train_X,train_y,scoring='accuracy',cv=5)),3)
 
  cross_val_f1 = round(np.mean(cross_val_score(model,train_X,train_y,
                                         scoring=make_scorer(f1_score, average='weighted'),
                                         cv=5)),3)
  
  print('Cross val accuracy:',cross_val_accuracy)
  print('Cross val f1:',cross_val_f1)
  print("")
  print(pd.crosstab(test_y,y_pred))
  print("")
  print(classification_report(test_y, y_pred,zero_division=np.nan))


## Baseline Evaluation 

Default parameters, no modification to dataset.

In [None]:
# Drop Industry

X_train = X_train.drop('industry',axis=1)
X_test = X_test.drop('industry',axis=1)

### XGBoost

In [None]:
model = xgb.XGBClassifier(
                       random_state=0)      

train_eval_model(model, X_train, y_train, X_test, y_test)


In [None]:
model = xgb.XGBClassifier(
                       random_state=50)      

train_eval_model(model, X_train, y_train, X_test, y_test,use_sample_weights=True)

### Adaboost

In [None]:
model = AdaBoostClassifier(n_estimators=200,algorithm='SAMME')
train_eval_model(model, X_train, y_train, X_test, y_test)

In [None]:
model = AdaBoostClassifier(n_estimators=200,algorithm='SAMME.R')
train_eval_model(model, X_train, y_train, X_test, y_test)

### Random Forest

In [None]:
model = RandomForestClassifier(random_state=0)
train_eval_model(model, X_train, y_train, X_test, y_test)

### Gradient Boosting

In [None]:
model = RandomForestClassifier(random_state=0)
train_eval_model(model, X_train, y_train, X_test, y_test,use_sample_weights=True)

### Hyperparameter Tuning

In [None]:
X_train, y_train = balance_dataset(X_train, y_train)
X_train.shape

In [None]:
counter = Counter(y_train)
for k, v in counter.items():
  dist = v / len(y_train) * 100
  print(f"Class={k}, n={v} ({dist}%)")

In [None]:
ctrf_cap_outliers

### XGBoost

In [None]:
clf = xgb.XGBClassifier(random_state=0)     

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers), 
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("select", SelectKBest(score_func=mutual_info_classif, k=100)),    
           ("classifier",clf)])    

param_grid = {
    "classifier__max_depth": [2,3,4,5,6,7],
    "classifier__n_estimators":[50,70,90,110,150,200,250,300,250,400],
    "classifier__learning_rate":[0.05,0.07,0.09,0.11,0.15],
    "classifier__gamma":[0.01,0.1,1,2,3,4,5,6,7]
}
           
gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    verbose=10,
    n_jobs=-1,
    cv=5
)

gridSearch = GridSearchCV(pipe,param_grid=param_grid, n_jobs=-1)
gridSearch.fit(X_train,y_train)

In [None]:
gridSearch.best_score_

In [None]:
gridSearch.best_params_

In [None]:
clf = xgb.XGBClassifier(random_state=50,
                          max_depth = 7,
                          learning_rate = 0.15,
                          gamma = 0.1,
                          n_estimators = 70)   

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers), 
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("select", SelectKBest(score_func=mutual_info_classif, k=100)),    
           ("classifier",clf)])    

train_eval_model(model, X_train, y_train, X_test, y_test)

### Random Forest

In [None]:
clf=RandomForestClassifier(random_state=0)

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers), 
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("select", SelectKBest(score_func=mutual_info_classif, k=100)),    
           ("classifier",clf)])    

param_grid = {
    "classifier__criterion":["gini","entropy"],
    "classifier__max_depth": [2,3,4,5,6],
    "classifier__n_estimators":[50,70,90,110,150,200,250,300,250,400],
}
           
gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    verbose=10,
    n_jobs=-1,
    cv=5
)

gridSearch = GridSearchCV(pipe,param_grid=param_grid, n_jobs=-1)
gridSearch.fit(X_train,y_train)

In [None]:
gridSearch.best_score_

In [None]:
gridSearch.best_params_

In [None]:
clf=RandomForestClassifier(random_state=0,
                           max_depth = 6,
                           criterion = 'entropy',
                           n_estimators = 300)

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers),
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("select", SelectKBest(score_func=mutual_info_classif, k=100)),    
           ("classifier", RandomForestClassifier()),]
)
train_eval_model(pipe, X_train, y_train, X_test, y_test)

### XGBoost

In [None]:

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers),
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("select", SelectKBest(score_func=mutual_info_classif, k=100)),    
           ("classifier",  xgb.XGBClassifier()),]
)
train_eval_model(pipe, X_train, y_train, X_test, y_test)

## Optimised Models

In [66]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=66,shuffle=True,stratify=y)

In [67]:
# Handle Imblanced Dataset 

# Resampling doesn't improve performance so comment out.

#X_train, y_train = balance_dataset(X_train,y_train)

In [68]:
# Drop Industry

X_train = X_train.drop('industry',axis=1)
X_test = X_test.drop('industry',axis=1)

### XGBoost

In [69]:
clf = xgb.XGBClassifier(
                       max_depth = 3,
                       n_estimators = 120,
                       learning_rate = 0.14,
                       gamma = 0.01,
                       reg_lambda = 0.5,
                       reg_alpha = 0.6,
                       min_child_weight=5,
                       random_state=100)      

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers),
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("classifier", clf),]
)

train_eval_model(pipe, X_train, y_train, X_test, y_test)

Train accuracy 1.0
Test accuracy 0.712
Train F1 Score 1.0
Test F1 Score 0.705

Cross val accuracy: 0.637
Cross val f1: 0.624

col_0  0   1   2   3  4
label                  
0      4   1   0   0  0
1      1  12   4   0  0
2      0   2  37  16  0
3      0   0  12  50  1
4      0   0   0   5  1

              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.80      0.71      0.75        17
           2       0.70      0.67      0.69        55
           3       0.70      0.79      0.75        63
           4       0.50      0.17      0.25         6

    accuracy                           0.71       146
   macro avg       0.70      0.63      0.65       146
weighted avg       0.71      0.71      0.71       146



### Gradient Boost

In [70]:
clf=GradientBoostingClassifier( n_estimators = 75,
                                  max_depth = 3,
                                  learning_rate = 0.16 )


pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers),
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("classifier", clf),]
)

train_eval_model(pipe, X_train, y_train, X_test, y_test)

Train accuracy 1.0
Test accuracy 0.712
Train F1 Score 1.0
Test F1 Score 0.728

Cross val accuracy: 0.653
Cross val f1: 0.639

col_0  0   1   2   3  4
label                  
0      5   0   0   0  0
1      1  10   6   0  0
2      1   1  38  15  0
3      1   0  10  51  1
4      0   0   0   6  0

              precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       0.91      0.59      0.71        17
           2       0.70      0.69      0.70        55
           3       0.71      0.81      0.76        63
           4       0.00      0.00       nan         6

    accuracy                           0.71       146
   macro avg       0.59      0.62      0.73       146
weighted avg       0.70      0.71      0.73       146



### Random Forest

In [71]:
clf=RandomForestClassifier(n_estimators = 100,
                             max_depth = 6,
                             random_state=0)

pipe = Pipeline(
    steps=[("capOutliers",ctrf_cap_outliers),
           ("removeCorrFeatures",removeCorrFeatures(threshold=0.90)),
           ("classifier", clf),]
)

train_eval_model(pipe, X_train, y_train, X_test, y_test)

Train accuracy 0.875
Test accuracy 0.705
Train F1 Score 0.859
Test F1 Score 0.713

Cross val accuracy: 0.655
Cross val f1: 0.635

col_0  0  1   2   3
label              
0      5  0   0   0
1      2  9   6   0
2      0  2  34  19
3      0  0   8  55
4      0  0   0   6

              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       0.82      0.53      0.64        17
           2       0.71      0.62      0.66        55
           3       0.69      0.87      0.77        63
           4        nan      0.00       nan         6

    accuracy                           0.71       146
   macro avg       0.73      0.60      0.73       146
weighted avg       0.71      0.71      0.71       146



Need to do more work to deal with bias-variance problem. Tried early stopping but that didn't make much difference
to performance. Feature selection doesn't improve performance, so it was removed.