In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from yellowbrick.target import FeatureCorrelation
from cuml import RandomForestClassifier as cuRF
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# NOTE(review): this opens a new, empty 20x18-inch figure as soon as the import
# cell runs. Presumably it was meant to set a default figure size; later plotting
# cells create their own figures, so confirm whether this call is still needed.
plt.figure(figsize = (20, 18))

In [None]:
# Read the competition training set and show its (rows, columns) shape.
oct_data = pd.read_csv(
    "/kaggle/input/tabular-playground-series-oct-2021/train.csv"
)
oct_data.shape

#### My learning in this competition: the kernel kept crashing, so I started using the code snippet below to reduce memory usage

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * Signed / unsigned integer columns are narrowed to the smallest
      int8..int64 / uint8..uint64 type whose range (bounds inclusive)
      contains the column's minimum and maximum.
    * Float columns wider than 32 bits become float32 when their range fits
      (this trades precision for memory); otherwise they are left as they are.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, complex, pandas extension
      dtypes such as ``category``) is left unchanged.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate widths, narrowest first, so the first match is the smallest fit.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # pandas extension dtypes are not numpy dtypes and have no meaningful
        # min/max for this purpose; leave them untouched.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: e.g. -128 and 127 both fit in int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN or infinite extremes fail this comparison, which keeps the
            # column at its original width.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
        # Remaining kinds (bool, datetime, ...) are already compact or cannot
        # be narrowed safely, so they are skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
oct_data = reduce_mem_usage(oct_data)

In [None]:
oct_data.head()

In [None]:

oct_data.groupby(['target']).size()

#### Feature selection will be done manually with Yellowbrick

In [None]:
# Count missing labels. Group sizes from groupby().size() are never null, so
# testing them for nulls says nothing about missing data.
oct_data['target'].isnull().sum()

In [None]:
oct_data.nunique()

In [None]:
oct_data.isnull().sum()

Subset ten columns at a time and apply Yellowbrick's `FeatureCorrelation` visualizer to find the important features

In [None]:
# Features f1 to f10 plus the label.
df = oct_data[[f'f{i}' for i in range(1, 11)] + ['target']]

In [None]:
df.plot.scatter(x='f1', y='f10',c='target',colormap='viridis',figsize=(20, 18));

In [None]:
df.describe().transpose().round(2)

In [None]:
df.info()

In [None]:
#checking for correlation
pearson_corr = df.corr(method='pearson')

pearson_corr

In [None]:
# Annotated heatmap of the pairwise Pearson correlations in `pearson_corr`.
heatmap_ax = sns.heatmap(
    pearson_corr,
    linewidth=1,
    annot=True,
    annot_kws={'size' : 10},
)
heatmap_ax.set_title('Pearson correlations', fontsize=25)

plt.show()

In [None]:
sns.countplot(data=df,x='target')

In [None]:
# Split the current slice into the feature matrix and the label vector.
features = df.drop(columns='target')
target = df['target']

# Feature selection with Yellowbrick

In [None]:
# Column labels used for the bars of the correlation plot.
feature_names = features.columns.tolist()

In [None]:


# Pearson correlation of each feature in the current slice with the target.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')

visualizer.fit(features, target)

# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

From the above plot we can see f1, f2, f3, f4, f8 are the important features

In [None]:
# Features f11 to f20 plus the label.
df = oct_data[[f'f{i}' for i in range(11, 21)] + ['target']]

In [None]:
# Split the current slice into the feature matrix and the label vector.
features = df.drop(columns='target')
target = df['target']

In [None]:
# Column labels used for the bars of the correlation plot.
feature_names = features.columns.tolist()

In [None]:
# Pearson correlation of each feature in the current slice with the target.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')

visualizer.fit(features, target)

# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

The important features are f12, f13, f14, f16,f17,f18,f19,f20

In [None]:
# Features f21 to f30 plus the label.
df = oct_data[[f'f{i}' for i in range(21, 31)] + ['target']]

In [None]:
# Pearson correlation of each feature in the current slice with the target.
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f22 is the feature to consider for model training

In [None]:
# Features f31 to f40 plus the label.
df = oct_data[[f'f{i}' for i in range(31, 41)] + ['target']]

In [None]:
# Pearson correlation of each feature in the current slice with the target.
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f31,f32,f33,f34,f35,f36,f39,f40 are the features

In [None]:
# Features f41 to f50 plus the label.
df = oct_data[[f'f{i}' for i in range(41, 51)] + ['target']]

In [None]:
# Pearson correlation of each feature in the current slice with the target.
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f42,f43,f44,f48 are the features

In [None]:
# Features f51 to f60 plus the label.
df = oct_data[[f'f{i}' for i in range(51, 61)] + ['target']]

In [None]:
# Pearson correlation of each feature in the current slice with the target.
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f52, f56,f58 are the features

In [None]:
# Pearson correlation of features f61 to f70 with the target.
df = oct_data[[f'f{i}' for i in range(61, 71)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()


f63, f69

In [None]:
# Pearson correlation of features f71 to f80 with the target.
df = oct_data[[f'f{i}' for i in range(71, 81)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()


f72,f73,f74,f75,f77,f78 features

In [None]:
# Pearson correlation of features f81 to f90 with the target.
df = oct_data[[f'f{i}' for i in range(81, 91)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f82,f83,f85,f86,f87,f89,f90 features

In [None]:
# Pearson correlation of features f91 to f100 with the target.
df = oct_data[[f'f{i}' for i in range(91, 101)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f92, f93, f95, f96, f98, f99 features

In [None]:
# Pearson correlation of features f101 to f110 with the target.
df = oct_data[[f'f{i}' for i in range(101, 111)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f103, f107, f108 features

In [None]:
# Pearson correlation of features f111 to f120 with the target.
df = oct_data[[f'f{i}' for i in range(111, 121)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f112, f117, f119 features

In [None]:
# Pearson correlation of features f121 to f130 with the target.
df = oct_data[[f'f{i}' for i in range(121, 131)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f123, f125,f126, f127,f128, f129, f130 features

In [None]:
# Pearson correlation of features f131 to f140 with the target.
df = oct_data[[f'f{i}' for i in range(131, 141)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f134, f136

In [None]:
# Pearson correlation of features f141 to f150 with the target.
df = oct_data[[f'f{i}' for i in range(141, 151)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f141, f143, f144, f146, f147, f150 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f151 to f160 with the target.
df = oct_data[[f'f{i}' for i in range(151, 161)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f152, f154, f156 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f161 to f170 with the target.
df = oct_data[[f'f{i}' for i in range(161, 171)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f161, f162, f163, f164, f169, f170 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f171 to f180 with the target.
df = oct_data[[f'f{i}' for i in range(171, 181)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f179 feature

In [None]:
df.head()

In [None]:
# Pearson correlation of features f181 to f190 with the target.
df = oct_data[[f'f{i}' for i in range(181, 191)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f181, f182, f183,f184,f187,f188,f189

In [None]:
df.head()

In [None]:
# Pearson correlation of features f191 to f200 with the target.
df = oct_data[[f'f{i}' for i in range(191, 201)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f191, f192, f193,f195, f199,f200 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f201 to f210 with the target.
df = oct_data[[f'f{i}' for i in range(201, 211)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f201, f206, f208, f210 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f211 to f220 with the target.
df = oct_data[[f'f{i}' for i in range(211, 221)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f211,  f213, f214 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f221 to f230 with the target.
df = oct_data[[f'f{i}' for i in range(221, 231)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f222,f224,f225,f226,f227,f229,f230

In [None]:
df.head()

In [None]:
# Pearson correlation of features f231 to f240 with the target.
df = oct_data[[f'f{i}' for i in range(231, 241)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f231, f232, f239, f240 features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f241 to f250 with the target.
df = oct_data[[f'f{i}' for i in range(241, 251)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f241, f243, f247 are the features. f243 and f247 are categorical features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f251 to f260 with the target.
df = oct_data[[f'f{i}' for i in range(251, 261)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f252, f254, f255, f256, f258, f259, f260 are the features. All are categorical features

In [None]:
df.head()

In [None]:
# Pearson correlation of features f261 to f270 with the target.
df = oct_data[[f'f{i}' for i in range(261, 271)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f261, f265, f266, f267, f269 categorical features 

In [None]:
df.head()

In [None]:
# Pearson correlation of features f271 to f280 with the target.
df = oct_data[[f'f{i}' for i in range(271, 281)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f271, f273, f274, f275, f276, f277,f278,f279,f280 categorical features

In [None]:
df.head()

In [None]:
# Pearson correlation of the last four features, f281 to f284, with the target.
df = oct_data[[f'f{i}' for i in range(281, 285)] + ['target']]
target = df['target']
features = df.drop('target', axis=1)

feature_names = list(features.columns)
figure(figsize=(20,18), dpi=80)

visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
# `poof()` was deprecated in favour of `show()` in Yellowbrick 1.0.
visualizer.show()

f281, f282, f283,f284 categorical features

In [None]:
df.head()

Manual feature selection is complete; we will now train the model with the selected features

In [None]:
# Columns chosen by eye from the correlation plots, in f-number order,
# followed by the label column.
selected_columns = [
    'f1', 'f2', 'f3', 'f4', 'f8',
    'f12', 'f13', 'f14', 'f16', 'f17', 'f18', 'f19', 'f20',
    'f22',
    'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f39', 'f40',
    'f42', 'f43', 'f44', 'f48',
    'f52', 'f56', 'f58',
    'f63', 'f69',
    'f72', 'f73', 'f74', 'f75', 'f77', 'f78',
    'f82', 'f83', 'f85', 'f86', 'f87', 'f89', 'f90',
    'f92', 'f93', 'f95', 'f96', 'f98', 'f99',
    'f103', 'f107', 'f108',
    'f112', 'f117', 'f119',
    'f123', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130',
    'f134', 'f136',
    'f141', 'f143', 'f144', 'f146', 'f147', 'f150',
    'f152', 'f154', 'f156',
    'f161', 'f162', 'f163', 'f164', 'f169', 'f170',
    'f179',
    'f181', 'f182', 'f183', 'f184', 'f187', 'f188', 'f189',
    'f191', 'f192', 'f193', 'f195', 'f199', 'f200',
    'f201', 'f206', 'f208', 'f210',
    'f211', 'f213', 'f214',
    'f222', 'f224', 'f225', 'f226', 'f227', 'f229', 'f230',
    'f231', 'f232', 'f239', 'f240',
    'f241', 'f243', 'f247',
    'f252', 'f254', 'f255', 'f256', 'f258', 'f259', 'f260',
    'f261', 'f265', 'f266', 'f267', 'f269',
    'f271', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280',
    'f281', 'f282', 'f283', 'f284',
    'target',
]
df = oct_data[selected_columns]

In [None]:
df.shape

In [None]:
df.head()

In [None]:
del oct_data

In [None]:
# Selected features that are treated as numeric (these are the ones
# standardised later in the notebook).
numeric_feature_names = [
    'f1', 'f2', 'f3', 'f4', 'f8',
    'f12', 'f13', 'f14', 'f16', 'f17', 'f18', 'f19', 'f20',
    'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f39', 'f40',
    'f42', 'f44', 'f48',
    'f52', 'f56', 'f58',
    'f63', 'f69',
    'f72', 'f73', 'f74', 'f75', 'f77', 'f78',
    'f82', 'f83', 'f85', 'f86', 'f87', 'f89', 'f90',
    'f92', 'f93', 'f95', 'f96', 'f98', 'f99',
    'f103', 'f107', 'f108',
    'f112', 'f117', 'f119',
    'f123', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130',
    'f134', 'f136',
    'f141', 'f143', 'f144', 'f146', 'f147', 'f150',
    'f152', 'f154', 'f156',
    'f161', 'f162', 'f163', 'f164', 'f169', 'f170',
    'f179',
    'f181', 'f182', 'f183', 'f184', 'f187', 'f188', 'f189',
    'f191', 'f192', 'f193', 'f195', 'f199', 'f200',
    'f201', 'f206', 'f208', 'f210',
    'f211', 'f213', 'f214',
    'f222', 'f224', 'f225', 'f226', 'f227', 'f229', 'f230',
    'f231', 'f232', 'f239', 'f240',
    'f241',
]
df_numeric_cols = df[numeric_feature_names]

In [None]:
# Selected features that are treated as categorical (left unscaled).
categorical_feature_names = [
    'f22', 'f43',
    'f243', 'f247',
    'f252', 'f254', 'f255', 'f256', 'f258', 'f259', 'f260',
    'f261', 'f265', 'f266', 'f267', 'f269',
    'f271', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280',
    'f281', 'f282', 'f283', 'f284',
]
df_cat_cols = df[categorical_feature_names]

In [None]:
df_cat_cols.head()


In [None]:
df_cat_cols.info()

In [None]:
# Standardise the numeric features (zero mean, unit variance) in place.
# The column names come from `df_numeric_cols`, which was built from the same
# list, instead of repeating the long literal twice.
numeric_cols = list(df_numeric_cols.columns)

# Give the scaled values df's own index and column labels so the assignment
# aligns correctly whatever index df carries; an unlabelled frame only lines up
# when df has a default 0..n-1 index.
# NOTE(review): the mean/std are computed over all rows, before the hold-out
# split made later in the notebook, so the hold-out rows influence the scaling.
df[numeric_cols] = pd.DataFrame(
    preprocessing.scale(df[numeric_cols]),
    index=df.index,
    columns=numeric_cols,
)

In [None]:
# First rows of the numeric columns of `df` after scaling. `df_numeric_cols`
# is used only for its column names, which are the same list as the literal
# it replaces.
df[list(df_numeric_cols.columns)].head()

In [None]:
# Summary statistics of the numeric columns of `df` after scaling.
# `df_numeric_cols` is used only for its column names.
df[list(df_numeric_cols.columns)].describe().T # mean close to 0 and standard deviation = 1

In [None]:
df.head()

In [None]:
df.shape

In [None]:
del df_cat_cols

In [None]:
del df_numeric_cols

In [None]:
# Separate the label from the features and hold out 20% of the rows.
X = df.drop('target',axis=1)
Y = df['target']

# A fixed seed gives the same split on every run, so the reported scores are
# reproducible. Stratifying keeps the class ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
x_train.shape, y_train.shape,x_test.shape, y_test.shape

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
del features

In [None]:
del target

In [None]:
del df

# RandomForest Classifier

# CPU

In [None]:
# 50-tree forest with a fixed seed. n_jobs=-1 fits and predicts the trees on
# all CPU cores; for a fixed random_state this does not change the fitted model.
rnd_clf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)

In [None]:
rnd_clf.fit(x_train, y_train)

In [None]:

y_pred = rnd_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
plot_confusion_matrix(rnd_clf,x_test,y_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
# Class probabilities from every individual tree of the forest, each with a
# new leading axis so they can be stacked tree by tree.
predictions = [
    estimator.predict_proba(x_test)[None, :]
    for estimator in rnd_clf.estimators_
]

In [None]:
predictions = np.vstack(predictions)

In [None]:
# Running mean along the tree axis: entry k averages the first k+1 trees.
tree_counts = np.arange(1, predictions.shape[0] + 1).reshape(-1, 1, 1)
cum_mean = np.cumsum(predictions, axis=0) / tree_counts

In [None]:
# Hold-out accuracy of the averaged prediction for each ensemble size.
scores = [
    accuracy_score(y_test, np.argmax(pred, axis=1))
    for pred in cum_mean
]

In [None]:
# Accuracy as a function of how many trees are averaged.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(scores, linewidth=3)
ax.set_xlabel('num_trees')
ax.set_ylabel('accuracy');

# StratifiedKFold Cross Validation

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)
acc = []
test_predictions = []

In [None]:
# Stratified K-fold evaluation on the training split.
# A fresh forest with the same hyper-parameters as `rnd_clf` is built for each
# fold. Refitting `rnd_clf` itself would replace the model trained on the full
# training split with one trained on only part of it, and that model is what
# the later feature-importance and submission cells use.
for fold, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
    x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    print("Fold ",fold)
    fold_clf = RandomForestClassifier(**rnd_clf.get_params())
    fold_clf.fit(x_train_fold, y_train_fold)
    acc.append(fold_clf.score(x_test_fold, y_test_fold))
    # Report this fold's accuracy only; the full list is printed after the loop.
    print('Accuracy:', acc[-1])
    # Hold-out predictions of this fold's model, combined across folds later.
    preds = fold_clf.predict(x_test)
    test_predictions.append(preds)

In [None]:
# Print the output.
print('Accuracy:', acc)


In [None]:
del acc

In [None]:
# Combine the fold models by majority vote. The row-wise mean of the predicted
# labels is the share of folds voting for class 1 (assumes labels are 0/1).
# Casting that mean straight to int truncates, e.g. 0.8 -> 0, so a class-1
# prediction would need a unanimous vote; threshold at 0.5 instead.
vote_share = np.mean(np.column_stack(test_predictions),axis=1)
y_pred = (vote_share >= 0.5).astype('int32')
y_test = y_test.astype('int32')
print(accuracy_score(y_test,y_pred))

# Important Features

In [None]:
importances = rnd_clf.feature_importances_

In [None]:
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]

In [None]:
print(x_train.columns[sorted_indices])

In [None]:
# Bar chart of the forest's feature importances, most important first.
n_features = x_train.shape[1]
ranked_names = x_train.columns[sorted_indices]
bar_positions = range(n_features)

figure(figsize=(18,25), dpi=80)
plt.title('Feature Importance')
plt.bar(bar_positions, importances[sorted_indices], align='center')
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.tight_layout()
plt.show()

# Random Forest and GPU

In [None]:
# cuml Random Forest params     
cu_rf_params = { 'n_estimators': 500, 'max_depth':8 } 

In [None]:
x_train = x_train.astype('float32')

In [None]:
y_train = y_train.astype('float32')

In [None]:
x_test = x_test.astype('float32')

In [None]:
y_test = y_test.astype('float32')

In [None]:
cu_rf = cuRF(**cu_rf_params)
cu_rf.fit(x_train, y_train)  

In [None]:
y_pred = cu_rf.predict(x_test)
print(accuracy_score(y_test, y_pred)) # 0.75919

# Picking the top 10 features from the prior trained model and the feature importance plot

In [None]:
x_train = x_train[['f22', 'f179', 'f69', 'f58', 'f136', 'f214', 'f156', 'f78', 'f8', 'f12']]

In [None]:
cu_rf.fit(x_train, y_train)  

In [None]:
x_test = x_test[['f22', 'f179', 'f69', 'f58', 'f136', 'f214', 'f156', 'f78', 'f8', 'f12']]

In [None]:
y_pred = cu_rf.predict(x_test)
print(accuracy_score(y_test, y_pred))

# Stratified Cross Validation

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)
val_acc = []
test_predictions = []


In [None]:
# Stratified K-fold evaluation of the GPU forest on the training split.
for fold, (train_index, test_index) in enumerate(skf.split(x_train, y_train)):
    x_train_fold = x_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    x_test_fold = x_train.iloc[test_index]
    y_test_fold = y_train.iloc[test_index]
    print('Fold', fold )

    # Fit on this fold's training part and report the score on that same part.
    cu_rf.fit(x_train_fold, y_train_fold)
    train_score = cu_rf.score(x_train_fold, y_train_fold)
    print("score : ", train_score)

    # Accuracy on the validation part of the fold.
    y_pred = cu_rf.predict(x_test_fold)
    fold_accuracy = accuracy_score(y_test_fold, y_pred)
    print(fold_accuracy)

    # Hold-out predictions of this fold's model, combined across folds later.
    preds = cu_rf.predict(x_test)
    test_predictions.append(preds)
    
   

In [None]:
# Combine the fold models by majority vote. The row-wise mean of the predicted
# labels is the share of folds voting for class 1 (assumes labels are 0/1).
# Casting that mean straight to int truncates, e.g. 0.8 -> 0, so a class-1
# prediction would need a unanimous vote; threshold at 0.5 instead.
vote_share = np.mean(np.column_stack(test_predictions), axis=1)
y_pred = (vote_share >= 0.5).astype('int32')

In [None]:
y_test = y_test.astype('int32')

In [None]:
print(accuracy_score(y_test, y_pred))

# Test Data and Submission file

In [None]:
# Score the competition test set and write the submission file.
test_data = pd.read_csv("/kaggle/input/tabular-playground-series-oct-2021/test.csv")

# The selected features, in the same order as the columns used for training.
submission_features = [
    'f1', 'f2', 'f3', 'f4', 'f8',
    'f12', 'f13', 'f14', 'f16', 'f17', 'f18', 'f19', 'f20',
    'f22',
    'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f39', 'f40',
    'f42', 'f43', 'f44', 'f48',
    'f52', 'f56', 'f58',
    'f63', 'f69',
    'f72', 'f73', 'f74', 'f75', 'f77', 'f78',
    'f82', 'f83', 'f85', 'f86', 'f87', 'f89', 'f90',
    'f92', 'f93', 'f95', 'f96', 'f98', 'f99',
    'f103', 'f107', 'f108',
    'f112', 'f117', 'f119',
    'f123', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130',
    'f134', 'f136',
    'f141', 'f143', 'f144', 'f146', 'f147', 'f150',
    'f152', 'f154', 'f156',
    'f161', 'f162', 'f163', 'f164', 'f169', 'f170',
    'f179',
    'f181', 'f182', 'f183', 'f184', 'f187', 'f188', 'f189',
    'f191', 'f192', 'f193', 'f195', 'f199', 'f200',
    'f201', 'f206', 'f208', 'f210',
    'f211', 'f213', 'f214',
    'f222', 'f224', 'f225', 'f226', 'f227', 'f229', 'f230',
    'f231', 'f232', 'f239', 'f240',
    'f241', 'f243', 'f247',
    'f252', 'f254', 'f255', 'f256', 'f258', 'f259', 'f260',
    'f261', 'f265', 'f266', 'f267', 'f269',
    'f271', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280',
    'f281', 'f282', 'f283', 'f284',
]

# Selected features that were treated as categorical and left unscaled in the
# training frame; every other selected feature was standardised there.
submission_cat_features = [
    'f22', 'f43',
    'f243', 'f247',
    'f252', 'f254', 'f255', 'f256', 'f258', 'f259', 'f260',
    'f261', 'f265', 'f266', 'f267', 'f269',
    'f271', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280',
    'f281', 'f282', 'f283', 'f284',
]
submission_num_features = [
    name for name in submission_features if name not in submission_cat_features
]

# .copy() so the assignment below works on an independent frame.
test_df = test_data[submission_features].copy()

# The model was fitted on standardised numeric features, so raw test values
# would be compared against thresholds learned on a different scale.
# NOTE(review): the training mean/std are not kept anywhere in the notebook, so
# the test set is standardised with its own statistics here. Preferably fit a
# scaler on the training data and reuse it for the test data.
test_df[submission_num_features] = preprocessing.scale(test_df[submission_num_features])

submission = pd.DataFrame({"id": test_data["id"], "target": rnd_clf.predict(test_df)})
submission.to_csv("submission.csv", index = False)