### References:

1. Abhishek Thakur's book, [Approaching (Almost) Any Machine Learning Problem](https://github.com/abhi1thakur/approachingalmost)

2. Gunes Evitan - [Advanced Feature Engineering Tutorial](https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables into DataFrames.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Inspect the dtype of every column in the test data.
test.dtypes

In [None]:
# Count how many training rows belong to each target class.
train['target'].value_counts()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Compare the (rows, columns) shapes of the training and test data.
train.shape,test.shape

# Baseline Model

## Creating Stratified K folds:

In [None]:
def create_folds(data, num_splits):
    """Assign every row of ``data`` to one of ``num_splits`` stratified folds.

    A ``kfolds`` column holding the fold number (0 .. num_splits - 1) is
    added to ``data`` in place, a per-fold row count is printed, and the
    same DataFrame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``target`` column; folds are stratified on it.
    num_splits : int
        Number of folds to create.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself (mutated), now carrying the ``kfolds`` column.
    """
    sk = StratifiedKFold(n_splits=num_splits, random_state=42, shuffle=True)

    # split() yields *positional* row indices, so collect the fold ids in a
    # positional array rather than writing them through label-based .loc,
    # which would mislabel rows whenever the index is not 0..n-1.
    fold_ids = np.full(len(data), -1, dtype=np.int64)
    splits = sk.split(X=data.drop('target', axis=1), y=data.target.values)
    for fold_number, (_, valid_positions) in enumerate(splits):
        fold_ids[valid_positions] = fold_number
    data['kfolds'] = fold_ids

    print(f"Summary of the stratified folds\n:{data['kfolds'].value_counts()}")
    return data

In [None]:
# Split the training rows into 5 stratified folds. create_folds adds the
# 'kfolds' column to `train` in place and returns that same DataFrame.
folds_data=create_folds(train,5)

In [None]:
# Replace the target labels with the integer codes learned by the encoder.
# `le` is kept so the codes can be mapped back to the original labels.
le = LabelEncoder()
folds_data['target'] = le.fit_transform(folds_data['target'])

In [None]:
# Preview the first rows of the test data again before modelling.
test.head()

In [None]:
# NOTE(review): N is not referenced by any cell visible here; kept in case a
# later cell uses it.
N = 4

# Random forest shared by every fold run (it is re-fitted on each call to
# clf.fit). oob_score=True makes the out-of-bag accuracy available as
# clf.oob_score_ after fitting.
clf = RandomForestClassifier(
    criterion='gini',
    max_depth=7,
    n_estimators=2000,
    min_samples_split=5,
    # 'auto' meant sqrt(n_features) for classifiers; it was deprecated and
    # later removed from scikit-learn, so spell the same setting explicitly.
    max_features='sqrt',
    n_jobs=-1,
    oob_score=True,
    random_state=40,
)

In [None]:
def run(fold):
    """Fit the shared classifier for one cross-validation fold.

    Uses the notebook globals ``folds_data`` (training rows with ``target``
    and ``kfolds`` columns), ``test`` (test rows with an ``id`` column) and
    ``clf`` (the classifier, re-fitted on every call).

    The model is trained on every row whose ``kfolds`` value differs from
    ``fold`` and validated on the rows of ``fold`` itself. The validation
    log loss and the classifier's out-of-bag score are printed.

    Parameters
    ----------
    fold : int
        Number of the fold to hold out for validation.

    Returns
    -------
    prob : pandas.DataFrame
        Predicted class probabilities for every test row, one column per
        class (``Class_1`` .. ``Class_4``).
    feature_imp : pandas.DataFrame
        Feature importances of the fitted model, indexed by feature name,
        in a single column named ``fold_<fold>``.
    """
    class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']

    # Hold out the requested fold for validation and train on the remaining
    # folds (the original had these two selections swapped).
    held_out = folds_data['kfolds'] == fold
    train_df = folds_data[~held_out].reset_index(drop=True)
    valid_df = folds_data[held_out].reset_index(drop=True)

    feats = [f for f in train_df.columns if f not in ['target', 'kfolds', 'id']]

    clf.fit(train_df[feats], train_df['target'])

    valid_preds = clf.predict_proba(valid_df[feats])
    loss = log_loss(valid_df['target'], valid_preds)
    print(f'Log loss:{np.mean(loss)}')

    # Predict the test set once and select columns by `feats` so they match
    # the training features in name and order.
    test_preds = clf.predict_proba(test[feats])
    prob = pd.DataFrame(test_preds, columns=class_columns)

    feature_imp = pd.DataFrame(
        {f'fold_{fold}': clf.feature_importances_},
        index=pd.Index(feats),
    )
    print(f'OOB Score:{clf.oob_score_}')
    return prob, feature_imp
    

In [None]:
# Fold 0: fit the model and keep its test probabilities and feature importances.
prob_1,feature_imp_1=run(0)

In [None]:
# Fold 1: fit the model and keep its test probabilities and feature importances.
prob_2,feature_imp_2=run(1)

In [None]:
# Fold 2: fit the model and keep its test probabilities and feature importances.
prob_3,feature_imp_3=run(2)

In [None]:
# Fold 3: fit the model and keep its test probabilities and feature importances.
prob_4,feature_imp_4=run(3)

In [None]:
# Fold 4: fit the model and keep its test probabilities and feature importances.
prob_5,feature_imp_5=run(4)

In [None]:
# Stack the per-fold test predictions and average rows that share an index
# label, giving one mean probability per test row and class.
fold_probs = [prob_1, prob_2, prob_3, prob_4, prob_5]
prob = pd.concat(fold_probs, axis=0).groupby(level=0).mean()

# Place the per-fold feature importances side by side, one column per fold.
fold_importances = [
    feature_imp_1,
    feature_imp_2,
    feature_imp_3,
    feature_imp_4,
    feature_imp_5,
]
feature_importance = pd.concat(fold_importances, axis=1)

In [None]:
# Average each feature's importance across the fold columns, then rank the
# features from most to least important and display the table.
feature_importance['fold_avg'] = feature_importance.mean(axis=1)
feature_importance = feature_importance.sort_values(by='fold_avg', ascending=False)
feature_importance

In [None]:
# Build the submission table: the test ids followed by the fold-averaged
# class probabilities, matched row by row on the index. It is constructed
# directly rather than by enlarging an empty DataFrame through .loc with
# new column labels, which depends on pandas' setting-with-enlargement rules.
sample_submission = test[['id']].join(
    prob[['Class_1', 'Class_2', 'Class_3', 'Class_4']]
)

In [None]:
# Preview the first rows of the submission table.
sample_submission.head()

In [None]:
# Write the submission to the working directory, omitting the index column.
sample_submission.to_csv('sample_submission.csv',index=False)