In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set from the Kaggle input directory.
df_train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')

In [None]:
# Display the column labels of the training frame.
df_train.columns

In [None]:
# Remove the 'id' column so it is not carried along with the feature columns.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been dropped no longer raises KeyError.
df_train = df_train.drop(columns='id', errors='ignore')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary statistics for the training frame.
df_train.describe()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

#### *From the dataset `info()` output we can see that all feature values are ***discrete variables*** (indicating they could be ordinal encodings of some categorical data).*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Horizontal box plot of every column except the last one (the last column is
# the one later used as the target), with outlier points shown.
plt.figure(figsize=(15, 15))
plt.boxplot(x=df_train.iloc[:, :-1], vert=False, showfliers=True)
plt.show()

#### *There is a significant number of outliers in the dataset for each feature.*
#### *We will count the values above 1.5 × the 95th percentile of each feature to decide which feature scaler to apply*
#### ( *MinMaxScaler* - significantly affected by outliers, *Robust scaler* - not affected by outliers)

##### **Boolean DataFrame flagging values > 1.5 times the 95th-percentile value of each feature**

In [None]:
# 95th percentile of each feature column.
# Only the feature columns (everything but the last column, which is used as the
# label later on) are included: calling quantile() on the whole frame also
# touches the label column, which fails on pandas versions where numeric_only
# defaults to False if that column is non-numeric.
df_ot3 = df_train.iloc[:, :-1].quantile(0.95)

# Boolean frame: True where a value exceeds 1.5 x its feature's 95th percentile.
df_ot4 = df_train.iloc[:, :-1] > (df_ot3 * 1.5)
df_ot4.shape

In [None]:
# For each of the first 50 columns of the flag frame, print how many rows are
# flagged (True) versus not flagged (False).
for i in range(50):
    flags = df_ot4.iloc[:, i]
    print("Feature \n", i)
    print(flags.value_counts())

#### We can see that the True count for values above *1.5 × the 95th percentile* is more than ***1.5% of the data***. So we will go with ***MinMaxScaler***, which includes the outliers in the scaling process.

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [None]:
# Feature matrix: every column of the training frame except the last one.
X = df_train.drop(columns=df_train.columns[-1])
X.head()

In [None]:
# Target vector: the last column of the training frame.
y = df_train[df_train.columns[-1]]
y

In [None]:
# Encode the target labels as integers; the fitted encoder is kept in Lenc.
Lenc = LabelEncoder()
y = Lenc.fit_transform(y)

In [None]:
# Display the encoded target values.
y

In [None]:
# Fit the min-max scaler on the feature matrix and replace X with its scaled
# version; the fitted scaler stays available as mnmx_scl.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in this notebook, so hold-out rows influence the scaling.
mnmx_scl = MinMaxScaler()
X = mnmx_scl.fit_transform(X)

In [None]:
# Wrap the scaled feature values in a DataFrame and show its summary statistics.
X_df = pd.DataFrame(data=X)
X_df.describe()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 60/40 split of the scaled features and encoded target.
# random_state is fixed so the split (and every score computed from it) is
# reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.4, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#### With Stratified split we are able to extract proportional test data

In [None]:
# Class-count histograms of the train and test targets on one figure.
# dpi was previously assigned but never used; it is now passed to the figure.
dpi = 200
plt.figure(figsize=(10, 10), dpi=dpi)
plt.hist(y_train, label='y train data class count')
plt.hist(y_test, label='y test data class count')
plt.xlabel('Encoded class label')
plt.ylabel('Count')
plt.legend()
plt.show()

### XGBClassifier iter1 - Model train and performance measures

In [None]:
from xgboost import XGBClassifier
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, make_scorer

XGBClassifier() --> GridSearchCV was run in a previous version of this notebook; the best parameters it found were: n_estimators = 200, learning_rate = 0.1

In [None]:
# Iteration 1: XGBoost classifier on all scaled features.
# eval_set and verbose are fit() arguments, not estimator parameters, so they
# are passed to fit() below instead of the constructor. eval_set must be a list
# of (X, y) tuples.
xgb_clf = XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    use_label_encoder=False,
    objective='multi:softmax',
    eval_metric='mlogloss',
)

# The hold-out split is only monitored with the eval metric here (no early
# stopping is requested); verbose=False suppresses the per-round log lines.
xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [None]:
# Class predictions and per-class probabilities of the iteration-1 model on the
# hold-out split.
y_pred_xgb = xgb_clf.predict(X_test)
y_pred_xgb_pr = xgb_clf.predict_proba(X_test)

In [None]:
# Accuracy of the iteration-1 model on the hold-out split.
acc_scr_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
acc_scr_xgb

In [None]:
# One-vs-rest ROC AUC of the iteration-1 model, from predicted probabilities.
auc_score_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_xgb_pr, multi_class='ovr')
auc_score_xgb

In [None]:
# Per-class precision / recall / F1 report for the iteration-1 model.
clf_xgb = classification_report(y_true=y_test, y_pred=y_pred_xgb)
print(clf_xgb)

In [None]:
# Confusion matrix of the iteration-1 predictions on the hold-out split.
cfm_xgb = confusion_matrix(y_true=y_test, y_pred=y_pred_xgb)

In [None]:
# Heatmap of the confusion matrix.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts into scientific notation.
plt.figure(figsize=(8, 6))
sns.heatmap(cfm_xgb, annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")

### XGBClassifier iter2 - with PCA feature reduction technique - Performance measures

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Candidate numbers of PCA components to evaluate.
n_comp = np.array([25, 30, 35, 40])

# One score slot per candidate. The buffers were previously allocated with five
# slots for four candidates, leaving a stray trailing zero.
acc_scr_xgb2 = np.zeros(len(n_comp))
auc_score_xgb2 = np.zeros(len(n_comp))

In [None]:
# Classifier reused for every PCA component count; same settings as iteration 1.
# The stray verbose=None keyword (not an XGBClassifier parameter) is removed.
xgb_clf2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    use_label_encoder=False,
    objective='multi:softmax',
    eval_metric='mlogloss',
)

In [None]:
# For each candidate component count: project the scaled features with PCA,
# split, refit the classifier and record accuracy and one-vs-rest ROC AUC.
for i, j in enumerate(n_comp):
    pca = PCA(n_components=j)
    pca.fit(X)
    X_pca = pca.transform(X)
    # Loop-local target names are used on purpose: unpacking into y_train /
    # y_test here would overwrite the notebook-level split with a different
    # shuffle, leaving X_train / X_test paired with the wrong labels in every
    # later cell. A fixed random_state also gives each component count the
    # same rows, so the scores are comparable.
    X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(
        X_pca, y, test_size=0.4, stratify=y, random_state=42
    )
    xgb_clf2.fit(X_pca_train, y_pca_train)
    y_pred_xgb_pr2 = xgb_clf2.predict_proba(X_pca_test)
    y_pred_xgb2 = xgb_clf2.predict(X_pca_test)
    acc_scr_xgb2[i] = accuracy_score(y_pca_test, y_pred_xgb2)
    print("Iteration", i , "accuracy_score =,",  acc_scr_xgb2[i])
    auc_score_xgb2[i] = roc_auc_score(y_pca_test, y_pred_xgb_pr2, multi_class = 'ovr')
    print("Iteration", i , "roc_auc_score =,",  auc_score_xgb2[i])

In [None]:
# Copy at most the first four recorded scores into fresh four-slot arrays;
# slots without a recorded score stay at zero.
auc_scr_list = np.zeros(4)
acc_scr_list = np.zeros(4)

n_copied = min(4, len(auc_score_xgb2))
auc_scr_list[:n_copied] = auc_score_xgb2[:n_copied]
acc_scr_list[:n_copied] = acc_scr_xgb2[:n_copied]

In [None]:
# Accuracy and ROC AUC against the number of PCA components.
# The figure size (and the previously unused dpi) are set when the axes are
# created: calling plt.figure() after plt.subplots() opened a second, empty
# figure and left the plotted one at its default size.
dpi = 200
fig, ax = plt.subplots(figsize=(10, 10), dpi=dpi)
ax.plot(n_comp, np.array(acc_scr_list), label='Accuracy ')
ax.plot(n_comp, np.array(auc_scr_list), label='ROC auc score')
ax.set_xlabel('PCA reduced Features')
ax.set_ylabel('score')
ax.set_title("PCA num of Features vs scores")
ax.legend()
plt.show()

##### - Having checked PCA with several component counts, *35 reduced features* gave the *highest auc_score (55.94%) and accuracy (57.5%)*
##### - However, these values are lower than the scores achieved with the full set of features in the dataset (without PCA)

In [None]:
# Feature importance scores reported by the fitted iteration-1 classifier.
feat_importances = xgb_clf.feature_importances_

In [None]:
# Column labels of the training split, used as the bar labels below.
feat_list = X_train.columns

In [None]:
# Horizontal bar chart of the importance score of each feature, with one tick
# per position 0..49 on the y axis.
plt.figure(figsize=(15, 15))
plt.barh(y=list(feat_list), width=feat_importances)
plt.yticks(list(range(50)))
plt.xlabel("Feature score")
plt.ylabel("Feature list")

#### XGBClassifier feature importances are as follows:
#### ***Feature 2, Feature 13, Feature 32, Feature 44, Feature 29***

### XGBClassifier iter3 - with sample weight distribution impact and performance measures

In [None]:
# Per-class sample weight, keyed by encoded class label: classes 0 and 3 get
# weight 1.5, classes 1 and 2 get weight 1.
sample_wt_dict = { 0 : 1.5 , 1: 1, 2: 1, 3: 1.5}

In [None]:
# Histogram of the encoded training labels.
plt.hist(y_train)

In [None]:
# Print the share of training rows that belongs to each encoded class 0..3.
tot = len(y_train)
for cls in range(4):
    print(f"Class {cls} ratio", len(y_train[y_train == cls]) / tot)

In [None]:
# Look up the weight of every training label, in order; an unknown label still
# raises KeyError.
weight_list = list(map(sample_wt_dict.__getitem__, y_train))
len(weight_list)

In [None]:
# Iteration 3: same model settings as iteration 1, trained with per-row sample
# weights. The stray verbose=None keyword (not an XGBClassifier parameter) is
# removed and eval_metric is set explicitly, matching the other two iterations.
xgb_clf3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    use_label_encoder=False,
    objective='multi:softmax',
    eval_metric='mlogloss',
)

xgb_clf3.fit(X_train, y_train, sample_weight=weight_list)

In [None]:
# Class predictions and per-class probabilities of the weighted model on the
# hold-out split.
y_pred_xgb3 = xgb_clf3.predict(X_test)
y_pred_xgb_pr3 = xgb_clf3.predict_proba(X_test)

In [None]:
# Accuracy of the weighted model on the hold-out split.
acc_scr_xgb3 = accuracy_score(y_true=y_test, y_pred=y_pred_xgb3)
acc_scr_xgb3

In [None]:
# One-vs-rest ROC AUC of the weighted model, from predicted probabilities.
auc_score_xgb3 = roc_auc_score(y_true=y_test, y_score=y_pred_xgb_pr3, multi_class='ovr')
auc_score_xgb3

##### *With the sample weights of **class 0 and class 3 increased up to two times those of class 1 and class 2**, the accuracy remains at the same level as iteration 1 and the roc_auc_score is reduced.*

### Random Forest Classifier - Model train and performance measures

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyperparameter search for the random forest over a small 2 x 2 grid.
random_forest_clf = RandomForestClassifier(random_state=42)

params = {'max_depth': [3, 10], 'n_estimators': [100, 200]}

# The search space has only four combinations, fewer than the default
# n_iter=10, so n_iter is set to the size of the space. random_state makes the
# search repeatable.
n_candidates = len(params['max_depth']) * len(params['n_estimators'])
rand_for_cv = RandomizedSearchCV(
    random_forest_clf, params, n_iter=n_candidates, cv=5, random_state=42
)

rand_for_cv.fit(X_train, y_train)

rand_for_cv.best_params_

##### Applying best params from Random search CV

In [None]:
# Refit a random forest with the parameters selected by the search above.
# They are read from best_params_ rather than hard-coded, so this cell cannot
# drift from what the search actually found.
random_forest_clf_iter = RandomForestClassifier(random_state=42, **rand_for_cv.best_params_)

random_forest_clf_iter.fit(X_train, y_train)

In [None]:
# Per-class probabilities and class predictions of the random forest on the
# hold-out split.
y_pred_rf_pr = random_forest_clf_iter.predict_proba(X_test)
y_pred_rf = random_forest_clf_iter.predict(X_test)

In [None]:
# Accuracy of the random forest on the hold-out split.
acc_scr_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
acc_scr_rf

In [None]:
# One-vs-rest ROC AUC of the random forest, from predicted probabilities.
roc_scor_rf = roc_auc_score(y_true=y_test, y_score=y_pred_rf_pr, multi_class='ovr')
roc_scor_rf

In [None]:
# Per-class precision / recall / F1 report for the random forest.
clf_rep_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(clf_rep_rf)

### Model performance metrics
#### - XGBClassifier iteration 1 ***without class imbalance corrections and PCA*** has a higher score than the *RandomForest Classifier*
#### - The accuracies are almost at the same level, but the roc_auc_score is higher.

### Model explainability with SHAP

In [None]:
import shap

In [None]:
%%time
# Build a SHAP tree explainer for the iteration-1 XGBoost model.
explainer = shap.TreeExplainer(xgb_clf)

# SHAP values for every row of the hold-out split.
# NOTE(review): later cells index shap_values[k] per class, i.e. they assume a
# per-class sequence is returned — confirm for the installed shap version.
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot built from the full shap_values object and the hold-out features.
shap.summary_plot(shap_values, X_test)

#### Above is the SHAP library interpretation of XGBClassifier iteration 1
#####    **We can infer the impact of each feature on the target variable outcome:**
#####    **feature 14 has a high impact, followed by *features 37, 6, 15, 31***

#### ***Class 0*** target variable outcome SHAP summary plot:

In [None]:
# Summary plot for the first entry of shap_values (treated as Class 0 in this notebook).
shap.summary_plot(shap_values[0], X_test)

* From the above summary plot for the Class 0 outcome, it is inferred that:
*   - **feature 25** has a wide impact based on SHAP values (range -0.65 to 0.6), followed by **features 38, 6, 17**
*   - **feature 37** has a positive impact, as some of the instances extend up to 0.7, followed by **features 9, 30, 0, 45, 35**
*   - **feature 14** has a negative impact *(SHAP value: -0.6)*

In [None]:
# Dependence plot for feature index 25 using the first entry of shap_values;
# the interaction feature is chosen automatically by shap.
shap.dependence_plot(25, shap_values[0], X_test, interaction_index='auto')

* - *From the above interaction plot we can infer that **lower values of feature 25 have a higher impact** on the outcome variable, which gradually reduces.*
* - *The impact of feature 4 on feature 25 is uniformly distributed, with some of the high-value points near a SHAP value of -0.2*

#### ***Class 2*** target variable outcome SHAP summary plot:

In [None]:
# Summary plot for the third entry of shap_values (treated as Class 2 in this notebook).
shap.summary_plot(shap_values[2], X_test)

* From the above summary plot for the Class 2 outcome, it is inferred that:
*   - **feature 5** has a wide impact based on SHAP values (range -1 to 0.6), followed by **features 34, 32, 42, 16**
*   - **feature 12** has a positive impact, as some of the instances extend up to 1.1, followed by **features 44, 2, 47, 22**
*   - **features 15 and 38** have a negative impact on this class outcome

In [None]:
# Dependence plot for feature index 5 using the third entry of shap_values;
# the interaction feature is chosen automatically by shap.
shap.dependence_plot(5, shap_values[2], X_test, interaction_index='auto')

* *From the above interaction plot we can infer that **lower values of feature 5 have a wide impact** on the outcome variable, and the impact gradually becomes positive as the feature value increases*
* *The impact of feature 34 on feature 5 is uniformly distributed*

#### *Actual test data file load, predict and file submission*

In [None]:
# Load the competition test set from the Kaggle input directory.
df_test_data = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')

In [None]:
# Preview the first rows of the test frame.
df_test_data.head()

In [None]:
# Remove the 'id' column so only feature columns remain.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been dropped no longer raises KeyError.
df_test_data = df_test_data.drop(columns='id', errors='ignore')

In [None]:
# Scale the test features with the scaler that was already fitted on the
# training features (mnmx_scl). Fitting a fresh scaler on the test set would map
# the test features with different min/max values than the ones the model was
# trained on, and would also overwrite the training scaler.
X_test1 = mnmx_scl.transform(df_test_data)

In [None]:
# Wrap the scaled test features in a DataFrame.
X_test_df = pd.DataFrame(X_test1)

In [None]:
# Summary statistics of the scaled test features.
X_test_df.describe()

In [None]:
# Per-class probabilities from the iteration-1 model for the scaled test rows.
y_pred_test_prob = xgb_clf.predict_proba(X_test_df)
y_pred_test_prob

In [None]:
# Load the sample submission file from the Kaggle input directory.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
# create submission file

class_labels = ["Class_1","Class_2","Class_3","Class_4"]

# The probability columns are assigned to class_labels by position, so check
# that the label encoder used for the target maps 0..3 to exactly these labels
# in this order before writing anything.
assert list(Lenc.classes_) == class_labels, "probability columns do not match class_labels order"

# Replace the placeholder class columns of the sample submission with the
# predicted probabilities. The columns are dropped on a copy rather than in
# place, so sample_submission is not mutated and the cell can be re-run.
proba_frame = pd.DataFrame(data=y_pred_test_prob, columns=class_labels)
submission = sample_submission.drop(columns=class_labels).join(proba_frame)
submission.to_csv("my_submission.csv", index=False)
submission