In [1]:
#!pip install

In [2]:
import pandas as pd
import numpy as np

#
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#

import seaborn as sns
import plotly.express as px

#

import os
import random
import re
import math
import time

from tqdm import tqdm
from tqdm.keras import TqdmCallback

#from pandas_summary import DataFrameSummary
#from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

import warnings
from missingpy import MissForest
from PIL import Image
warnings.filterwarnings('ignore') 
%matplotlib notebook
%matplotlib inline

In [3]:
seed_val = 101
random.seed(seed_val)
np.random.seed(seed_val)
# Setting color palette.
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]

# Setting plot styling.
plt.style.use('ggplot')

In [4]:
# df = pd.read_csv('./full_data.csv', index_col = 0).rename(columns = {'duplicated': 'duplicate', 'class':'label'})
# color_df = pd.read_csv('./full_data_with_color_data.csv')
# df = df.merge(color_df[['image_id', 'reds', 'greens', 'blues', 'mean_colors']], on='image_id', how="inner")
# df.head()

###Load Non-Image Metadata

In [5]:
df = pd.read_csv('./full_data_v2.csv', index_col = 0).rename(columns = {'duplicated': 'duplicate', 'class':'label'})
color_df = pd.read_csv('./full_data_with_color_data.csv')
df = df.merge(color_df[['image_id', 'reds', 'greens', 'blues', 'mean_colors']], on='image_id', how="inner")
df = df[df['duplicate'] == False]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62360 entries, 1182 to 63541
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   image_id      62360 non-null  object 
 1   diagnosis     62360 non-null  object 
 2   age           62360 non-null  float64
 3   sex           62360 non-null  object 
 4   localization  62360 non-null  object 
 5   source        62360 non-null  object 
 6   severity      62360 non-null  object 
 7   path          62360 non-null  object 
 8   label         62360 non-null  object 
 9   duplicate     62360 non-null  bool   
 10  dataset       62360 non-null  object 
 11  split_1       57869 non-null  object 
 12  split_2       36695 non-null  object 
 13  split_3       18000 non-null  object 
 14  label_1       62360 non-null  object 
 15  label_2       61785 non-null  object 
 16  label_3       61785 non-null  object 
 17  split_4       35236 non-null  object 
 18  split_5       34661 non

In [6]:
for label in ['label_1','label_2','label_3']:
    print(df[label].value_counts(normalize = True))

Unclassified                                          0.516661
Benign Marking or Mole                                0.286754
Non-Cancerous Skin Condition                          0.074343
Toxin, Fungal, Bug, Viral, or Bacterial Infections    0.064448
Potentially Malignant Skin Tumors                     0.057793
Name: label_1, dtype: float64
Unclassified                                          0.521470
Benign Marking or Mole                                0.289423
Non-Cancerous Skin Condition                          0.075034
Toxin, Fungal, Bug, Viral, or Bacterial Infections    0.065048
Potentially Malignant Skin Tumors                     0.049025
Name: label_2, dtype: float64
Unclassified                         0.521470
Benign Marking or Mole               0.289423
Potentially Malignant Skin Tumors    0.114073
Non-Cancerous Skin Condition         0.075034
Name: label_3, dtype: float64


In [7]:
df2=df.replace('unknown',np.NaN).replace(0.0, np.NaN).drop(['duplicate', 'source', 'dataset', 'image_id',
                                                           'label', 'label_1', 'label_2', 'split_1', 'split_2',
                                                           'split_3', 'split_4', 'split_5', 'split_6',
                                                            'severity'], axis=1).rename(columns = {'label_3': 'label'})
df2['diagnosis'].fillna('unknown', inplace=True)
df2.head()

Unnamed: 0,diagnosis,age,sex,localization,path,label,reds,greens,blues,mean_colors
1182,benign keratosis-like lesions,80.0,male,scalp,./Data/ISIC_2018/Train/HAM10000_images_part_1_...,Benign Marking or Mole,208.736267,162.703426,181.659333,184.366342
1183,benign keratosis-like lesions,80.0,male,scalp,./Data/ISIC_2018/Train/HAM10000_images_part_1_...,Benign Marking or Mole,197.138056,156.542415,177.196333,176.958935
1184,benign keratosis-like lesions,80.0,male,scalp,./Data/ISIC_2018/Train/HAM10000_images_part_1_...,Benign Marking or Mole,214.053785,156.414959,174.070967,181.513237
1185,benign keratosis-like lesions,80.0,male,scalp,./Data/ISIC_2018/Train/HAM10000_images_part_1_...,Benign Marking or Mole,195.708563,142.608015,157.175893,165.164157
1186,benign keratosis-like lesions,75.0,male,ear,./Data/ISIC_2018/Train/HAM10000_images_part_1_...,Benign Marking or Mole,211.709311,167.980289,185.249274,188.312958


In [8]:
df2.label.value_counts(normalize = True)

Unclassified                         0.521470
Benign Marking or Mole               0.289423
Potentially Malignant Skin Tumors    0.114073
Non-Cancerous Skin Condition         0.075034
Name: label, dtype: float64

#Impute Missing Updated

In [9]:
# df2['anatomy_impute_mode']=df2['localization'].fillna(df2['localization'].mode()[0])
#df2['anatomy_impute_mode'].value_counts(normalize = True, dropna = False)

In [10]:
df2['missing_anatomy'] = df2['localization'].isna().astype(int)
df2['anatomy_impute_mode']=df2['localization'].fillna(df2['localization'].mode()[0])
df2['missing_sex'] = df2['sex'].isna().astype(int)
df2['sex_impute_mode']=df2['sex'].fillna(df2['sex'].mode()[0])
df2['missing_age'] = df2['age'].isna().astype(int)
df2['age_impute_mode']=df2['age'].fillna(df2['age'].mode()[0])
df2['age_impute_median']=df2['age'].fillna(df2['age'].median())
df2['age_impute_mean']=df2['age'].fillna(df2['age'].mean())
df2['age_impute_max']=df2['age'].fillna(df2['age'].max())
df2['age_impute_min']=df2['age'].fillna(df2['age'].min())

In [11]:
### impute by research
###: research paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6064677/ 
###: research paper: https://www.cancer.org/cancer/melanoma-skin-cancer/about/key-statistics.html
# df2['age_impute_research'] = df2['age']
# df2.loc[(df2['label'] == 'Potentially Malignant Skin Tumors') & (df2['age_impute_research'].isna()), 'age_impute_research'] = 65
# df2.loc[(df2['label'] == 'Benign Marking or Mole') & (df2['age_impute_research'].isna()), 'age_impute_research'] = 85
# df2.loc[(df2['label'] == 'Non-Cancerous Skin Condition') & (df2['age_impute_research'].isna()), 'age_impute_research'] = 60
# df2.loc[(df2['label'] == 'Toxin, Fungal, Bug, Viral, or Bacterial Infections') & (df2['age_impute_research'].isna()), 'age_impute_research'] = 80
# df2['age_impute_research'] = df2['age_impute_research'].fillna(df2['age'].mean())
# df2['age_impute_research'] = df2['age_impute_research'].astype(float)

# df2['sex_impute_research'] = df2['sex']
# df2.loc[(df2['label'] == 'Potentially Malignant Skin Tumors') & (df2['sex_impute_research'].isna()), 'sex_impute_research'] = 'female'
# df2.loc[(df2['label'] == 'Benign Marking or Mole') & (df2['sex_impute_research'].isna()), 'sex_impute_research'] = 'male'
# df2.loc[(df2['label'] == 'Non-Cancerous Skin Condition') & (df2['sex_impute_research'].isna()), 'sex_impute_research'] = 'male'
# df2.loc[(df2['label'] == 'Toxin, Fungal, Bug, Viral, or Bacterial Infections') & (df2['sex_impute_research'].isna()), 'sex_impute_research'] = 'female'
# df2['sex_impute_research'] = df2['sex_impute_research'].fillna(df2['sex'].mode()[0])
#df2.age_impute_research.value_counts(dropna=False)

In [12]:
df3 = df2.drop(['path', 'diagnosis'], axis =1)

In [13]:
dict_of_dfs = {}
for col in ['sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode', 'label']:
    df = pd.DataFrame()
    df3[col] = df3[col].astype('category')
    df = dict(enumerate(df3[col].cat.categories))
    dict_of_dfs[col] = df 
    df3[col + '_cat'] = df3[col].cat.codes.replace(-1,np.NaN).astype('category')
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62360 entries, 1182 to 63541
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      42975 non-null  float64 
 1   sex                      43019 non-null  category
 2   localization             42380 non-null  category
 3   label                    61785 non-null  category
 4   reds                     62360 non-null  float64 
 5   greens                   62360 non-null  float64 
 6   blues                    62360 non-null  float64 
 7   mean_colors              62360 non-null  float64 
 8   missing_anatomy          62360 non-null  int64   
 9   anatomy_impute_mode      62360 non-null  category
 10  missing_sex              62360 non-null  int64   
 11  sex_impute_mode          62360 non-null  category
 12  missing_age              62360 non-null  int64   
 13  age_impute_mode          62360 non-null  float64 
 14  age

In [14]:
imputer = MissForest()
#label = df3.label
df3_X = df3.drop(['label', 'sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode'], axis = 1)
cat_cols = [df3_X.columns.get_loc(col) for col in df3_X.select_dtypes(['category']).columns.tolist()]
df3_X_imputed = imputer.fit_transform(df3_X, cat_vars=cat_cols)
df3_X_imputed = pd.DataFrame(df3_X_imputed, columns=df3_X.columns.tolist()).rename(columns={'age': 'age_impute_mf'})
#df3_imputed = pd.concat([df3_X_imputed, label], axis = 1)
#df3_imputed = pd.merge(df3_X_imputed, label, left_index=True, right_index=True)
df3_imputed = df3_X_imputed
df3_imputed.head()

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5


Unnamed: 0,age_impute_mf,reds,greens,blues,mean_colors,missing_anatomy,missing_sex,missing_age,age_impute_mode,age_impute_median,age_impute_mean,age_impute_max,age_impute_min,sex_cat,localization_cat,sex_impute_mode_cat,anatomy_impute_mode_cat,label_cat
0,80.0,208.736267,162.703426,181.659333,184.366342,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,1.0,14.0,1.0,14.0,0.0
1,80.0,197.138056,156.542415,177.196333,176.958935,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,1.0,14.0,1.0,14.0,0.0
2,80.0,214.053785,156.414959,174.070967,181.513237,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,1.0,14.0,1.0,14.0,0.0
3,80.0,195.708563,142.608015,157.175893,165.164157,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,1.0,14.0,1.0,14.0,0.0
4,75.0,211.709311,167.980289,185.249274,188.312958,0.0,0.0,0.0,75.0,75.0,75.0,75.0,75.0,1.0,4.0,1.0,4.0,0.0


In [15]:
cat_cols = df3_imputed.loc[:, df3_imputed.columns.str.endswith("_cat")].columns.to_list()
for col in cat_cols:
    df3_X_imputed[col] = df3_X_imputed[col].astype('int')
df3_X_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62360 entries, 0 to 62359
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age_impute_mf            62360 non-null  float64
 1   reds                     62360 non-null  float64
 2   greens                   62360 non-null  float64
 3   blues                    62360 non-null  float64
 4   mean_colors              62360 non-null  float64
 5   missing_anatomy          62360 non-null  float64
 6   missing_sex              62360 non-null  float64
 7   missing_age              62360 non-null  float64
 8   age_impute_mode          62360 non-null  float64
 9   age_impute_median        62360 non-null  float64
 10  age_impute_mean          62360 non-null  float64
 11  age_impute_max           62360 non-null  float64
 12  age_impute_min           62360 non-null  float64
 13  sex_cat                  62360 non-null  int64  
 14  localization_cat      

In [16]:
df3_imputed2 = df3_imputed
for d in list(dict_of_dfs.keys()):
    if d in ['sex_impute_mode', 'anatomy_impute_mode', 'label']:
        df3_imputed2[d] = df3_imputed[d + '_cat'].map(dict_of_dfs[d])
        df3_imputed2.drop([d + '_cat'], axis = 1, inplace = True)
    else:
        df3_imputed2[d + '_impute_mf'] = df3_imputed[d + '_cat'].map(dict_of_dfs[d])
        df3_imputed2.drop([d + '_cat'], axis = 1, inplace = True)
df3_imputed2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62360 entries, 0 to 62359
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age_impute_mf           62360 non-null  float64
 1   reds                    62360 non-null  float64
 2   greens                  62360 non-null  float64
 3   blues                   62360 non-null  float64
 4   mean_colors             62360 non-null  float64
 5   missing_anatomy         62360 non-null  float64
 6   missing_sex             62360 non-null  float64
 7   missing_age             62360 non-null  float64
 8   age_impute_mode         62360 non-null  float64
 9   age_impute_median       62360 non-null  float64
 10  age_impute_mean         62360 non-null  float64
 11  age_impute_max          62360 non-null  float64
 12  age_impute_min          62360 non-null  float64
 13  sex_impute_mf           62360 non-null  object 
 14  localization_impute_mf  62360 non-null

In [17]:
# cat_cols = [df3_imputed.columns.get_loc(col) for col in df3_X.select_dtypes(['category']).columns.tolist()]
# cat_cols

###Getting ready to model

In [18]:

dummies = pd.get_dummies(df3_imputed2['sex_impute_mf'], prefix='sex_mf')
df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)
dummies = pd.get_dummies(df3_imputed2['localization_impute_mf'], prefix='anatomy_mf')
df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)
dummies = pd.get_dummies(df3_imputed2['sex_impute_mode'], prefix='sex_mode')
df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)
# dummies = pd.get_dummies(df3_imputed2['sex_impute_research'], prefix='sex_research')
# df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)
dummies = pd.get_dummies(df3_imputed2['anatomy_impute_mode'], prefix='anatomy_mode')
df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)
# dropping not useful columns

df3_imputed2.drop(['sex_impute_mf', 'localization_impute_mf', 'sex_impute_mode', 'anatomy_impute_mode'],\
                  axis=1, inplace=True)
df3_imputed2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62360 entries, 0 to 62359
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age_impute_mf                 62360 non-null  float64
 1   reds                          62360 non-null  float64
 2   greens                        62360 non-null  float64
 3   blues                         62360 non-null  float64
 4   mean_colors                   62360 non-null  float64
 5   missing_anatomy               62360 non-null  float64
 6   missing_sex                   62360 non-null  float64
 7   missing_age                   62360 non-null  float64
 8   age_impute_mode               62360 non-null  float64
 9   age_impute_median             62360 non-null  float64
 10  age_impute_mean               62360 non-null  float64
 11  age_impute_max                62360 non-null  float64
 12  age_impute_min                62360 non-null  float64
 13  l

In [19]:
train = df3_imputed2

In [20]:
#create a mapping from labels to a unique integer and vice versa for labelling and prediction later
labels = train['label'].unique()
i = 0
idx2class = {} 
class2idx = {}
for tp in labels:
    idx2class[i] = tp
    class2idx[tp] = i
    i += 1
idx2class

{0: 'Benign Marking or Mole',
 1: 'Potentially Malignant Skin Tumors',
 2: 'Unclassified',
 3: 'Non-Cancerous Skin Condition'}

In [21]:
train['label'] = train['label'].replace(class2idx)
train.head()

Unnamed: 0,age_impute_mf,reds,greens,blues,mean_colors,missing_anatomy,missing_sex,missing_age,age_impute_mode,age_impute_median,...,anatomy_mode_hand,anatomy_mode_head/neck,anatomy_mode_lower extremity,anatomy_mode_neck,anatomy_mode_oral/genital,anatomy_mode_palms/soles,anatomy_mode_scalp,anatomy_mode_torso,anatomy_mode_trunk,anatomy_mode_upper extremity
0,80.0,208.736267,162.703426,181.659333,184.366342,0.0,0.0,0.0,80.0,80.0,...,0,0,0,0,0,0,1,0,0,0
1,80.0,197.138056,156.542415,177.196333,176.958935,0.0,0.0,0.0,80.0,80.0,...,0,0,0,0,0,0,1,0,0,0
2,80.0,214.053785,156.414959,174.070967,181.513237,0.0,0.0,0.0,80.0,80.0,...,0,0,0,0,0,0,1,0,0,0
3,80.0,195.708563,142.608015,157.175893,165.164157,0.0,0.0,0.0,80.0,80.0,...,0,0,0,0,0,0,1,0,0,0
4,75.0,211.709311,167.980289,185.249274,188.312958,0.0,0.0,0.0,75.0,75.0,...,0,0,0,0,0,0,0,0,0,0


###Cross Validate on Accuracy

In [22]:
# loading modelling libraries

import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [23]:
X = train.drop('label', axis=1)
y = train.label

In [24]:
# taking holdout set for validating with stratified y

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

# 5 fold stratify for cv

cv = StratifiedKFold(5, shuffle=True, random_state=42)

training_data = {'X_train':X_train,'Y_train':y_train,
                'X_val': X_test,'Y_val':y_test,
                'X_test': X_test,'Y_test':y_test}

In [25]:
# setting model hyperparameters, didn't include fine tuning here because of timing reasons...

xg = xgb.XGBClassifier(
    n_estimators=750,
    min_child_weight=0.81,
    learning_rate=0.025,
    max_depth=2,
    subsample=0.80,
    colsample_bytree=0.42,
    gamma=0.10,
    random_state=42,
    n_jobs=-1,
)

In [26]:
estimators = [xg]

In [27]:
# cross validation scheme

def model_check(X_train, y_train, estimators, cv):
    model_table = pd.DataFrame()

    row_index = 0
    for est in estimators:

        MLA_name = est.__class__.__name__
        model_table.loc[row_index, 'Model Name'] = MLA_name

        cv_results = cross_validate(est,
                                    X_train,
                                    y_train,
                                    cv=cv,
                                    scoring='accuracy',
                                    return_train_score=True,
                                    n_jobs=-1)

        model_table.loc[row_index,
                        'Train accuracy'] = cv_results['train_score'].mean()
        model_table.loc[row_index,
                        'Validation accuracy'] = cv_results['test_score'].mean()
        model_table.loc[row_index, 'Validation Std'] = cv_results['test_score'].std()
        model_table.loc[row_index, 'Time'] = cv_results['fit_time'].mean()

        row_index += 1

    model_table.sort_values(by=['Validation accuracy'],
                            ascending=False,
                            inplace=True)

    return model_table

In [28]:
# display cv results

raw_models = model_check(X_train, y_train, estimators, cv)
display(raw_models)

KeyboardInterrupt: 

### Result on Holdout Group

In [None]:
# fitting train data

xg.fit(X, y)

# predicting on holdout set
holdout = xg.predict(X_test)

# checking results on test set
accuracy_score(y_test, holdout)

In [None]:
# cm = confusion_matrix(y_test, holdout, labels = xg.classes_)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg.classes_)
# disp.plot()
# plt.show()

### Confusion Matrix

In [None]:
ConfusionMatrixDisplay.from_predictions(holdout, y_test, display_labels=[i for i in idx2class.values()], 
                                        xticks_rotation = "vertical", cmap='Blues' , normalize = "true")
plt.show()

### Learning Curve

In [None]:
# from sklearn.model_selection import learning_curve
# from sklearn.model_selection import ShuffleSplit


# def plot_learning_curve(
#     estimator,
#     title,
#     X,
#     y,
#     axes=None,
#     ylim=None,
#     cv=None,
#     n_jobs=None,
#     scoring=None,
#     train_sizes=np.linspace(0.1, 1.0, 5),
# ):

#     if axes is None:
#         _, axes = plt.subplots(1, 1, figsize=(10, 5))

#     axes.set_title(title)
#     if ylim is not None:
#         axes.set_ylim(*ylim)
#     axes.set_xlabel("Training examples")
#     axes.set_ylabel("Score")

#     train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
#         estimator,
#         X,
#         y,
#         scoring=scoring,
#         cv=cv,
#         n_jobs=n_jobs,
#         train_sizes=train_sizes,
#         return_times=True,
#     )
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)

#     # Plot learning curve
#     axes.grid()
#     axes.fill_between(
#         train_sizes,
#         train_scores_mean - train_scores_std,
#         train_scores_mean + train_scores_std,
#         alpha=0.1,
#         color="r",
#     )
#     axes.fill_between(
#         train_sizes,
#         test_scores_mean - test_scores_std,
#         test_scores_mean + test_scores_std,
#         alpha=0.1,
#         color="g",
#     )
#     axes.plot(
#         train_sizes, train_scores_mean, "o-", color="r", label="Training score"
#     )
#     axes.plot(
#         train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
#     )
#     axes.legend(loc="best")

#     return plt


# fig, axes = plt.subplots(1, 1, figsize=(10, 5))

# X, y = X_train, y_train

# title = "Learning Curves"

# cv = ShuffleSplit(n_splits=20, test_size=0.1, random_state=0)

# plot_learning_curve(xg, title, X, y, axes=axes, ylim=(0.60, 0.80),
#                     cv=cv, n_jobs=4)

# plt.show()

### Gridsearch to finetune model hyperparameters

#### initial model parameters

In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
xgb1 = XGBClassifier(
    n_estimators=750,
    min_child_weight=0.81,
    learning_rate=0.025,
    max_depth=2,
    subsample=0.80,
    colsample_bytree=0.42,
    gamma=0.10,
    random_state=42,
    n_jobs=-1,
    objective='multi:softmax',
    nthread=4
)

In [None]:
def getTrainScores(gs):
    results = {}
    runs = 0
    for x,y in zip(list(gs.cv_results_['mean_test_score']), gs.cv_results_['params']):
        results[runs] = 'mean:' + str(x) + 'params' + str(y)
        runs += 1
    best = {'best_mean': gs.best_score_, "best_param":gs.best_params_}
    return results, best

#### grid search new parameters

In [None]:
# param_test1 = {
#  'max_depth':range(1,5,1),
#  'min_child_weight':np.arange(0.5,3.0,0.5)
# }
# #metrics to consider: f1_micro, f1_macro, roc_auc_ovr
# gsearch1 = GridSearchCV(estimator = xgb1, param_grid = param_test1, scoring='accuracy', n_jobs=-1, verbose = 10, cv=5)
# gsearch1.fit(X_train, y_train)
# getTrainScores(gsearch1)

In [None]:
param_test2 = {
 'max_depth':range(4,8,1),
 'min_child_weight':np.arange(1.2,2.0,0.2)
}
#metrics to consider: f1_micro, f1_macro, roc_auc_ovr
gsearch2 = GridSearchCV(estimator = xgb1, param_grid = param_test2, scoring='accuracy', n_jobs=-1, verbose = 10, cv=5)
gsearch2.fit(X_train, y_train)
getTrainScores(gsearch2)

#### second model

In [None]:
xgb2 = XGBClassifier(
    n_estimators=750,
    min_child_weight=0.81,
    learning_rate=0.025,
    max_depth=2,
    subsample=0.80,
    colsample_bytree=0.42,
    gamma=0.10,
    random_state=42,
    n_jobs=-1,
    objective='multi:softmax',
    nthread=4
)

### Feature Importance

In [None]:
# finding feature importances and creating new dataframe basen on them

feature_importance = xg.get_booster().get_score(importance_type='weight')

keys = list(feature_importance.keys())
values = list(feature_importance.values())

importance = pd.DataFrame(data=values, index=keys,
                          columns=['score']).sort_values(by='score',
                                                         ascending=False)
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(x=importance.score.iloc[:20],
            y=importance.index[:20],
            orient='h',
            palette='Reds_r')
ax.set_title('Feature Importances')
plt.show()