In [112]:
# import all of the libraries
import pandas as pd
import numpy as np
import pickle

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

pd.options.display.max_seq_items = 2000

In [113]:
Location = r"..\data\SHData_v3.csv"
df = pd.read_csv(Location)
print(df.shape)
print(df.columns)
df.sample(5)

(4031, 29)
Index(['StudentId', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',
       'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19',
       'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'House'],
      dtype='object')


Unnamed: 0,StudentId,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C19,C20,C21,C22,C23,C24,C25,C26,C27,House
2380,2381,A,A,A,D,A,C,A,A,A,...,A,A,A,B,A,A,A,A,C,Ravenclaw
2548,2549,A,E,A,A,A,A,B,A,A,...,A,D,A,C,A,A,B,A,A,Ravenclaw
3920,3921,A,A,E,A,C,A,A,A,A,...,B,A,A,A,B,A,C,A,A,Slytherin
717,718,A,C,A,A,A,A,A,B,A,...,A,A,C,A,B,A,A,C,A,Gryffindor
2149,2150,A,A,A,C,E,A,A,A,A,...,A,D,A,A,C,A,A,B,A,Ravenclaw


### convert categorical features into 0/1 features (one-hot encoding)

In [114]:
# generate binary values using get_dummies (one-hot encoding)
col_list = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10', \
 'C11','C12','C13','C14','C15','C16','C17','C18','C19','C20',\
 'C21','C22','C23','C24','C25','C26','C27']
df_d = pd.get_dummies(df, columns = col_list )

In [115]:
pickle.dump( df_d, open( "C:\_2_Python\Code\SortingHat\Data\df_d.pickle", "wb" ) )

In [116]:
#reloading the pickled data frame
df_d = pickle.load( open( "C:\_2_Python\Code\SortingHat\Data\df_d.pickle", "rb" ) )

In [117]:
df_d.columns

Index(['StudentId', 'House', 'C1_A', 'C1_B', 'C1_C', 'C1_D', 'C1_E', 'C2_A',
       'C2_B', 'C2_C', 'C2_D', 'C2_E', 'C3_A', 'C3_B', 'C3_C', 'C3_D', 'C3_E',
       'C4_A', 'C4_B', 'C4_C', 'C4_D', 'C4_E', 'C5_A', 'C5_B', 'C5_C', 'C5_D',
       'C5_E', 'C6_A', 'C6_B', 'C6_C', 'C6_D', 'C6_E', 'C7_A', 'C7_B', 'C7_C',
       'C7_D', 'C7_E', 'C8_A', 'C8_B', 'C8_C', 'C8_D', 'C8_E', 'C9_A', 'C9_B',
       'C9_C', 'C9_D', 'C9_E', 'C10_A', 'C10_B', 'C10_C', 'C10_D', 'C10_E',
       'C10_F', 'C10_G', 'C11_A', 'C11_B', 'C11_C', 'C11_D', 'C11_E', 'C11_F',
       'C11_G', 'C12_A', 'C12_B', 'C12_C', 'C12_D', 'C12_E', 'C12_F', 'C13_A',
       'C13_B', 'C13_C', 'C13_D', 'C13_E', 'C13_F', 'C13_G', 'C14_A', 'C14_B',
       'C14_C', 'C14_D', 'C14_E', 'C14_F', 'C14_G', 'C14_H', 'C15_A', 'C15_B',
       'C15_C', 'C15_D', 'C15_E', 'C15_F', 'C15_G', 'C15_H', 'C16_A', 'C16_B',
       'C16_C', 'C16_D', 'C16_E', 'C17_A', 'C17_B', 'C17_C', 'C17_D', 'C17_E',
       'C18_A', 'C18_B', 'C18_C', 'C18_D', 'C18_E', 'C19_

In [118]:
print(df_d.shape)
df_d.sample(5)

(4031, 138)


Unnamed: 0,StudentId,House,C1_A,C1_B,C1_C,C1_D,C1_E,C2_A,C2_B,C2_C,...,C24_C,C25_A,C25_B,C25_C,C26_A,C26_B,C26_C,C27_A,C27_B,C27_C
3610,3611,Slytherin,1,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
1695,1696,Hufflepuff,1,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,1
2013,2014,Ravenclaw,1,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2304,2305,Ravenclaw,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1131,1132,Hufflepuff,0,0,1,0,0,1,0,0,...,1,1,0,0,0,1,0,1,0,0


In [119]:
#checking the House values
df_d['House'].unique()

array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'], dtype=object)

In [120]:
#turning house names into numerical values
df_d.loc[df_d['House']=='Gryffindor','House'] = 0
df_d.loc[df_d['House']=='Hufflepuff','House'] = 1
df_d.loc[df_d['House']=='Ravenclaw','House'] = 2
df_d.loc[df_d['House']=='Slytherin','House'] = 3
df_d['House']=df_d['House'].astype('uint8')

In [121]:
df_d.head(5)

Unnamed: 0,StudentId,House,C1_A,C1_B,C1_C,C1_D,C1_E,C2_A,C2_B,C2_C,...,C24_C,C25_A,C25_B,C25_C,C26_A,C26_B,C26_C,C27_A,C27_B,C27_C
0,1,0,1,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
1,2,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,3,0,1,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,4,0,1,0,0,0,0,1,0,0,...,1,1,0,0,0,0,1,1,0,0
4,5,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [122]:
# makes a list of all the columns (features) that we are going to use for training
flist = ['C1_A','C1_B','C1_C','C1_D','C1_E','C2_A','C2_B','C2_C','C2_D','C2_E',
         'C3_A','C3_B','C3_C','C3_D','C3_E','C4_A','C4_B','C4_C','C4_D','C4_E',
         'C5_A','C5_B','C5_C','C5_D','C5_E','C6_A','C6_B','C6_C','C6_D','C6_E',
         'C7_A','C7_B','C7_C','C7_D','C7_E','C8_A','C8_B','C8_C','C8_D','C8_E',
         'C9_A','C9_B','C9_C','C9_D','C9_E','C10_A','C10_B','C10_C','C10_D','C10_E','C10_F','C10_G',
         'C11_A','C11_B','C11_C','C11_D','C11_E','C11_F','C11_G','C12_A','C12_B','C12_C','C12_D','C12_E','C12_F',
         'C13_A','C13_B','C13_C','C13_D','C13_E','C13_F','C13_G',
         'C14_A','C14_B','C14_C','C14_D','C14_E','C14_F','C14_G','C14_H',
         'C15_A','C15_B','C15_C','C15_D','C15_E','C15_F','C15_G','C15_H',
         'C16_A','C16_B','C16_C','C16_D','C16_E','C17_A','C17_B','C17_C','C17_D','C17_E',
         'C18_A','C18_B','C18_C','C18_D','C18_E','C19_A','C19_B','C19_C','C19_D','C19_E',
         'C20_A','C20_B','C20_C','C20_D','C20_E','C21_A','C21_B','C21_C','C21_D','C21_E',
         'C22_A','C22_B','C22_C','C23_A','C23_B','C23_C','C24_A','C24_B','C24_C','C25_A','C25_B','C25_C',
         'C26_A','C26_B','C26_C','C27_A','C27_B','C27_C']

In [123]:
X = df_d[flist]     

# "house" is already a column and we're singling it out and tells the model to use it as a label for the result
y = df_d['House']   

#split the data into 80% training data, 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [124]:
# make model
xgb_model = xgb.XGBClassifier(disable_default_ava1_metric=1
                             , booster = 'gbtree'
                             , objective = 'reg:logistic'
                             , random_state = 42
                             , max_depth = 20
                             , learning_rate = 0.2
                             , eval_metric=["auc"]
                             , use_label_encoder=False)

In [125]:
# trains the classifier with the training data
xgb_model.fit(X_train, y_train)

Parameters: { disable_default_ava1_metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1,
       disable_default_ava1_metric=1, eval_metric=['auc'], gamma=0,
       gpu_id=-1, importance_type='gain', interaction_constraints='',
       learning_rate=0.2, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=8, num_parallel_tree=1,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1,
       tree_method='exact', use_label_encoder=False, validate_parameters=1,
       verbosity=None)

### train data

In [126]:
# generates its predictions
y_pred = xgb_model.predict(X_train)

In [127]:
y_train.head(5)

1164    1
2553    2
2667    2
3299    3
1738    1
Name: House, dtype: uint8

In [128]:
y_pred[0:5]

array([1, 2, 2, 3, 1], dtype=int64)

In [129]:
# ideally, only the diagonal would have nonzero numbers, but this isn't bad
print(confusion_matrix(y_train, y_pred))

[[731   0   0   0]
 [  0 718   0   0]
 [  0   0 999   0]
 [  0   0   0 776]]


In [130]:
# basically an accuracy report
print(classification_report(y_train, y_pred)) 

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       731
          1       1.00      1.00      1.00       718
          2       1.00      1.00      1.00       999
          3       1.00      1.00      1.00       776

avg / total       1.00      1.00      1.00      3224



### test data

In [131]:
y_pred = xgb_model.predict(X_test)

In [132]:
print(confusion_matrix(y_test, y_pred))

[[164  10  21   6]
 [  6 168  13   5]
 [  6  17 194  12]
 [ 12  13  13 147]]


In [133]:
print(classification_report(y_test, y_pred)) 

             precision    recall  f1-score   support

          0       0.87      0.82      0.84       201
          1       0.81      0.88      0.84       192
          2       0.80      0.85      0.83       229
          3       0.86      0.79      0.83       185

avg / total       0.84      0.83      0.83       807

