In [20]:
import pandas as pd
import numpy as np
import pickle as pkl
from scipy import stats

#modeling
from sklearn.model_selection import train_test_split
import xgboost as xgb
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from classification_functions import  conf_matrix, plot_roc
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

#plotting
import matplotlib.pyplot as plt

In [23]:
def XGBoost(X_train, y_train, X_val, y_val, depth, l_rate, subsample, min_weight, col_sample):
    """Fit a multi-class XGBoost classifier and print its macro-F1 on a validation split.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_val, y_val
        Held-out features and labels used only for the printed F1 score.
    depth
        Passed to XGBoost as ``max_depth``.
    l_rate
        Passed as ``learning_rate``.
    subsample
        Passed as ``subsample``.
    min_weight
        Passed as ``min_child_weight``.
    col_sample
        Passed as ``colsample_bytree``.

    Returns
    -------
    XGBClassifier
        The fitted classifier.
    """
    params = {
        'n_estimators': 10000,
        'max_depth': depth,
        'objective': 'multi:softprob',
        # The class count is deliberately not set here: the previous
        # 'num_classes' key is not an XGBoost parameter (the real name is
        # 'num_class') and was silently ignored; XGBClassifier sets
        # 'num_class' itself from the labels seen in fit().
        'learning_rate': l_rate,
        'subsample': subsample,
        'min_child_weight': min_weight,
        'colsample_bytree': col_sample,
        'random_state': 0,
        'verbosity': 0,
        'n_jobs': -1,
    }

    gbm = XGBClassifier(**params)
    gbm.fit(X_train, y_train)

    preds = gbm.predict(X_val)
    macro_f1 = round(f1_score(y_val, preds, average="macro"), 3)

    print(f'XGBoost with params:\n'
          f'max_depth = {depth}\n'
          f'learning_rate = {l_rate}\n'
          f'subsample = {subsample}\n'
          f'min_child_weight = {min_weight}\n'
          f'colsample_bytree = {col_sample}\n'
          '\n'
          f'Has an f1 score of: {macro_f1}'
         )

    return gbm

Load data

In [4]:
# Cleaned survey responses, stored as a pickled DataFrame relative to this notebook.
SURVEY_PICKLE_PATH = '../Data/survey_data_cleaned2.pkl'

df = pd.read_pickle(SURVEY_PICKLE_PATH)

In [5]:
# Columns used as model inputs; everything else in `df` is ignored.
FEATURE_COLUMNS = [
    'database_count', 'Age1stCode', 'YearsCodePro', 'Age', 'YearsCode', 'EdLevel',
    'Ethnicity', 'Gender', 'UndergradMajor', 'Region',
    'Hobbyist', 'back-end', 'full-stack', 'front-end', 'desktop', 'mobile',
    'DevOps', 'Database admin', 'Designer', 'System admin', 'Student',
    'Other Occupation', 'Retired Dev', 'Sometimes Code at Work',
    'JavaScript', 'Python', 'SQL', 'Java', 'HTML/CSS',
]

# Target: the respondent's operating system.
y = df['OpSys']

# Select the feature columns and one-hot encode them in a single step, so `X`
# is only ever bound to the encoded matrix.
X = pd.get_dummies(df[FEATURE_COLUMNS])

### Fix Class Imbalance

Try ADASYN Oversampling

In [24]:
# `fit_resample` is the supported imbalanced-learn API; the old `fit_sample`
# alias was deprecated and later removed.
# NOTE(review): resampling is applied to the full dataset before splitting, so
# synthetic points interpolated from rows that end up in the validation/test
# splits can appear in training. Scores on these splits are likely optimistic;
# consider splitting first and resampling only the training portion.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X, y)

# Two-stage split: 20% test, then 25% of the remaining 80% as validation
# (overall 60% train / 20% validation / 20% test).
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn = train_test_split(
    X_adasyn, y_adasyn, test_size=0.2, random_state=42)
X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(
    X_train_adasyn, y_train_adasyn, test_size=0.25, random_state=42)

xgb_adasyn = XGBoost(X_train_adasyn, y_train_adasyn, X_val_adasyn, y_val_adasyn, 10, 0.3, 0.8, 3, 0.8)



XGBoost with params:
max_depth = 10
learning_rate = 0.3
subsample = 0.8
min_child_weight = 3
colsample_bytree = 0.8

Has an f1 score of: 0.869


Try SMOTE Oversampling

In [None]:
# `fit_resample` is the supported imbalanced-learn API; the old `fit_sample`
# alias was deprecated and later removed.
# NOTE(review): resampling is applied to the full dataset before splitting, so
# synthetic points interpolated from rows that end up in the validation/test
# splits can appear in training. Scores on these splits are likely optimistic;
# consider splitting first and resampling only the training portion.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(X, y)

# Two-stage split: 20% test, then 25% of the remaining 80% as validation
# (overall 60% train / 20% validation / 20% test).
X_train_smoted, X_test_smoted, y_train_smoted, y_test_smoted = train_test_split(
    X_smoted, y_smoted, test_size=0.2, random_state=42)
X_train_smoted, X_val_smoted, y_train_smoted, y_val_smoted = train_test_split(
    X_train_smoted, y_train_smoted, test_size=0.25, random_state=42)

xgb_smoted = XGBoost(X_train_smoted, y_train_smoted, X_val_smoted, y_val_smoted, 10, 0.3, 0.8, 3, 0.8)



Try Undersampling

In [27]:
# `fit_resample` is the supported imbalanced-learn API; the old `fit_sample`
# alias was deprecated and later removed.
X_under, y_under = RandomUnderSampler(random_state=42).fit_resample(X, y)

# Two-stage split: 20% test, then 25% of the remaining 80% as validation
# (overall 60% train / 20% validation / 20% test).
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under, y_under, test_size=0.2, random_state=42)
X_train_under, X_val_under, y_train_under, y_val_under = train_test_split(
    X_train_under, y_train_under, test_size=0.25, random_state=42)

xgb_under = XGBoost(X_train_under, y_train_under, X_val_under, y_val_under, 10, 0.3, 0.8, 3, 0.8)

TypeError: XGBoost() missing 7 required positional arguments: 'X_val', 'y_val', 'depth', 'l_rate', 'subsample', 'min_weight', and 'col_sample'

SMOTE performed the best

### Feature Engineering

### Hyperparameter Tuning

### Final Model

In [None]:
# Create the (wide) canvas first and draw onto it. Previously the figure size
# was set through rcParams *after* plot_tree had already created its figure,
# so it had no effect on this plot on a fresh kernel and instead changed the
# default size of every later figure.
fig, ax = plt.subplots(figsize=(50, 10))
xgb.plot_tree(xgb_smoted, num_trees=0, ax=ax)  # first tree of the ensemble
plt.show()

In [None]:
# plot_importance is a module-level xgboost function that takes the fitted
# model; XGBClassifier has no method of that name, so the previous
# `xgb_smoted.plot_importance()` raised AttributeError.
xgb.plot_importance(xgb_smoted)
plt.show()

Pickle final model

In [None]:
# Persist the SMOTE-trained classifier; the file is opened in binary write
# mode and closed automatically when the block exits.
with open("../Models/xgb_balanced.pkl", "wb") as model_file:
    pkl.dump(xgb_smoted, model_file)