In [13]:
import pandas as pd
import numpy as np
from scipy import stats
import joblib

# sql
from sqlalchemy import create_engine

#modeling
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from classification_functions import  conf_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

#plotting
import matplotlib.pyplot as plt
import seaborn as sns

Final model functions and scoring

In [14]:
def XGBoost(X_train, y_train, depth, l_rate, subsample,  min_weight, col_sample):
    '''
    A function that fits xgboost model given params and prints f1 score

    Parameters
    ----------
    X_train, y_train : train data
    depth, l_rate, subsample,  min_weight, col_sample : xgboost params

    Returns
    -------
    prints params and f1 score
    returns xgboost model
    '''
    params = { 
                'n_estimators': 20000,
                'max_depth': depth,
                'objective':'multi:softprob',
                'num_classes' :3,  
                'learning_rate': l_rate, 
                'subsample': subsample,
                'min_child_weight':min_weight,
                'colsample_bytree':col_sample,
                'random_state' : 0,
                'verbosity' : 0,
                'n_jobs' : -1}

    gbm = XGBClassifier()
    gbm.set_params(**params)
    gbm.fit(X_train, y_train)
    
    print(f'XGBoost with params:\n'
          f'max_depth = {depth}\n'
          f'learning_rate = {l_rate}\n'
          f'subsample = {subsample}\n'
          f'min_child_weight = {min_weight}\n'
          f'colsample_bytree = {col_sample}\n'
          '\n'
          f'Has an f1 score of: {round(cross_val_score(gbm, X_train, y_train, scoring='f1_macro', cv=5).mean(), 2)}'
         )
          
    return gbm

def XGBoost_baseline(X_train, y_train):
    '''
    A function that fits and returns a baseline xgboost model
    '''          
    gbm = XGBClassifier()
    gbm.fit(X_train, y_train)
          
    return gbm

def get_cv_f1(model, X_val, y_val):
    '''
    function that returns f1 score
    '''
    preds = model.predict(X_val)
    return f1_score( y_val, preds, average="macro")
    

Load data

In [15]:
#create engine
engine = create_engine('postgresql://racheldilley:localhost@localhost:5432/programer_database')

In [16]:
query = '''
SELECT "Hobbyist", "Age", "Age1stCode", "Country", "EdLevel", "Ethnicity",
       "Gender", "OpSys", "UndergradMajor", "YearsCode", "YearsCodePro",
       "database_count", "back-end", "full-stack", "front-end", "desktop",
       "mobile", "DevOps", "Database admin", "Designer", "System admin",
       "Student", "Other Occupation", "Retired Dev", "Sometimes Code at Work", "Region",
       "JavaScript", "Python", "SQL", "Java" , "HTML/CSS"
FROM cleaned_survey_data6
'''
df = pd.read_sql_query(query, engine)
df = df.dropna()

In [18]:
X = df[['database_count', 'Age1stCode', 'YearsCodePro', 'Age','YearsCode', 'EdLevel', 
         'Gender', 'UndergradMajor', 'Region', 
        'Hobbyist', 'back-end', 'full-stack', 'front-end', 'desktop', 'mobile', 'DevOps', 'Database admin', 
        'Designer','System admin', 'Student', 'Other Occupation', 'Retired Dev','Sometimes Code at Work', 
       'JavaScript', 'Python', 'SQL', 'Java', 'HTML/CSS']]
y = df['OpSys']

X = pd.get_dummies(X, drop_first = True)

### Fix Class Imbalance

Try ADASYN Oversampling

In [5]:
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X,y)

X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn = train_test_split(X_adasyn,y_adasyn, test_size=0.2, 
                                                                                random_state=42)

xgb_adasyn = XGBoost(X_train_adasyn, y_train_adasyn, 10, 0.3, 0.8, 3, 0.8)



XGBoost with params:
max_depth = 10
learning_rate = 0.3
subsample = 0.8
min_child_weight = 3
colsample_bytree = 0.8

Has an f1 score of: 0.846


Try SMOTE Oversampling

In [6]:
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X,y)

X_train_smoted, X_test_smoted, y_train_smoted, y_test_smoted = train_test_split(X_smoted,y_smoted, test_size=0.2, 
                                                                                random_state=42)

xgb_smoted = XGBoost(X_train_smoted, y_train_smoted, 10, 0.3, 0.8, 3, 0.8)



XGBoost with params:
max_depth = 10
learning_rate = 0.3
subsample = 0.8
min_child_weight = 3
colsample_bytree = 0.8

Has an f1 score of: 0.847


Try Undersampling

In [8]:
X_under, y_under = RandomUnderSampler(random_state=42).fit_sample(X,y)
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_under,y_under, test_size=0.2, 
                                                                            random_state=42)

xgb_under = XGBoost(X_train_under, y_train_under, 10, 0.3, 0.8, 3, 0.8)



XGBoost with params:
max_depth = 10
learning_rate = 0.3
subsample = 0.8
min_child_weight = 3
colsample_bytree = 0.8

Has an f1 score of: 0.741


**SMOTE performed the best**

### Feature Selection

In [10]:
def remove_features(X_train, y_train, basef1):
    '''
    A function that removes features that either raises the f1 score or only decreases it a little

    Parameters
    ----------
    X_train, y_train : train data
    basef1 : base f1 score of balanced data

    Returns
    -------
    returns list of features to remove
    '''
    feats_remove = []
    keep_remove = 1
    col_list = list(X_train.columns)
    d = {el:0 for el in col_list} #empty dictionary with col names as keys
    
    #loop until f1 score cannot be improved anymore
    while keep_remove == 1:
        
        #loop through col_list and remove each feature, build xgboost base model and score f1
        #put f1 score in dictionary with key of removed feature
        for col in col_list: 
            removed = col
            removed_col_list = col_list.copy()
            removed_col_list.remove(removed)
            
            model = XGBoost_baseline(X_train[removed_col_list], y_train)
            d[removed] = round(cross_val_score(model, X_train[removed_col_list], y_train, scoring='f1_macro', cv=5).mean(), 2)
        
        #find max f1 value in dictionary 
        max_f1_key = max(d, key=d.get)
        max_f1_val = d[max_f1_key]
        
        #check if max f1 is close to baseline f1 and 
        if max_f1_val >= (basef1-0.3):
            feats_remove.append(max_f1_key)
            col_list = col_list.remove(max_f1_key)
            d = {el:0 for el in col_list}
            basef1 = max_f1_val
            print(max_f1_key, max_f1_val)
        else:
            keep_remove = 0
    
    return feats_remove

In [11]:
xgb_smoted_base = XGBoost_baseline(X_train_smoted, y_train_smoted)
basef1 = round(cross_val_score(xgb_smoted_base, X_train, y_train, scoring='f1_macro', cv=5).mean(), 2)
print(basef1)

0.6401506528729594


In [12]:
feats_remove = remove_features(X_train_smoted, y_train_smoted, basef1)
print(feats_remove)

IndexError: list index out of range

In [None]:
X_train_adasyn.drop(labels= feats_remove , axis=1, inplace=True)
X_test_adasyn.drop(labels= feats_remove , axis=1, inplace=True)

### Hyperparameter Tuning

In [None]:
#increase max_depth
xgb2_smoted = XGBoost(X_train_smoted, y_train_smoted, 30, 0.3, 0.8, 3, 0.8)

In [None]:
#increase max_depth more
xgb3_smoted = XGBoost(X_train_smoted, y_train_smoted , 100, 0.3, 0.8, 3, 0.8)

In [None]:
#decrease max depth & increase n_estimators
xgb4_smoted = XGBoost(X_train_smoted, y_train_smoted, 20, 0.3, 0.8, 3, 0.8)

In [None]:
#decrease learning rate
xgb5_smoted = XGBoost(X_train_smoted, y_train_smoted , 20, 0.05, 0.8, 3, 0.8)

In [None]:
#decrease subsample
xgb6_smoted = XGBoost(X_train_smoted, y_train_smoted , 20, 0.05, 0.7, 3, 0.8)

### Final Model

Choose xgb_adasyn with an f1 score of: 0.85

In [None]:
probs = xgb5_smoted.predict(X_test_smoted)
conf = confusion_matrix(y_test_smoted, probs)
plt.figure(figsize=(6,6))
conf1 = sns.heatmap(conf, cmap=plt.cm.get_cmap('Blues'), annot=True, square=True, fmt='d',
               xticklabels=['Windows', 'MacOS', 'Linux'],
               yticklabels=['Windows', 'MacOS', 'Linux'])

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix')
conf1.figure.savefig('../Graphs/confmatrix.png')

Score final model on test data

In [None]:
print(f1_score( y_test_smoted, probs, average='macro'))
print(precision_score( y_test_smoted, probs, average='macro'))
print(recall_score( y_test_smoted, probs, average='macro'))

### Final Model Analysis

Pickle final model

In [None]:
# with open("../Models/xgb_balanced.pkl", "wb") as f:
#     pkl.dump(xgb5_smoted, f)

In [None]:
X_train_adasyn.columns