In [12]:
import os

import pandas as pd
import numpy as np
from scipy import stats
import joblib

# sql
from sqlalchemy import create_engine

#modeling
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE
# NOTE: the sklearn.metrics import on the next line rebinds `confusion_matrix`,
# so the classification_functions version is shadowed from here on.
from classification_functions import confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
%config InlineBackend.figure_formats = ['retina']  # or svg
%matplotlib inline

#warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Apply the ggplot look to every figure drawn in this notebook.
plt.style.use('ggplot')

# Hex colour palette kept at module level for use in plots.
colors = [
    '#e898ac',
    '#00cfcc',
    '#ff9973',
    '#002845',
]

Final model functions and scoring

In [None]:
def random_forest(X_train, y_train, random_state=None):
    '''
    A function for fitting a random forest model.
    Performs a randomized cross validation search to find optimal hyperparameters,
    selecting them by macro-averaged f1 (the metric that is reported).

    Parameters
    ----------
    X_train, y_train : train data
    random_state : optional seed passed to both the forest and the search so the
        sampled hyperparameter candidates and the fitted trees are reproducible.
        The default (None) leaves both unseeded.

    Returns
    -------
    prints cross validation f1 score and params of the best candidate
    returns the fitted RandomizedSearchCV object (refit on all of X_train)
    '''
    rf = RandomForestClassifier(random_state=random_state)
    #params to search
    rand_param = {
                    'n_estimators': [800, 1500],
                    'max_features': ['sqrt'],
                    'max_depth' : [ 8, 15, 20],
                    'min_samples_leaf': [3, 5],
                    'min_samples_split': [6, 10]
                }
    # Score candidates with f1_macro so the hyperparameters are chosen by the same
    # metric that is printed below (the default scorer would be used otherwise).
    rs = RandomizedSearchCV(rf, param_distributions=rand_param, scoring='f1_macro',
                            cv=5, n_iter=10, n_jobs=-1, random_state=random_state)
    rs.fit(X_train, y_train)

    # best_score_ is the mean cross-validated f1_macro of best_params_. Passing the
    # search object itself to cross_val_score would re-run the whole randomized
    # search once per fold and report a score that does not belong to the
    # parameters printed here.
    f1 = round(rs.best_score_, 2)
    print(f'Random Forest with params:\n')
    print(rs.best_params_)
    print(f'Has an f1 score of: {f1}')

    return rs

def rf_baseline(X_train, y_train):
    '''
    Fit a random forest classifier with default hyperparameters
    on the given training data and return the fitted model.
    '''
    baseline = RandomForestClassifier()
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return baseline.fit(X_train, y_train)

def get_cv_f1(model, X_val, y_val):
    '''
    Predict on X_val with the given model and return the
    macro-averaged f1 score of those predictions against y_val.
    '''
    return f1_score(y_val, model.predict(X_val), average="macro")
    

Load data

In [2]:
# Create the SQLAlchemy engine.
# The connection URL is read from the PROGRAMER_DATABASE_URL environment variable
# so that credentials do not have to be stored in the notebook. When the variable
# is not set, the original local development URL is used as a fallback.
engine = create_engine(
    os.environ.get(
        'PROGRAMER_DATABASE_URL',
        'postgresql://racheldilley:localhost@localhost:5432/programer_database',
    )
)

In [3]:
# Columns pulled from the cleaned survey table.
query = '''
SELECT "Hobbyist", "Age", "Age1stCode", "Country", "EdLevel", "Ethnicity",
       "Gender", "OpSys", "UndergradMajor", "YearsCode", "YearsCodePro",
       "database_count", "back-end", "full-stack", "front-end", "desktop",
       "mobile", "DevOps", "Database admin", "Designer", "System admin",
       "Student", "Other Occupation", "Retired Dev", "Sometimes Code at Work", "Region",
       "JavaScript", "Python", "SQL", "Java" , "HTML/CSS"
FROM cleaned_survey_data6
'''

# Load the result and drop every row that has a missing value in any selected column.
df = pd.read_sql_query(query, engine).dropna()

In [4]:
# Target: the OpSys column.
y = df['OpSys']

# Predictors: the selected survey columns, one-hot encoded with the first level
# of each categorical column dropped.
X = pd.get_dummies(
    df[[
        'database_count', 'Age1stCode', 'YearsCodePro', 'Age', 'YearsCode', 'EdLevel',
        'Gender', 'UndergradMajor', 'Region',
        'Hobbyist', 'back-end', 'full-stack', 'front-end', 'desktop', 'mobile', 'DevOps',
        'Database admin', 'Designer', 'System admin', 'Student', 'Other Occupation',
        'Retired Dev', 'Sometimes Code at Work',
        'JavaScript', 'Python', 'SQL', 'Java', 'HTML/CSS',
    ]],
    drop_first=True,
)

### Fix Class Imbalance

SMOTE Oversampling

In [5]:
# Oversample with SMOTE to address the class imbalance in y.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed, so it fails on recent versions.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(X, y)

# NOTE(review): resampling happens before the train/test split, so synthetic rows
# in the training part can be interpolated from rows that end up in the test part.
# Scores on X_test_smoted are therefore likely optimistic; consider splitting first
# and resampling only the training data.
X_train_smoted, X_test_smoted, y_train_smoted, y_test_smoted = train_test_split(
    X_smoted, y_smoted, test_size=0.2, random_state=42
)

### Feature Selection

In [16]:
def remove_features(X_train, y_train, basef1, tolerance=0.3):
    '''
    Greedy backward feature elimination using a default random forest.

    In each round every remaining feature is left out in turn and the model is
    scored with 5-fold cross validated macro f1 (rounded to 2 decimals). The
    feature whose removal gives the highest score is dropped if that score is
    at least `basef1 - tolerance`; the accepted score then becomes the new
    baseline. The search stops when no removal meets the threshold or when only
    one feature is left.

    Parameters
    ----------
    X_train, y_train : train data (X_train must be a DataFrame)
    basef1 : base f1 score of balanced data
    tolerance : largest drop below the current baseline that is still accepted.
        Defaults to 0.3, the value previously hard-coded.
        NOTE(review): 0.3 is a very large drop on the f1 scale and the baseline is
        lowered after every accepted removal; a value such as 0.03 may have been
        intended - confirm before relying on the result.

    Returns
    -------
    returns list of features to remove (each one is also printed with its score
    as it is selected)
    '''
    feats_remove = []
    col_list = list(X_train.columns)

    # At least two columns are needed: with a single column left, dropping it
    # would ask the forest to fit on zero features, which raises an error.
    while len(col_list) > 1:

        # score the model with each remaining feature left out in turn,
        # keyed by the feature that was left out
        scores = {}
        for col in col_list:
            kept_cols = [c for c in col_list if c != col]
            # cross_val_score clones and refits the estimator for every fold, so an
            # unfitted forest is sufficient; fitting it beforehand is wasted work
            cv_scores = cross_val_score(RandomForestClassifier(), X_train[kept_cols], y_train,
                                        scoring='f1_macro', cv=5)
            scores[col] = round(cv_scores.mean(), 2)

        # feature whose removal hurts the least (or helps the most)
        max_f1_key = max(scores, key=scores.get)
        max_f1_val = scores[max_f1_key]

        # stop as soon as the best candidate falls too far below the baseline
        if max_f1_val < (basef1 - tolerance):
            break

        feats_remove.append(max_f1_key)
        col_list.remove(max_f1_key)
        basef1 = max_f1_val
        print(max_f1_key, max_f1_val)

    return feats_remove

In [17]:
# Baseline: default random forest on the SMOTE training split.
rf_smoted_base = rf_baseline(X_train_smoted, y_train_smoted)

# Mean 5-fold cross validated macro f1 of that baseline, rounded to 2 decimals.
basef1 = round(
    cross_val_score(
        rf_smoted_base,
        X_train_smoted,
        y_train_smoted,
        scoring='f1_macro',
        cv=5,
    ).mean(),
    2,
)
print(basef1)

0.62


In [18]:
# Run the feature elimination against the baseline score computed above.
feats_remove = remove_features(
    X_train=X_train_smoted,
    y_train=y_train_smoted,
    basef1=basef1,
)
print(feats_remove)

KeyboardInterrupt: 

In [None]:
# Drop the features selected for removal from the SMOTE train and test splits.
# (The *_adasyn frames referenced here previously are never created in this notebook.)
# Reassigning instead of dropping in place, together with errors='ignore', keeps
# the cell safe to re-run after the columns have already been removed.
X_train_smoted = X_train_smoted.drop(columns=feats_remove, errors='ignore')
X_test_smoted = X_test_smoted.drop(columns=feats_remove, errors='ignore')

### Hyperparameter Tuning

In [None]:
# Tune the random forest on the SMOTE training split; `rf` is the fitted
# search object returned by random_forest(), which also prints its best params.
rf = random_forest(X_train_smoted, y_train_smoted)

### Random Forest Model

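Choose xgb_adasyn with an f1 score of: 0.85 (NOTE: this line appears to be carried over from an XGBoost/ADASYN notebook; the model tuned in this notebook is a random forest on SMOTE-resampled data, so update the model name and score after re-running.)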

In [None]:
# Predict class labels on the test split with the tuned random forest `rf`
# (the previously referenced xgb5_smoted is never defined in this notebook).
# The name `probs` is kept because the scoring cell reads it; it holds predicted
# labels, not probabilities.
probs = rf.predict(X_test_smoted)

# Take the label order from the fitted model and pass it explicitly, so the rows
# and columns of the matrix are guaranteed to match the tick labels. Hard-coded
# names can be misaligned because confusion_matrix sorts the labels it finds.
class_labels = list(rf.classes_)
conf = confusion_matrix(y_test_smoted, probs, labels=class_labels)

plt.figure(figsize=(6,6))
# Passing the colormap by name avoids plt.cm.get_cmap, which newer matplotlib
# releases deprecate and remove.
conf1 = sns.heatmap(conf, cmap='Blues', annot=True, square=True, fmt='d',
               xticklabels=class_labels,
               yticklabels=class_labels)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix')
conf1.figure.savefig('../Graphs/confmatrix.png')

Score final model on test data

In [None]:
# Macro-averaged f1, precision and recall on the test split, printed in that order.
for metric in (f1_score, precision_score, recall_score):
    print(metric(y_test_smoted, probs, average='macro'))

### Final Model Analysis

Pickle final model

In [None]:
# with open("../Models/xgb_balanced.pkl", "wb") as f:
#     pkl.dump(xgb5_smoted, f)

In [None]:
# Show the feature columns of the training frame used by the model
# (X_train_adasyn is never defined in this notebook; the SMOTE frame is the one in use).
X_train_smoted.columns