This notebook first evaluates feature importance using the coefficients of a logistic regression model. The most important features are then extracted and used as the input for cross-validation, to check that restricting the models to these features causes minimal loss of performance in terms of ROC AUC and recall.

In [1]:
# Import dependencies.
import pandas as pd
from numpy import mean, std

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbpipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from project_pipeline import preprocess, lr_model

In [2]:
# Load the cleaned dataset from the resources folder and print a summary of
# its columns, non-null counts and dtypes.
cleaned_csv_path = '../resources/cleaned_mode.csv'
df = pd.read_csv(cleaned_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  int64  
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  last_new_job            19158 non-null  object 
 11  training_hours          19158 non-null  int64  
 12  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.9+ MB


## Feature Importance

In [3]:
# One-hot encode the frame only to recover the encoded column names (label
# column excluded); these names are used later to label model coefficients.
# NOTE(review): assumes preprocess() yields its columns in this same order — confirm.
encoded_columns = pd.get_dummies(df).drop(columns='target').columns
features = encoded_columns.tolist()

# Train/test split as returned by the project's preprocess() helper.
X_train, X_test, y_train, y_test = preprocess(df)

In [4]:
# Fit a logistic regression on the training split; its coefficients are read
# in the following cells as feature-importance scores. Logistic regression is
# used because, based on cross validation, it has the highest ROC AUC and
# recall scores.
# NOTE(review): this rebinds `lr_model`, hiding the `lr_model` imported from
# project_pipeline in the import cell — consider a distinct name.
lr_params = {'solver': 'lbfgs', 'random_state': 42}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)

In [5]:
# The fitted coefficients are stored as a 2-D array; take its first row as the
# per-feature weight vector.
weight_matrix = lr_model.coef_
importance = weight_matrix[0]

In [6]:
# Report every weight whose absolute value is at least the cut-off below,
# labelled with the matching one-hot column name.
min_abs_weight = 0.1
strong_weights = [(i, v) for i, v in enumerate(importance) if abs(v) >= min_abs_weight]
for i, v in strong_weights:
    print(f'Feature: {features[i]}, Score: {v:.3f}')

Feature: city_development_index, Score: -0.651
Feature: relevent_experience, Score: -0.232
Feature: city_Other, Score: -0.158
Feature: city_city_103, Score: 0.154
Feature: city_city_114, Score: -0.105
Feature: city_city_136, Score: -0.109
Feature: city_city_21, Score: 0.229
Feature: education_level_Graduate, Score: 0.117
Feature: education_level_High School, Score: -0.178
Feature: education_level_Primary School, Score: -0.119
Feature: company_size_100-500, Score: -0.125
Feature: company_size_50-99, Score: 0.270
Feature: company_type_Funded Startup, Score: -0.136
Feature: last_new_job_never, Score: -0.133


## Feature Selection

In [7]:
# Fit a SelectFromModel wrapper around a logistic regression configured with
# the same explicit solver and seed as the importance model, then display the
# coefficients of the estimator the selector fitted internally.
# NOTE(review): no explicit `threshold` is passed, so scikit-learn's default
# threshold rule decides which features are kept — see the SelectFromModel docs.
selector = SelectFromModel(
    estimator=LogisticRegression(solver='lbfgs', random_state=42)
).fit(X_train, y_train)
selector.estimator_.coef_

array([[-0.65140121, -0.23167118, -0.0514573 , -0.15821284,  0.0521078 ,
        -0.05837959,  0.15449212, -0.0533812 ,  0.00552392, -0.10526186,
        -0.10890443, -0.06622287,  0.08645613,  0.22879413, -0.07164638,
         0.00220629, -0.04281134, -0.03367829,  0.00563469, -0.01538898,
         0.02623327,  0.08487657, -0.07754639,  0.00138743,  0.11699965,
        -0.17804317,  0.03398504, -0.00639011, -0.11867378,  0.02063516,
         0.00733451, -0.01489595,  0.00509301, -0.02659591,  0.00870619,
         0.03107172,  0.02823662, -0.00310362, -0.036975  ,  0.00620151,
        -0.12540064, -0.09570847,  0.26984747, -0.08635383, -0.04510293,
        -0.06316184, -0.09091874, -0.04814895, -0.13560523,  0.00227551,
         0.04807361,  0.05006992,  0.06285272,  0.00693116,  0.05197182,
         0.03980949,  0.03650156,  0.01431254, -0.13315972]])

In [8]:
# Map the positions kept by the selector back to their one-hot column names
# and display the resulting list. Asking the selector for integer indices
# avoids comparing each mask entry against True.
selected_positions = selector.get_support(indices=True)
fs = [features[i] for i in selected_positions]
fs

array([ True,  True, False,  True, False, False,  True, False, False,
        True,  True, False,  True,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True, False,
       False,  True, False,  True, False, False, False, False, False,
       False, False, False, False,  True])

In [9]:
# NOTE(review): disabled alternative, kept for reference only. The next cell
# rebuilds the reduced feature table from the selected column names instead of
# transforming the existing train/test split.
# # Transform the training and testing features using the selector.
# X_train_fs = selector.transform(X_train)
# X_test_fs = selector.transform(X_test)

In [11]:
# Rebuild the cross-validation input from the full one-hot encoded frame:
# keep only the selected columns plus the label, then separate the two.
encoded = pd.get_dummies(df)
df_fs = encoded[fs + ['target']]
X = df_fs[fs]
y = df_fs['target']

In [12]:
# Candidate classifiers to compare, keyed by the short name printed in the
# results. All of them share one seed.
seed = 42
models = {
    'lr': LogisticRegression(solver='lbfgs', random_state=seed),
    'svc': SVC(kernel='rbf', random_state=seed),
    'rf': RandomForestClassifier(n_estimators=500, random_state=seed)
}

In [13]:
# Prepare the cross-validation procedure once: the splitter does not depend on
# the model, and the fixed seed gives every model the same stratified folds.
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# NOTE(review): the feature subset in X was chosen by a selector fitted outside
# these folds, so the scores below may be slightly optimistic — consider moving
# the selection step into the pipeline.
for name, model in models.items():
    # Create a pipeline that includes oversampler, scaler, and model, so that
    # all three are fitted together inside each cross-validation fold.
    clf = imbpipeline(steps=[['oversampler', RandomOverSampler(random_state=42)],
                             ['scaler', StandardScaler()],
                             ['classifer', model]])

    # Evaluate both metrics from a single round of fits; scoring each metric in
    # its own cross_val_score call would fit every pipeline twice per fold.
    scores = cross_validate(clf, X, y, scoring=['roc_auc', 'recall'], cv=cv, n_jobs=-1)
    roc_auc_scores = scores['test_roc_auc']
    recall_scores = scores['test_recall']

    # Report performance as mean (standard deviation) across the folds.
    print(name)
    print('---')
    print(f'roc_auc: {mean(roc_auc_scores):.2f} ({std(roc_auc_scores):.2f})')
    print(f'recall: {mean(recall_scores):.2f} ({std(recall_scores):.2f})')
    print('---')

lr
---
roc_auc: 0.78 (0.01)
recall: 0.73 (0.02)
---
svc
---
roc_auc: 0.77 (0.01)
recall: 0.73 (0.03)
---
rf
---
roc_auc: 0.74 (0.01)
recall: 0.70 (0.02)
---
