This notebook documents k-fold cross validation process that helps with model selection.

In [1]:
# Import dependencies.
import pandas as pd
from numpy import mean
from numpy import std

from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in the data.
df = pd.read_csv('../resources/cleaned_mode.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  int64  
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  last_new_job            19158 non-null  object 
 11  training_hours          19158 non-null  int64  
 12  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 1.9+ MB


In [3]:
model_df = pd.get_dummies(df)
model_df.head()

Unnamed: 0,city_development_index,relevent_experience,training_hours,target,city_Other,city_city_100,city_city_102,city_city_103,city_city_104,city_city_11,...,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
0,0.92,1,36,1.0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0.776,0,47,0.0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0.624,0,83,0.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0.789,0,52,1.0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0.767,1,8,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Split the data into a training set and a testing set.
y = model_df.target
X = model_df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Instantiate a StandardScaler instance.
scaler = StandardScaler()

# Fit the training data to the standard scaler.
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler.
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler.
X_test_scaled = X_scaler.transform(X_test)

In [5]:
# Due to imbalanced target values, instantiate the random oversampler model.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train_scaled, y_train)

## Logistic Regression

In [6]:
# Implement a logistic regression model.
classifier = LogisticRegression(solver='lbfgs', random_state=42)
classifier.fit(X_res, y_res)
predictions = classifier.predict(X_test_scaled)

In [7]:
# Display the accuracy score for the test dataset.
print(f'Accuracy score: {accuracy_score(y_test, predictions):.3f}')

Accuracy score: 0.743


In [8]:
# Display the confusion matrix.
confusion_matrix(y_test, predictions)

array([[2678,  918],
       [ 313,  881]], dtype=int64)

| n=19158 | Predicted: Staying | Predicted: Leaving |
| --- | --- | --- |
| Actual: Staying | 2678 | 918 |
| Actual: Leaving | 313 | 881 |


In [9]:
# Display the classification report.
target_names = ['stay', 'leave']
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

        stay       0.90      0.74      0.81      3596
       leave       0.49      0.74      0.59      1194

    accuracy                           0.74      4790
   macro avg       0.69      0.74      0.70      4790
weighted avg       0.79      0.74      0.76      4790



### K-Fold Cross Validation

In [10]:
# Code is sourced from https://machinelearningmastery.com/how-to-configure-k-fold-cross-validation/.
# Prepare the cross-validation procedure.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Create model.
model = LogisticRegression()

# Evaluate model.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report performance.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.771 (0.012)


Cross validation demonstrates that using logistic regression results in an estimated classification accuracy of about 77.1%, which is close to and above above 74% model accuracy formerly achieve.

## Random Forest Classifier Model

In [11]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model = rf_model.fit(X_res, y_res)
predictions = rf_model.predict(X_test_scaled)

# Display the accuracy score for the test dataset.
print(f'Accuracy score: {accuracy_score(y_test, predictions):.3f}')

Accuracy score: 0.749


In [12]:
# Display the confusion matrix.
confusion_matrix(y_test, predictions)

array([[3024,  572],
       [ 632,  562]], dtype=int64)

In [13]:
# Display the classification report.
target_names = ['stay', 'leave']
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

        stay       0.83      0.84      0.83      3596
       leave       0.50      0.47      0.48      1194

    accuracy                           0.75      4790
   macro avg       0.66      0.66      0.66      4790
weighted avg       0.74      0.75      0.75      4790



### K-Fold Cross Validation

In [14]:
# Prepare the cross-validation procedure.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Create model.
model = RandomForestClassifier()

# Evaluate model.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report performance.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.757 (0.008)


Cross validation demonstrates that using random forest classifier results in an estimated classification accuracy of about 75.7%, which is again close to and above 75.0% model accuracy formerly achieved.