# Classifier Evaluation Lab

* Copy and paste your model from homework 5
* Add grid search and train
* Compare performance
* Which one is better? Explain why.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Source of the balanced employee-turnover dataset used throughout this lab.
DATA_URL = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv'

# The file's first column has no header, so pandas loads it as 'Unnamed: 0';
# it is not used as a feature further down.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3


In [2]:
# Baseline model carried over from homework 5:
# column-wise preprocessing followed by logistic regression.
numerical_vars = [
    'age',
    'commuting_distance',
    'education',
    'satisfaction_with_environment',
    'seniority_level',
    'satisfaction_with_job',
    'last_raise_pct',
    'last_performance_rating',
    'total_years_working',
    'years_at_company',
    'years_since_last_promotion',
    'years_with_current_supervisor',
]
categorical_Vars = [
    'frequency_of_travel',
    'department',
    'gender',
    'position',
    'married_or_single',
]
# NOTE(review): 'years_in_current_job' is present in the loaded data but appears
# in neither feature list -- confirm the omission is intentional.

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
feature_columns = numerical_vars + categorical_Vars
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df[feature_columns],
    df['left_company'],
    test_size=0.2,
    random_state=124,
)

# Numeric columns: fill missing values with the median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising an error.
cat_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_vars),
    ('cat', cat_pipeline, categorical_Vars),
])

# The step names ('preprocessor', 'logreg') are what the grid-search
# parameter keys refer to, so they must stay as they are.
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000)),
])
logreg_pipeline.fit(Xtrain, ytrain)

# Score the fitted baseline on both partitions.
train_preds, test_preds = (
    logreg_pipeline.predict(features) for features in (Xtrain, Xtest)
)
train_acc = accuracy_score(ytrain, train_preds)
test_acc = accuracy_score(ytest, test_preds)

print(f"Training accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")

Training accuracy: 0.72625
Test accuracy: 0.655


In [3]:
# Grid search for hyperparameter tuning.
# Each key follows the "<step>__<parameter>" convention, which reaches through
# the nested pipeline steps down to the estimator being configured.
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    logreg__C=[0.1, 1, 10],
    logreg__solver=['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
)

# 2 x 3 x 5 = 30 candidate settings, each scored with 5-fold cross-validation
# on the training split only; n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(
    estimator=logreg_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
).fit(Xtrain, ytrain)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Best hyperparameters:  {'logreg__C': 0.1, 'logreg__solver': 'liblinear', 'preprocessor__num__imputer__strategy': 'mean'}
Best accuracy score:  0.6950000000000001


In [4]:
# Accuracy of the tuned model on the train and test sets.
# grid_search.predict uses the best parameter setting refit on all of Xtrain
# (GridSearchCV's default refit=True).
train_preds = grid_search.predict(Xtrain)
test_preds = grid_search.predict(Xtest)

train_acc = accuracy_score(ytrain, train_preds)
test_acc = accuracy_score(ytest, test_preds)

print(f"Training accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")

# The assignments above replace the baseline values that the homework 5 cell
# stored under the same names, so recompute the baseline here to compare the
# two models side by side. GridSearchCV fits clones of the estimator it is
# given, so logreg_pipeline is still the fitted baseline model, and recomputing
# from it keeps this cell safe to re-run.
baseline_train_acc = accuracy_score(ytrain, logreg_pipeline.predict(Xtrain))
baseline_test_acc = accuracy_score(ytest, logreg_pipeline.predict(Xtest))

print(f"Baseline training accuracy: {baseline_train_acc}")
print(f"Baseline test accuracy: {baseline_test_acc}")
print(f"Test accuracy change (tuned - baseline): {test_acc - baseline_test_acc:+.3f}")

Training accuracy: 0.72875
Test accuracy: 0.68


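We can see that the model tuned with grid search achieved higher accuracy than the original homework 5 model on both the training set (0.72875 vs. 0.72625) and the test set (0.68 vs. 0.655), indicating that it performed better than the original model on this split. This is expected because grid search uses cross-validation to find the best hyperparameters for the model among the candidates in the grid (here C = 0.1, the liblinear solver, and mean imputation), leading to better performance. Note that accuracy is the only metric evaluated here, and the test-set gain corresponds to only a small number of test examples, so the improvement is modest.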