SVM: HR dataset

In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Location of the HR analytics CSV. The original machine-specific path is kept
# as the default so existing runs behave exactly as before; set the
# HR_DATA_CSV environment variable to point at the file on any other machine.
DEFAULT_HR_CSV = r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\human-resources-analytics\HR_comma_sep.csv"
hr_csv_path = Path(os.environ.get("HR_DATA_CSV", DEFAULT_HR_CSV))

# Load the raw data and preview the first rows.
df = pd.read_csv(hr_csv_path)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [3]:
# Target: the 'left' column, cast to pandas' categorical dtype.
y = df['left'].astype('category')
# Predictors: every column except the target.
X = df.drop('left', axis='columns')

In [9]:
# Hold out 30% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [5]:
# Inspect column dtypes and non-null counts of the predictors; the two
# object-typed columns (Department, salary) are the ones that need encoding.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14995 entries, 0 to 14994
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14995 non-null  float64
 1   last_evaluation        14995 non-null  float64
 2   number_project         14995 non-null  int64  
 3   average_montly_hours   14995 non-null  int64  
 4   time_spend_company     14995 non-null  int64  
 5   Work_accident          14995 non-null  int64  
 6   promotion_last_5years  14995 non-null  int64  
 7   Department             14995 non-null  object 
 8   salary                 14995 non-null  object 
dtypes: float64(2), int64(5), object(2)
memory usage: 1.0+ MB


Linear Kernel

In [14]:
# Dtypes routed to the one-hot encoder; columns of any other dtype are passed
# through unchanged.
categorical_dtypes = ['object', 'category']

# Dense one-hot encoder that drops the first level of each feature and emits
# a DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform='pandas')

# Column transformer: non-categorical columns first (untouched), followed by
# the encoded categorical columns, without transformer-name prefixes.
ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=categorical_dtypes)),
    (ohe, make_column_selector(dtype_include=categorical_dtypes)),
    verbose_feature_names_out=False,
)
ct = ct.set_output(transform='pandas')

In [None]:
# Linear SVM: grid search over the regularisation strength C.
#
# StandardScaler and SVC are provided by the notebook's import cell.
# StandardScaler must come from sklearn.preprocessing;
# sklearn.discriminant_analysis only happens to import it internally and is
# not its public location.
scale_std = StandardScaler()

# probability=True is kept so the fitted model still exposes predict_proba.
# NOTE(review): the SVC documentation says this option slows down fitting
# because of an extra internal cross-validation -- confirm it is needed.
svm = SVC(kernel='linear', probability=True, random_state=24)

# Encode categoricals -> standardise every column -> classify.
pipe = Pipeline([
    ('CT', ct),
    ('SCL', scale_std),
    ('SVC', svm),
])

# Shuffled, seeded 5-fold stratified cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Three C candidates evenly spaced over [0.001, 5].
params = {
    'SVC__C': np.linspace(0.001, 5, 3),
}
gcv = GridSearchCV(estimator=pipe, param_grid=params, scoring='roc_auc', cv=kfold)
 

In [24]:
# Run the linear-kernel grid search on the training split only; the test
# split is not used here.
gcv.fit(X_train, y_train)

In [25]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC
# for the linear kernel, one item per line.
print(
    f'Best params: {gcv.best_params_}',
    f'Best score: {gcv.best_score_}',
    sep='\n',
)

Best params: {'SVC__C': 5.0}
Best score: 0.8091627936188198


Polynomial SVM

In [26]:
# Polynomial SVM: grid search over C and the polynomial degree.
#
# StandardScaler and SVC are provided by the notebook's import cell.
# StandardScaler must come from sklearn.preprocessing;
# sklearn.discriminant_analysis only happens to import it internally and is
# not its public location.
scale_std = StandardScaler()

# probability=True is kept so the fitted model still exposes predict_proba.
# NOTE(review): the SVC documentation says this option slows down fitting
# because of an extra internal cross-validation -- confirm it is needed.
svm = SVC(kernel='poly', probability=True, random_state=24)

# Encode categoricals -> standardise every column -> classify.
pipe = Pipeline([
    ('CT', ct),
    ('SCL', scale_std),
    ('SVC', svm),
])

# Shuffled, seeded 5-fold stratified cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# 3 C candidates evenly spaced over [0.001, 5] x 2 degrees = 6 combinations.
params = {
    'SVC__C': np.linspace(0.001, 5, 3),
    'SVC__degree': [2, 3],
}
gcv_poly = GridSearchCV(estimator=pipe, param_grid=params, scoring='roc_auc', cv=kfold)
 

In [27]:
# Run the polynomial-kernel grid search on the training split only; the test
# split is not used here.
gcv_poly.fit(X_train, y_train)

In [28]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC
# for the polynomial kernel, one item per line.
print(
    f'Best params: {gcv_poly.best_params_}',
    f'Best score: {gcv_poly.best_score_}',
    sep='\n',
)

Best params: {'SVC__C': 5.0, 'SVC__degree': 3}
Best score: 0.9594970322430351


Radial SVM

In [29]:
# Radial (RBF) SVM: grid search over C and the kernel coefficient gamma.
#
# StandardScaler and SVC are provided by the notebook's import cell.
# StandardScaler must come from sklearn.preprocessing;
# sklearn.discriminant_analysis only happens to import it internally and is
# not its public location.
scale_std = StandardScaler()

# probability=True is kept so the fitted model still exposes predict_proba.
# NOTE(review): the SVC documentation says this option slows down fitting
# because of an extra internal cross-validation -- confirm it is needed.
svm = SVC(kernel='rbf', probability=True, random_state=24)

# Encode categoricals -> standardise every column -> classify.
pipe = Pipeline([
    ('CT', ct),
    ('SCL', scale_std),
    ('SVC', svm),
])

# Shuffled, seeded 5-fold stratified cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# 3 C candidates x 3 gamma candidates, each evenly spaced over [0.001, 5],
# giving 9 combinations.
params = {
    'SVC__C': np.linspace(0.001, 5, 3),
    'SVC__gamma': np.linspace(0.001, 5, 3),
}
gcv_rad = GridSearchCV(estimator=pipe, param_grid=params, scoring='roc_auc', cv=kfold)
 

In [None]:
# Run the RBF-kernel grid search on the training split only; the test split
# is not used here.
gcv_rad.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and their mean cross-validated ROC AUC
# for the RBF kernel, one item per line.
print(
    f'Best params: {gcv_rad.best_params_}',
    f'Best score: {gcv_rad.best_score_}',
    sep='\n',
)