## Model training

## import data and libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [2]:
df=pd.read_csv('data/clean_hr_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,...,3,4,1,6,3,3,2,2,2,2


## preparing x and y variable

In [5]:
x=df.drop(columns=["Attrition"],axis=1)
y = df["Attrition"].map({"No": 0, "Yes": 1})


In [6]:
x.head()

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,Sales Executive,...,3,1,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,Research Scientist,...,4,4,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,Laboratory Technician,...,3,2,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,Research Scientist,...,3,3,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,Laboratory Technician,...,3,4,1,6,3,3,2,2,2,2


In [7]:
x.shape

(1470, 26)

In [13]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

numeric_features = [
    'Age', 'DistanceFromHome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'StockOptionLevel', 'TrainingTimesLastYear', 'YearsAtCompany',
    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'MonthlyIncome',
    'TotalWorkingYears', 'YearsInCurrentRole'
]

label_encoding_features = [
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement',
    'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
    'WorkLifeBalance'
]

onehot_encoding_features = [
    'BusinessTravel', 'Department', 'EducationField', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime'
]

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()
label_transformer=OrdinalEncoder()

preprocessor = ColumnTransformer(
   [
       ("StandardScaler",numeric_transformer,numeric_features),
       ("OneHotEncoder",oh_transformer,onehot_encoding_features),
       ("OrdinalEncoder",label_transformer,label_encoding_features)
   ]
)


In [14]:
x=preprocessor.fit_transform(x)

In [16]:
x.shape

(1470, 47)

In [8]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

In [9]:
y.shape

(1470,)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape    



((1176, 47), (1176,), (294, 47), (294,))

## as our dataset is imbalanced we have to use smote beacuse it has more no values than yes

In [18]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts().to_dict())
print("After SMOTE:", y_train_res.value_counts().to_dict())


Before SMOTE: {0: 978, 1: 198}
After SMOTE: {0: 978, 1: 978}
