# ***Data reading***

In [246]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel, RFE, SelectKBest, SelectPercentile, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold

In [247]:
# Location of the dataset CSV (hosted in the project's GitHub repository).
DATA_URL = 'https://raw.githubusercontent.com/santiagogz11/HRProject/main/cleanData/df.csv'
# index_col=0: the first CSV column is used as the row index instead of a feature.
df = pd.read_csv(DATA_URL, index_col=0)

In [248]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,0,1,0,0,1,0,1,0,0,...,131160,1.0,11,0,1.0,6,1,0,0,0
1,0,1,0,0,1,0,0,1,0,0,...,41890,0.0,23,1,6.0,3,5,1,4,1
2,0,1,0,0,1,0,0,0,0,0,...,193280,1.0,15,3,5.0,2,5,0,3,0


In [249]:
# List the column labels available in the dataset.
df.columns

Index(['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Age', 'DistanceFromHome', 'Education',
       'EmployeeID', 'JobLevel', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorki

# ***Data treatment***

In [250]:
# EmployeeID is dropped so it is not used as a feature (presumably a pure row identifier).
# Rebinding the name instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it once the column is gone no longer raises KeyError.
df = df.drop(columns='EmployeeID', errors='ignore')

In [251]:
# Column names containing an underscore (the encoded categorical columns) plus the
# 'Attrition' column, returned as a NumPy array of labels.
is_dummy_col = df.columns.str.contains('_|Attrition', regex=True)
dummies = df.columns[is_dummy_col].values

In [252]:
# Every column not listed in `dummies`, kept in the order it appears in df.columns.
dummy_labels = set(dummies)
noDummies = np.array([col for col in df.columns if col not in dummy_labels], dtype=object)

In [253]:
# 'Attrition' is used as the target further down, so it must not stay among the feature names.
dummies = dummies[dummies != 'Attrition']

In [254]:
# Min-max scale the non-dummy columns; the result is a plain ndarray (column labels are lost).
# NOTE(review): the scaler is fitted on the whole dataset before cross-validation, so
# statistics from validation folds leak into training — consider scaling inside a Pipeline.
scaler = MinMaxScaler()
X_noDummies = scaler.fit(df[noDummies]).transform(df[noDummies])

In [255]:
# Dummy columns are taken as-is; no scaling is applied to them.
X_dummies = df[dummies]

In [256]:
# Feature matrix: the unscaled dummy columns followed by the scaled non-dummy columns.
# X_noDummies is a bare ndarray, so the frame built from it is given df's index explicitly.
# Without that, it would get a fresh 0..n-1 index and concat(axis=1) would align on labels,
# producing NaN-padded rows whenever df's index is not exactly that range.
X = pd.concat(
    [X_dummies, pd.DataFrame(X_noDummies, columns=noDummies, index=df.index)],
    axis=1,
)

In [257]:
# Target variable passed to every model below.
y = df['Attrition']

In [258]:
# Shared 10-fold splitter; shuffling with a fixed seed gives every model the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# ***Baseline models (all features)***

## ***1. Gaussian Naive Bayes***

In [259]:
# Baseline 1: Gaussian Naive Bayes with default settings, scored over the shared folds.
# Original author's note (translated): 'neg_root_mean_squared_error' could be used as scoring.
# NOTE(review): that is a regression metric and no `scoring` is passed here — confirm the
# metric actually intended for this classifier.
GaussianNB_model = GaussianNB()
score_NB = cross_val_score(estimator=GaussianNB_model, X=X, y=y, cv=kfold)
np.mean(score_NB)

0.791609977324263

## ***2. Random Forest***

In [260]:
# Baseline 2: small random forest (10 trees) scored over the shared folds.
# random_state is fixed so the displayed score is reproducible on Restart & Run All;
# without it each run grows different trees and the mean changes from run to run.
# NOTE(review): the recorded ~0.99 score is far above every other baseline here — check
# whether duplicated rows land in both training and validation folds before trusting it.
RandomForest_model = RandomForestClassifier(n_estimators=10, random_state=0)
score_RF = cross_val_score(RandomForest_model, X, y, cv=kfold)
score_RF.mean()

0.9945578231292517

## ***3. Support Vector Machine***

### ***3.1 Support Vector Classifier***

In [261]:
# Baseline 3.1: support vector classifier with default hyper-parameters.
SVC_model = SVC()
score_SVC = cross_val_score(estimator=SVC_model, X=X, y=y, cv=kfold)
np.mean(score_SVC)

0.863265306122449

### ***3.2 Linear Support Vector Classifier***

In [262]:
# Baseline 3.2: linear support vector classifier with default hyper-parameters.
LSVC_model = LinearSVC()
score_LSVC = cross_val_score(estimator=LSVC_model, X=X, y=y, cv=kfold)
np.mean(score_LSVC)

0.8365079365079365

## ***4. K-Nearest Neighbors***

In [263]:
# Baseline 4: k-nearest neighbours classifier using 10 neighbours.
KNeighbors_model = KNeighborsClassifier(n_neighbors=10)
score_KN = cross_val_score(estimator=KNeighbors_model, X=X, y=y, cv=kfold)
np.mean(score_KN)

0.8337868480725623

# ***Feature Selection***

## ***Select KBest***

In [264]:
# Univariate selection: keep the 10 features ranked highest by the f_classif score.
KBest = SelectKBest(k=10, score_func=f_classif)
fit = KBest.fit(X, y)
# Reduced feature array containing only the selected columns.
featuresKBest = fit.transform(X)
# Show which column names were kept.
print(fit.get_feature_names_out())

['BusinessTravel_Travel_Frequently' 'Department_Human Resources'
 'EducationField_Human Resources' 'MaritalStatus_Divorced'
 'MaritalStatus_Married' 'MaritalStatus_Single' 'Age' 'TotalWorkingYears'
 'YearsAtCompany' 'YearsWithCurrManager']


## ***Select From Model***

In [265]:
def sel_variables(modelos, X, y, SelectFromModel, np, threshold):
    """Return the union of the feature names selected from each fitted model.

    Each estimator in ``modelos`` is fitted on ``(X, y)`` and wrapped in a prefit
    ``SelectFromModel`` using ``threshold``. An estimator for which the selector
    raises ``ValueError`` contributes nothing and is skipped, so estimators
    that cannot be used for selection may be mixed with ones that can
    (in scikit-learn, presumably those exposing neither ``coef_`` nor
    ``feature_importances_`` — verify against the installed version).

    Parameters
    ----------
    modelos : iterable
        Estimators exposing ``fit(X, y)`` and, once fitted, ``feature_names_in_``.
    X, y :
        Training data, passed unchanged to every ``fit`` call.
    SelectFromModel : callable
        Selector factory, called as ``SelectFromModel(model, prefit=True, threshold=threshold)``.
    np : module
        NumPy (or a compatible namespace providing ``array``, ``concatenate``, ``unique``).
    threshold :
        Forwarded to ``SelectFromModel``.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated feature names; an empty array when nothing was selected.

    Raises
    ------
    Exception
        Anything raised by ``modelo.fit`` propagates. Only a ``ValueError`` coming
        from the selection step is treated as "model not usable for selection".
    """
    selected = []
    for modelo in modelos:
        # Fit outside the try block: a ValueError here means the training data is
        # bad, which must not be mistaken for "this model has no importances".
        modelo.fit(X, y)
        try:
            sel = SelectFromModel(modelo, prefit=True, threshold=threshold)
            selected.append(sel.get_feature_names_out(modelo.feature_names_in_))
        except ValueError:
            # The selector cannot work with this estimator: skip it on purpose.
            continue
    if not selected:
        return np.array([])
    # De-duplicate once at the end instead of after every model.
    return np.unique(np.concatenate(selected))

In [266]:
# Union of the features selected across the five default classifiers, using an
# importance cut-off of 1.2 times the mean.
var_names = sel_variables(
    [GaussianNB(), RandomForestClassifier(), SVC(), LinearSVC(), KNeighborsClassifier()],
    X,
    y,
    SelectFromModel,
    np,
    threshold='1.2*mean',
)

In [267]:
# Display the selected feature names.
var_names

array(['Age', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'DistanceFromHome',
       'Education', 'EducationField_Human Resources', 'JobLevel',
       'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'], dtype=object)

In [268]:
# Reduced feature matrix: only the columns returned by sel_variables.
X2 = X.loc[:, var_names]
X2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 0 to 4409
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               4410 non-null   float64
 1   BusinessTravel_Non-Travel         4410 non-null   int64  
 2   BusinessTravel_Travel_Frequently  4410 non-null   int64  
 3   DistanceFromHome                  4410 non-null   float64
 4   Education                         4410 non-null   float64
 5   EducationField_Human Resources    4410 non-null   int64  
 6   JobLevel                          4410 non-null   float64
 7   MonthlyIncome                     4410 non-null   float64
 8   NumCompaniesWorked                4410 non-null   float64
 9   PercentSalaryHike                 4410 non-null   float64
 10  StockOptionLevel                  4410 non-null   float64
 11  TotalWorkingYears                 4410 non-null   float64
 12  Traini

# ***Model selection***

In [269]:
def medir_modelos(modelos, X, y, cv, cross_val_score, pd):
    """Cross-validate every model and collect the per-fold scores in one table.

    Parameters
    ----------
    modelos : iterable
        Estimators to evaluate. Any number of models is accepted.
    X, y :
        Data forwarded unchanged to ``cross_val_score``.
    cv :
        Cross-validation strategy, forwarded as ``cv=cv``.
    cross_val_score : callable
        Scoring function called as ``cross_val_score(modelo, X, y, cv=cv)``; it
        must return a 1-D sequence of scores.
    pd : module
        pandas.

    Returns
    -------
    pandas.DataFrame
        One column per model, in input order, labelled with the model's class
        name (e.g. ``'GaussianNB'``), and one row per score returned by
        ``cross_val_score``. An empty frame is returned when ``modelos`` is empty.
    """
    # Label each column from the estimator itself so the table stays correct for
    # any number or order of models, instead of relying on a fixed list of five names.
    columnas = [
        pd.Series(cross_val_score(modelo, X, y, cv=cv), name=type(modelo).__name__)
        for modelo in modelos
    ]
    if not columnas:
        return pd.DataFrame()
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(columnas, axis=1)

In [270]:
# Score the same five default classifiers twice: first on the reduced feature set (X2),
# then on the full feature set (X). A fresh list of estimators is built for each run.
score_X2, score_X = (
    medir_modelos(
        [GaussianNB(), RandomForestClassifier(), SVC(), LinearSVC(), KNeighborsClassifier()],
        features,
        y,
        kfold,
        cross_val_score,
        pd,
    )
    for features in (X2, X)
)

In [271]:
# Put both result tables side by side. The X2_/X_ prefix records which feature set
# produced each column; deriving the labels from the frames themselves avoids a
# hand-typed list of ten names that would silently mislabel columns if the model
# list ever changed.
score = pd.concat([score_X2.add_prefix('X2_'), score_X.add_prefix('X_')], axis=1)

In [272]:
# One box per model / feature-set combination, showing the spread of its fold scores.
fig = go.Figure()
for column in score.columns:
    fig.add_trace(go.Box(y=score[column], name=column))
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig.update_layout(
    colorway=px.colors.sequential.Aggrnyl,
    title='Cross-validation scores by model and feature set',
    xaxis_title='Model (X2_ = selected features, X_ = all features)',
    yaxis_title='Cross-validation score',
)
fig.show()