In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

In [None]:
# Load the HR dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/hr-analytics/HR_comma_sep.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics, transposed so each column of the frame becomes one row.
df.describe().T

# EDA & Data analysis

In [None]:
# Build an automated profiling report of the frame; the bare name displays it.
profile = ProfileReport(df)
profile

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Distinct values of the Department column.
df['Department'].unique()

In [None]:
# Class balance of the target column. The column is passed by keyword because
# recent seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='left', data=df, palette='Set3');

In [None]:
# Set the global seaborn style and palette, then draw a histogram per numeric column.
sns.set_style('whitegrid')
sns.set_palette('Set3')
df.hist(figsize=(18,10));

In [None]:
# Leavers vs. stayers per salary level. Keyword arguments are used because
# recent seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='salary', hue='left', data=df, palette='Set2');

In [None]:
# Leavers vs. stayers per department. Keyword arguments are used because
# recent seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='Department', hue='left', data=df, palette='Set2');

In [None]:
# Leavers vs. stayers per project count. Keyword arguments are used because
# recent seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='number_project', hue='left', data=df, palette='Set2');

In [None]:
# Frequency of each satisfaction level, split by the `left` flag.
satisfaction_by_left = pd.crosstab(df['satisfaction_level'], df['left'])
ax = satisfaction_by_left.plot(kind="bar", figsize=(25, 8), color=['green', 'brown'])
ax.set_title('Personnel satisfaction')
ax.set_xlabel('satisfaction_level')
ax.set_ylabel('Frequency')

In [None]:
# Frequency of each last-evaluation score, split by the `left` flag.
evaluation_by_left = pd.crosstab(df['last_evaluation'], df['left'])
ax = evaluation_by_left.plot(kind="bar", figsize=(25, 8), color=['blue', 'red'])
ax.set_title('lastevaluation of left')
ax.set_xlabel('last_evaluation')
ax.set_ylabel('Frequency')

In [None]:
# Number of rows per salary level.
df['salary'].value_counts()

In [None]:
# Number of rows per department.
df['Department'].value_counts()

In [None]:
# Mean satisfaction level for every (department, salary) combination.
table = df.pivot_table(
    values="satisfaction_level",
    index="Department",
    columns="salary",
    aggfunc="mean",
)
table

In [None]:
# Pairwise plots of the columns, coloured by the `left` flag.
sns.pairplot(df,hue= 'left',palette='Set2')

In [None]:
# Correlation heatmap. `Department` and `salary` are still strings at this
# point, and pandas >= 2.0 raises when asked to correlate non-numeric columns,
# so the frame is restricted to its numeric columns first.
plt.figure(figsize=(20,10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

# Missing values

In [None]:
# Fraction of missing values in each column.
df.isnull().mean()

In [None]:
## null count analysis
# Bar chart produced by missingno for the frame; the return value is kept in `p`.
import missingno as msno
p=msno.bar(df)

# Outlier detection

In [None]:
# Re-check column dtypes and non-null counts before the per-column plots.
df.info()

In [None]:
def plot(df, col):
    """Draw the distribution and a boxplot of one column, stacked vertically.

    Parameters
    ----------
    df : object indexable by column name (``df[col]``), e.g. a DataFrame.
    col : name of the column to visualise.

    The top axes shows a density-normalised histogram with a KDE overlay and
    the bottom axes shows a horizontal boxplot of the same values.
    Nothing is returned; the figure is drawn as a side effect.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1)
    # `sns.distplot` is deprecated; `histplot` with a KDE on a density scale
    # is its documented replacement.
    sns.histplot(df[col], kde=True, stat="density", ax=ax1)
    # Pass the data as `x=` explicitly: recent seaborn releases treat the first
    # positional argument as `data`, not as the x vector.
    sns.boxplot(x=df[col], ax=ax2, color='skyblue')

In [None]:
# Distribution and boxplot of satisfaction_level.
plot(df,"satisfaction_level")

In [None]:
# Distribution and boxplot of last_evaluation.
plot(df,"last_evaluation")

In [None]:
# Distribution and boxplot of number_project.
plot(df,"number_project")

In [None]:
# Distribution and boxplot of average_montly_hours (column name is spelled this way in the code).
plot(df,"average_montly_hours")

In [None]:
# Distribution and boxplot of time_spend_company.
plot(df,"time_spend_company")

# Duplicate

In [None]:
# Report the frame size before and after removing exact duplicate rows
# (the first occurrence of each duplicated row is kept).
print(f'Before drop duplicate: {df.shape}')
df = df.drop_duplicates(keep='first')
print(f'After drop duplicate: {df.shape}')


# Data encoding

In [None]:
from sklearn.preprocessing import LabelEncoder 


# Encode salary with an explicit low < medium < high order.
# LabelEncoder sorts labels alphabetically (high=0, low=1, medium=2), which
# scrambles the natural ordering that linear and distance-based models use.
salary_order = {'low': 0, 'medium': 1, 'high': 2}

# Guard so that re-running the cell on the already-encoded column is a no-op.
if not pd.api.types.is_numeric_dtype(df['salary']):
    unexpected = set(df['salary'].unique()) - set(salary_order)
    if unexpected:
        # Fail loudly instead of silently producing NaN codes.
        raise ValueError(f'Unexpected salary levels: {unexpected}')
    # `assign` builds a new frame, avoiding a chained-assignment warning on
    # the de-duplicated frame.
    df = df.assign(salary=df['salary'].map(salary_order))


In [None]:
# One-hot encode the object/category columns still present in the frame,
# dropping the first level of each, then preview the result.
df = pd.get_dummies(df,drop_first=True)
df.head()

# Train/test split

In [None]:
# Separate the target column ("left") from the feature matrix.
y = df['left']
x = df.drop(columns=['left'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the target
# so the train and test sets keep the same leaver/stayer ratio.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, train_size=.80, random_state=22, stratify=y
)
print(xtrain.shape)
print(xtest.shape)

## Data scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply the
# same transformation to both splits (the test split never influences the fit).
scale = MinMaxScaler()
scale.fit(xtrain)
xtrain = scale.transform(xtrain)
xtest = scale.transform(xtest)

## balancing

In [None]:
from imblearn.over_sampling import SMOTE  
 


# Oversample the training split with SMOTE. The seed is fixed so the synthetic
# samples, and every score computed from them, are reproducible across runs.
smt = SMOTE(random_state=22)
xtrain, ytrain = smt.fit_resample(xtrain, ytrain)
# Class counts after resampling.
np.bincount(ytrain)

## the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
import xgboost as xgb 
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection


In [None]:
# Sweep k = 1..9 and record the test-split accuracy of each KNN model.
# NOTE(review): k is being compared on the test split itself; a validation
# split or cross-validation would avoid tuning on the evaluation data.
k_range = list(range(1, 10))
score = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain, ytrain)
    ypred = knn.predict(xtest)
    score.append(accuracy_score(ytest, ypred))

plt.plot(k_range, score)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')

In [None]:
# Train vs. test accuracy of KNN for k = 1..29.
sns.set_palette('Set2')

neighbor_counts = list(range(1, 30))
test_scores = []
train_scores = []

for i in neighbor_counts:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(xtrain, ytrain)

    train_scores.append(knn.score(xtrain, ytrain))
    test_scores.append(knn.score(xtest, ytest))

plt.figure(figsize=(20,5))
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the two-positional-argument form raises a TypeError.
p = sns.lineplot(x=neighbor_counts, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=neighbor_counts, y=test_scores, marker='o', label='Test Score')

In [None]:
def _fit_and_evaluate(model):
    """Fit `model` on (xtrain, ytrain) and evaluate it on (xtest, ytest).

    Returns a tuple: (test predictions, confusion matrix, accuracy).
    """
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    matrix = confusion_matrix(ytest, predictions)
    accuracy = accuracy_score(ytest, predictions)
    return predictions, matrix, accuracy


# LogisticRegression
lr_c = LogisticRegression(C=0.01, penalty='l2', random_state=22)
lr_pred, lr_cm, lr_ac = _fit_and_evaluate(lr_c)

# MLP
MLP = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(20, 30),
    learning_rate_init=0.001,
    max_iter=200,
    solver='adam',
    random_state=22,
)
MLP_pred, MLP_cm, MLP_ac = _fit_and_evaluate(MLP)

# Bayes
gaussian = GaussianNB()
bayes_pred, bayes_cm, bayes_ac = _fit_and_evaluate(gaussian)

# SVM (result names keep the historical `svr_` prefix used by later cells)
svc_r = SVC(C=100, kernel='poly', degree=2, random_state=22)
svr_pred, svr_cm, svr_ac = _fit_and_evaluate(svc_r)

# RandomForest
rdf_c = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=22)
rdf_pred, rdf_cm, rdf_ac = _fit_and_evaluate(rdf_c)

# DecisionTree Classifier
dtree_c = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=3,
    splitter='best',
    random_state=22,
)
dtree_pred, dtree_cm, dtree_ac = _fit_and_evaluate(dtree_c)

# KNN
knn = KNeighborsClassifier(n_neighbors=2)
knn_pred, knn_cm, knn_ac = _fit_and_evaluate(knn)

In [None]:
# One confusion-matrix heatmap per model on a 2x4 grid.
# Each entry: (subplot title, confusion matrix, colour map).
confusion_panels = [
    ("LogisticRegression_cm", lr_cm, "Blues"),
    ("MLP", MLP_cm, "Blues"),
    ("bayes_cm", bayes_cm, "Oranges"),
    ("RandomForest", rdf_cm, "Blues"),
    ("SVM", svr_cm, "Reds"),
    ("DecisionTree_cm", dtree_cm, "Blues"),
    ("kNN_cm", knn_cm, "Blues"),
]

plt.figure(figsize=(20,10))
for position, (panel_title, matrix, cmap_name) in enumerate(confusion_panels, start=1):
    plt.subplot(2, 4, position)
    plt.title(panel_title)
    sns.heatmap(matrix, annot=True, cmap=cmap_name, fmt="d", cbar=False)

In [None]:
# Print the test-split accuracy of every model; labels carry their own tab padding.
accuracy_lines = [
    ('LogisticRegression_accuracy:\t', lr_ac),
    ('MLP_accuracy:\t\t\t', MLP_ac),
    ('RandomForest_accuracy:\t\t', rdf_ac),
    ('DecisionTree_accuracy:\t\t', dtree_ac),
    ('KNN_accuracy:\t\t\t', knn_ac),
    ('SVM_accuracy:\t\t\t', svr_ac),
    ('Bayes_accuracy:\t\t\t', bayes_ac),
]
for label, accuracy in accuracy_lines:
    print(label, accuracy)

In [None]:
# Model name -> test-split accuracy, in the order the models were fitted above.
accuracy_by_model = {
    'LogisticRegression': lr_ac,
    'MLP': MLP_ac,
    'Bayes': bayes_ac,
    'SVM': svr_ac,
    'RandomForest': rdf_ac,
    'DecisionTree_Classifier': dtree_ac,
    'KNN': knn_ac,
}
models = pd.DataFrame({
    'Model': list(accuracy_by_model.keys()),
    'Score': list(accuracy_by_model.values()),
})

# Ranked view (displayed only; `models` itself keeps its original order).
models.sort_values(by='Score', ascending=False).reset_index(drop=True)

In [None]:
# Horizontal bar chart of test-split accuracy per model.
colors = ["purple", "green", "orange", "magenta","blue","black","red"]

sns.set_style("whitegrid")
plt.figure(figsize=(20,8))
ax = sns.barplot(x=models['Score'], y=models['Model'], palette=colors)
# Labels are set after plotting and on the matching axes: scores run along x
# and model names along y. Scores are fractions in [0, 1], not percentages.
ax.set_xlabel("Accuracy")
ax.set_ylabel("Algorithms");


In [None]:
# 10-fold cross-validated accuracy of the decision tree on the full feature set.
tree_params = dict(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=3,
    splitter='best',
    random_state=22,
)
DT2 = DecisionTreeClassifier(**tree_params)

scores = cross_val_score(DT2, x, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())


In [None]:
# 10-fold cross-validated accuracy of logistic regression.
# The model uses the same hyper-parameters as the hold-out model (`lr_c`), and
# the scaler sits inside the pipeline so each fold is min-max scaled using its
# own training part only. This keeps the CV score comparable with the hold-out
# score, which was computed on scaled features.
lr_c2 = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(C=0.01, penalty='l2', random_state=22),
)
scores_Lr = cross_val_score(lr_c2, x, y, cv=10, scoring='accuracy')
print(scores_Lr)
print(scores_Lr.mean())

In [None]:
# 10-fold cross-validated accuracy of the MLP.
# The network is scale-sensitive and its hold-out score was computed on
# min-max-scaled features, so the scaler is fitted inside each fold via a
# pipeline instead of feeding raw features.
mlp2 = make_pipeline(
    MinMaxScaler(),
    MLPClassifier(
        activation='relu',
        hidden_layer_sizes=(20, 30),
        learning_rate_init=0.001,
        max_iter=200,
        solver='adam',
        random_state=22,
    ),
)
scores_mlp = cross_val_score(mlp2, x, y, cv=10, scoring='accuracy')
print(scores_mlp)
print(scores_mlp.mean())

In [None]:
# 10-fold cross-validated accuracy of Gaussian naive Bayes on the full feature set.
gb_2 = GaussianNB()
scores_gb = cross_val_score(
    gb_2,
    x,
    y,
    cv=10,
    scoring='accuracy',
)
print(scores_gb)
print(scores_gb.mean())

In [None]:
# 10-fold cross-validated accuracy of the polynomial SVC.
# The SVC is scale-sensitive and its hold-out score was computed on
# min-max-scaled features, so the scaler is fitted inside each fold via a
# pipeline instead of feeding raw features.
svc_2 = make_pipeline(
    MinMaxScaler(),
    SVC(C=100, kernel='poly', degree=2, random_state=22),
)
scores_svc = cross_val_score(svc_2, x, y, cv=10, scoring='accuracy')

print(scores_svc)
print(scores_svc.mean())

In [None]:
# 10-fold cross-validated accuracy of the random forest on the full feature set.
forest_params = dict(n_estimators=100, criterion='entropy', random_state=22)
rdf_2 = RandomForestClassifier(**forest_params)
scores_rd = cross_val_score(rdf_2, x, y, cv=10, scoring='accuracy')

print(scores_rd)
print(scores_rd.mean())

In [None]:
# 10-fold cross-validated accuracy of KNN (k=2).
# KNN is distance-based and its hold-out score was computed on
# min-max-scaled features, so the scaler is fitted inside each fold via a
# pipeline instead of feeding raw features.
knn_2 = make_pipeline(
    MinMaxScaler(),
    KNeighborsClassifier(n_neighbors=2),
)
scores_kn = cross_val_score(knn_2, x, y, cv=10, scoring='accuracy')

print(scores_kn)
print(scores_kn.mean())

In [None]:
# Three parallel lists in the same model order: display label, hold-out
# accuracy, and mean 10-fold cross-validation accuracy.
model = ['KNN','Randomforest','DesitionTree','NeuralNetwork','SVM','LogisticRegression','NaiveBayes']
treintest = [knn_ac,rdf_ac,dtree_ac,MLP_ac,svr_ac,lr_ac,bayes_ac]
# `scores` (no suffix) holds the decision-tree CV results.
cross = [scores_kn.mean(),scores_rd.mean(),scores.mean(),scores_mlp.mean(),scores_svc.mean(),scores_Lr.mean(),scores_gb.mean()]

In [None]:
# Side-by-side table of hold-out and cross-validation accuracy,
# ordered from best to worst hold-out accuracy.
comparison_columns = {
    'Model': model,
    'TreinTest Accuracy': treintest,
    'Cross Val Accuracy': cross,
}
Comparison = pd.DataFrame(comparison_columns).sort_values(
    by='TreinTest Accuracy', ascending=False
)

In [None]:
# Display the sorted table with a fresh 0..n-1 index (the result is not stored).
Comparison.reset_index(drop=True)

## voting

In [None]:
# 10-fold cross-validation of a voting ensemble on the training data.
# `shuffle=True` is required for two reasons:
#   * scikit-learn raises a ValueError when `random_state` is set while
#     `shuffle` is False;
#   * the SMOTE-resampled training data is not randomly ordered, so
#     contiguous folds would be heavily skewed towards one class.
# NOTE(review): the folds are drawn from already-oversampled data, so
# synthetic points derived from a fold's held-out rows can appear in its
# training part; treat the score as optimistic.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=22)

# Sub-models; stochastic ones are seeded so the result is reproducible.
estimators = [
    ('logistic', LogisticRegression()),
    ('cart', DecisionTreeClassifier(random_state=22)),
    ('svm', SVC()),
    ('xgboost', xgb.XGBClassifier(random_state=22)),
    ('Knn', KNeighborsClassifier()),
    ('Random forest', RandomForestClassifier(random_state=22)),
]

# Create the ensemble model and report its mean cross-validated score.
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, xtrain, ytrain, cv=kfold)
print(results.mean())