In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
# NOTE: plot_roc_curve was removed in scikit-learn 1.2; this import needs an older release.
from sklearn.metrics import classification_report,confusion_matrix,plot_roc_curve,accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# import warnings
# warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)#to make all columns visible

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the HR attrition CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/employee-attrition/HR-Employee-Attrition.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

In [None]:
# Number of missing values per column.
# isna() yields a boolean mask; sum() counts its True cells. (count() would instead
# return the number of non-null mask cells, i.e. the row count for every column.)
df.isna().sum()

In [None]:
# Frequency of each distinct Age value, most common first.
df.Age.value_counts()

In [None]:
# Encode the target as 1 ('Yes') / 0 (anything else).
# The guard keeps the cell safe to re-run: once the column is numeric, comparing it
# with 'Yes' again would silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = np.where(df['Attrition'] == 'Yes', 1, 0)

In [None]:
# List all column labels of the dataset.
df.columns

In [None]:
# Print the per-column dtype / non-null summary of the dataset.
df.info()

In [None]:
# Number of distinct values in YearsWithCurrManager.
df.YearsWithCurrManager.nunique()

In [None]:
# Number of distinct values in NumCompaniesWorked.
df.NumCompaniesWorked.nunique()

In [None]:
# Frequency of each JobSatisfaction value.
df.JobSatisfaction.value_counts()

In [None]:
# Display the DistanceFromHome column.
df.DistanceFromHome

In [None]:
# Count EducationField values within each Education group.
df.groupby('Education')['EducationField'].value_counts()

In [None]:
# Count EmployeeCount values within each Department group.
df.groupby('Department')['EmployeeCount'].value_counts()

In [None]:
# Frequency of each distinct MonthlyRate value.
df.MonthlyRate.value_counts()

In [None]:
# Count JobRole values within each Department group.
df.groupby('Department')['JobRole'].value_counts()

In [None]:
# Number of distinct MonthlyRate values.
df.MonthlyRate.nunique()

In [None]:
# Preview the first five rows again (Attrition is numeric by this point).
df.head(n=5)

In [None]:
# Pairwise correlation between the three pay-rate columns.
rate_columns = ['MonthlyRate', 'DailyRate', 'HourlyRate']
rates = df[rate_columns]
rates.corr()

In [None]:
# Build the modelling frame without these four columns.
# NOTE(review): presumably dropped because they are constant or pure identifiers — confirm.
final_ds = df.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
)

In [None]:
# Dimensions of the modelling frame after the column drop, as (rows, columns).
final_ds.shape

In [None]:
# Names of every non-numeric column in final_ds; these are the ones that get one-hot encoded.
df_category_columns = final_ds.select_dtypes(exclude='number').columns
df_category_columns

In [None]:
# One-hot encode the categorical columns (every level kept) on final_ds's index.
encoded_df = pd.DataFrame(
    data=pd.get_dummies(df.loc[:, df_category_columns], drop_first=False),
    index=final_ds.index,
)

In [None]:
# Dimensions of the one-hot encoded frame as (rows, columns).
encoded_df.shape

In [None]:
# Preview the first five rows of the one-hot encoded columns.
encoded_df.head(n=5)

In [None]:
# Remove the original categorical columns now that encoded_df holds their one-hot form.
# Re-assigning (instead of inplace=True) with errors='ignore' keeps this cell re-runnable:
# a second execution finds the columns already gone and leaves the frame unchanged
# rather than raising KeyError.
final_ds = final_ds.drop(columns=df_category_columns, errors='ignore')
final_ds.head()

In [None]:
# Place the numeric features and the one-hot columns side by side.
final_concat_df = pd.concat((final_ds, encoded_df), axis='columns')
final_concat_df.shape

In [None]:
# Separate the target (Attrition) from the feature matrix.
y = final_concat_df['Attrition']
x = final_concat_df.drop(columns='Attrition')

In [None]:
# Shapes of the feature matrix and the target vector.
x.shape,y.shape

In [None]:
# Fit a standard scaler on the full feature matrix.
sc = StandardScaler().fit(x)
sc

In [None]:
# Apply the fitted scaler and wrap the result in a frame with the original column names.
scaled_values = sc.transform(x)
x_scaled = pd.DataFrame(data=scaled_values, columns=x.columns)
x_scaled.head(n=5)

In [None]:
# Hold out 30% of the rows for testing (fixed seed so the split is reproducible).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123
)

# Standardise the features that the models actually see. Previously the split was
# taken from the unscaled `x`, so the distance-based KNN was dominated by
# large-magnitude columns. The scaler is fitted on the training rows only so that no
# test-set statistics leak into training.
split_scaler = StandardScaler().fit(x_train_raw)
x_train = pd.DataFrame(
    split_scaler.transform(x_train_raw), columns=x.columns, index=x_train_raw.index
)
x_test = pd.DataFrame(
    split_scaler.transform(x_test_raw), columns=x.columns, index=x_test_raw.index
)

In [None]:
# Class balance of the training target (count of each Attrition label).
y_train.value_counts()

In [None]:
def metrics(y_test,y_pred):
    """Print the confusion matrix, accuracy and classification report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The three reports are written to stdout.
    """
    matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print('Confusion Matrix:\n', matrix)
    print('\n\nAccuracy Score:\n', accuracy)
    print('\n\nClassification Report: \n', report)

In [None]:
# Logistic-regression classifier; max_iter is set to 10000 solver iterations.
lg=LogisticRegression(max_iter=10000)


In [None]:
# Train the logistic-regression model on the training split.
lg.fit(x_train,y_train)

In [None]:
# Logistic-regression predictions for the training and the test split.
train_pred, test_pred = (lg.predict(features) for features in (x_train, x_test))

In [None]:
# Report logistic-regression performance on BOTH splits. Only the training metrics
# were shown before, so test_pred was computed but never evaluated and the model
# could not be compared with the KNN / naive-Bayes test results.
print('=== Train ===')
metrics(y_train,train_pred)
print('\n=== Test ===')
metrics(y_test,test_pred)

In [None]:
# Baseline k-nearest-neighbours model with k=5, using all CPU cores.
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=5).fit(x_train, y_train)
knn

In [None]:
# KNN predictions for the training and the test split.
knn_train_pred, knn_test_pred = (
    knn.predict(features) for features in (x_train, x_test)
)

In [None]:
# KNN performance on the training split.
metrics(y_train,knn_train_pred)

In [None]:
# KNN performance on the held-out test split.
metrics(y_test,knn_test_pred)

In [None]:
# Gaussian naive-Bayes classifier trained on the training split.
NB = GaussianNB().fit(x_train, y_train)
NB

In [None]:
# Naive-Bayes predictions for the training and the test split.
train_pred_nb, test_pred_nb = (
    NB.predict(features) for features in (x_train, x_test)
)

In [None]:
# Naive-Bayes performance on the training split.
metrics(y_train,train_pred_nb)

In [None]:
# Naive-Bayes performance on the held-out test split.
metrics(y_test,test_pred_nb)

In [None]:
# Estimate the misclassification rate for k = 1..39.
# Each candidate is scored with 5-fold cross-validation on the TRAINING data only.
# Scoring candidates against x_test / y_test (as before) tunes the hyperparameter on
# the held-out set, which makes the final test metrics optimistically biased.
error_rate = []

# Will take some time
for i in range(1, 40):
    candidate_knn = KNeighborsClassifier(n_neighbors=i, n_jobs=-1)
    cv_accuracy = cross_val_score(
        candidate_knn, x_train, y_train, cv=5, scoring='accuracy'
    )
    # error rate = 1 - mean accuracy across the folds
    error_rate.append(1 - cv_accuracy.mean())


In [None]:
# Plot the error rate against k.
# The x-axis is derived from error_rate (whose first entry belongs to k=1) instead of
# repeating the loop bounds, so the plot cannot drift out of sync with the search range.
k_values = range(1, len(error_rate) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')
plt.show()  # render the figure without echoing the last call's Text repr

In [None]:
# Refit KNN with k=10 on the training split, using all CPU cores.
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=10).fit(x_train, y_train)
knn

In [None]:
# Predictions of the k=10 KNN model for the training and the test split.
train_pred_knn, test_pred_knn = (
    knn.predict(features) for features in (x_train, x_test)
)

In [None]:
# Performance of the k=10 KNN model on the training split.
metrics(y_train,train_pred_knn)

In [None]:
# Performance of the k=10 KNN model on the held-out test split.
metrics(y_test,test_pred_knn)