In [None]:
from sklearn.datasets import make_blobs

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

In [None]:
# Read the two CSV files from the working directory and report their sizes.
data_train = pd.read_csv("./train.csv")
data_test = pd.read_csv("./test.csv")
print("Shape of Training Data:", data_train.shape)
print("Shape of Test Data:", data_test.shape)

# Keep the test-file employee IDs around: the "Employee ID" column is dropped
# from data_test further down. (The former `test_EmpId = {}` placeholder was
# overwritten immediately and has been removed.)
test_EmpId = data_test["Employee ID"]
data_train.head()

In [None]:
# Preview the first rows of the test frame.
data_test.head()

In [None]:
# Binary target: 1 where Happiness is at least 0.5, otherwise 0.
# A missing Happiness compares as False and therefore also becomes 0.
data_train['HappyCond'] = data_train["Happiness"].ge(0.5).astype(int)
data_train.head()

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" after each summary.
data_train.info()
data_test.info()

In [None]:
import datetime as dt

# Parse the joining date of both frames into pandas datetimes.
for frame in (data_train, data_test):
    frame["Date of Joining"] = pd.to_datetime(frame["Date of Joining"])
data_train.dtypes

In [None]:
# Stamp every row of both frames with the current date; the tenure column is
# later computed relative to this value.
dt_today = dt.date.today()
for frame in (data_train, data_test):
    frame["today"] = dt_today
data_train.head()

In [None]:
# Convert the "today" stamp of each frame to a pandas datetime so it can be
# subtracted from the joining date.
for frame in (data_train, data_test):
    frame["today"] = pd.to_datetime(frame["today"])

In [None]:
# Tenure = time elapsed between the joining date and the "today" stamp.
data_train['tenure'] = data_train['today'].sub(data_train["Date of Joining"])
data_test['tenure'] = data_test['today'].sub(data_test["Date of Joining"])
data_train.head()

In [None]:
# Inspect the column dtypes now that the tenure column has been added.
data_train.dtypes

In [None]:
import warnings
# Kept from the original cell: hides FutureWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Express tenure as a whole number of days. Casting the timedelta column with
# astype(int) produced raw nanosecond counts through a deprecated cast; the
# .dt.days accessor is the supported route and gives a readable unit.
# The dtype guard makes the cell safe to re-run after the column is numeric.
# NOTE: rows with a missing joining date yield NaN here (float column).
for frame in (data_train, data_test):
    if pd.api.types.is_timedelta64_dtype(frame["tenure"]):
        frame["tenure"] = frame["tenure"].dt.days
data_train.dtypes

In [None]:
# Column dtypes of the test frame after the tenure conversion.
data_test.dtypes

In [None]:
# Remove the identifier and the two date columns from both frames; tenure has
# already been derived from the dates.
unused_columns = ["Employee ID", "Date of Joining", "today"]
data_train = data_train.drop(columns=unused_columns)
data_test = data_test.drop(columns=unused_columns)

# info() prints on its own and returns None; print(info()) added a stray "None".
data_train.info()
data_test.info()

In [None]:
# Discard every training row whose target value is missing.
target_column = 'Happiness'
data_train = data_train.dropna(axis=0, how='any', subset=[target_column])
data_train.info()

In [None]:
# Impute remaining missing values with medians.
# Work on an explicit copy: `df_train = data_train` only created a second name
# for the same object, so the imputation silently modified data_train as well.
df_train = data_train.copy()
for col in ['Resource Allocation', 'Mental Fatigue Score']:
    train_median = df_train[col].median()
    df_train[col] = df_train[col].fillna(train_median)
    # The test frame was never imputed. Fill it with the statistic learned
    # from the training data so no information flows from test to train.
    if col in data_test.columns:
        data_test[col] = data_test[col].fillna(train_median)

df_train.info()
df_train.describe()

In [None]:
# Plotting libraries used by the exploratory cells that follow.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Inline-rendering magic, left commented out by the author.
#%matplotlib inline
# Apply seaborn's styling defaults to all subsequent figures.
sns.set(color_codes=True)

In [None]:
# Frequency of each category for the three categorical features, one panel each.
# The column is named via x=/data= (as the other plotting cells do): current
# seaborn releases interpret the first positional argument as `data`, so
# passing the Series positionally no longer counts its categories.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,5))
for axis, column in zip(ax, ["Gender", "Company Type", "WFH Setup Available"]):
    sns.countplot(x=column, data=df_train, palette="winter", ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of Happiness for each pairing of categorical features:
# (x-axis category, hue category), drawn left to right.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
panels = [
    ("Gender", "Company Type"),
    ("Gender", "WFH Setup Available"),
    ("WFH Setup Available", "Company Type"),
]
for axis, (x_col, hue_col) in zip(ax, panels):
    sns.boxenplot(x=x_col, y="Happiness", hue=hue_col, data=df_train, palette="winter", linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Mean of the binary HappyCond flag for each pairing of categorical features:
# (x-axis category, hue category), drawn left to right.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
panels = [
    ("Gender", "Company Type"),
    ("Gender", "WFH Setup Available"),
    ("WFH Setup Available", "Company Type"),
]
for axis, (x_col, hue_col) in zip(ax, panels):
    sns.barplot(x=x_col, y="HappyCond", hue=hue_col, data=df_train, palette="winter", linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Continuous columns (target included) used for the correlation analysis.
cont = ["Happiness", "Resource Allocation", "Mental Fatigue Score", "tenure"]
df_cont = df_train.loc[:, cont]

In [None]:
k = 4
# Correlations between each variable
corrmat = df_cont.corr()
# Take k elements in descending order of coefficient
cols = corrmat.nlargest(k, "Happiness")["Happiness"].index

# The original cell opened a figure but never drew on it, so the notebook
# showed an empty canvas and the correlations were never displayed.
# Render the selected sub-matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(corrmat.loc[cols, cols], annot=True, fmt=".2f", square=True, ax=ax)
ax.set_title("Correlation with Happiness (top {} variables)".format(k))
plt.tight_layout()
plt.show()

In [None]:
# Happiness against each continuous feature, points coloured by Gender.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
x_columns = ["Mental Fatigue Score", "Resource Allocation", "tenure"]
for axis, x_col in zip(ax, x_columns):
    sns.scatterplot(x=x_col, y="Happiness", hue="Gender", data=df_train, linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Happiness against each continuous feature, points coloured by WFH availability.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
x_columns = ["Mental Fatigue Score", "Resource Allocation", "tenure"]
for axis, x_col in zip(ax, x_columns):
    sns.scatterplot(x=x_col, y="Happiness", hue="WFH Setup Available", data=df_train, linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of Mental Fatigue Score within each categorical feature.
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,5))
group_columns = ["Gender", "Company Type", "WFH Setup Available"]
for axis, group_col in zip(ax, group_columns):
    sns.boxenplot(x=group_col, y="Mental Fatigue Score", data=df_train, palette="winter", linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
list_cols = ['Gender','Company Type','WFH Setup Available']

# Fit ONE encoder on the training categoricals and reuse it for the test frame.
# The _1/_2 column suffixes come from the mapping learned during fit, so two
# independently fitted encoders are not guaranteed to assign the same category
# to the same suffix; "Gender_1" could then mean different things in the two
# frames. The encoder is fitted on the categorical columns only, so it can be
# applied to the test frame even though that frame lacks the target columns.
ce_ohe = ce.OneHotEncoder(cols=list_cols)
train_encoded = ce_ohe.fit_transform(df_train[list_cols])
test_encoded = ce_ohe.transform(data_test[list_cols])

# Swap the raw categorical columns for their indicator columns (row index is preserved).
df_train = pd.concat([train_encoded, df_train.drop(columns=list_cols)], axis=1)
data_test = pd.concat([test_encoded, data_test.drop(columns=list_cols)], axis=1)

# Kept so that any code still referring to the second encoder keeps working.
ce_ohe2 = ce_ohe

df_train.head()

In [None]:
# Features: everything except the raw target, the derived label and the second
# indicator column of each encoded categorical feature.
non_features = ['Happiness','Gender_2','Company Type_2','WFH Setup Available_2','HappyCond']
X = df_train.drop(columns=non_features)
# Label: the binary HappyCond flag.
y = df_train['HappyCond']

In [None]:
# Drop the same second indicator columns from the test frame as from X.
redundant_dummies = ['Gender_2','Company Type_2','WFH Setup Available_2']
data_test = data_test.drop(columns=redundant_dummies)

In [None]:
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the labelled data, with a fixed seed for repeatability.
split_parts = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Remember the feature names so they can be re-attached after scaling.
cols = X.columns
# Inspect the dtypes of the hold-out features.
X_test.dtypes

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Wrap the scaled splits in DataFrames again (fresh default index and columns).
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

In [None]:
# Re-attach the saved feature names to both scaled splits.
for frame in (X_train, X_test):
    frame.columns = cols

In [None]:
# Gaussian Naive Bayes classifier trained on the scaled training split.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
gnb = GaussianNB().fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the hold-out split.
y_pred = gnb.predict(X_test)
gnb_pred = gnb.predict_proba(X_test)
gnb_pred

In [None]:
from sklearn.metrics import accuracy_score

# Share of hold-out rows whose predicted label matches the true label.
holdout_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(holdout_accuracy))

In [None]:
# Wrap the predict_proba output for the hold-out split in a DataFrame
# (one row per hold-out sample) and display it.
df = pd.DataFrame(gnb_pred)
df

In [None]:
# Label every employee of the *test file*.
# The original loop walked over the probabilities of the hold-out split (rows
# carved out of train.csv) and printed them next to the Employee IDs read from
# test.csv, so each printed ID carried a prediction belonging to someone else.
# Here the test frame itself is scaled and scored, keeping IDs and predictions aligned.
# NOTE(review): assumes data_test was encoded consistently with the training
# frame, i.e. its indicator columns stand for the same categories. Confirm.
test_features = data_test[cols].fillna(X.median(numeric_only=True))  # training medians for any gaps
test_scaled = pd.DataFrame(sc.transform(test_features), columns=cols, index=data_test.index)
test_proba = gnb.predict_proba(test_scaled)

# Same decision rule as before: "Unhappy" only when P(class 0) is strictly larger.
labels = np.where(test_proba[:, 0] > test_proba[:, 1], "Unhappy", "Happy")
happinessLevel = dict(zip(data_test.index, labels.tolist()))
c = int((labels == "Happy").sum())  # number of employees labelled "Happy"

for index, label in happinessLevel.items():
    # f-string formatting also copes with non-string IDs.
    print(f"Employee ID - {test_EmpId[index]} - {label}")
print(c)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out predictions.
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

# Caption for each cell of the matrix, addressed as (row, column).
cell_captions = [
    ('\nTrue Negative(TN) = ', (0, 0)),
    ('\nFalse Positives(FP) = ', (0, 1)),
    ('\nFalse Negative(FN) = ', (1, 0)),
    ('\nTrue Positive(TP) = ', (1, 1)),
]
for caption, position in cell_captions:
    print(caption, cm[position])


In [None]:
# Number of hold-out rows per class label.
y_test.value_counts()

In [None]:
# Class balance of the label in the labelled data set (left) and in the
# hold-out split (right).
# The Series are passed as x=: current seaborn releases interpret the first
# positional argument as `data`, so a positional Series is no longer counted
# per category.
# NOTE(review): the left panel uses all of df_train (which includes the
# hold-out rows), not y_train. Confirm that this is intended.
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(10,5))
sns.countplot(x=df_train["HappyCond"], palette="winter", ax=ax[0]).set_title("Data for Training Model")
sns.countplot(x=y_test, palette="winter", ax=ax[1]).set_title("Data for Testing Model")
plt.tight_layout()
plt.show()
print("0 : Not Happy")
print("1 : Happy")

In [None]:
# Report how many hold-out employees fall into each class.
class_counts = y_test.value_counts()
values = class_counts.index.tolist()
counts = class_counts.tolist()
for value, count in zip(values, counts):
    if value == 0:
        prefix = "Number of 'NOT HAPPY' Employees are: "
    else:
        prefix = "Number of 'HAPPY' Employees are: "
    print(prefix + str(count))