In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Use seaborn's 'darkgrid' style for the figures drawn in this notebook.
sns.set_style('darkgrid')
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any deprecation or pandas warnings that later cells may raise.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading the Data

In [None]:
# Load the training CSV of the HR-analytics dataset and preview its first rows.
train_data= pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
train_data.head()

In [None]:
# Last rows of the frame.
train_data.tail()

In [None]:
# Column dtypes and non-null counts.
train_data.info()

In [None]:
# Summary statistics produced by describe() with its default settings.
train_data.describe()

In [None]:
# Number of missing values in each column.
train_data.isna().sum()

In [None]:
# Discard the enrollee identifier and the city code; neither is referenced again in this notebook.
train_data = train_data.drop(columns=['enrollee_id', 'city'])
train_data

**Splitting the columns into categorical and numerical lists for convenience in later steps**

In [None]:
num_cols= ['city_development_index' ,'training_hours']
cat_cols= train_data.drop(['city_development_index' ,'training_hours', 'target'], axis=1).columns

In [None]:
cat_cols

# Exploratory Data Analysis

In [None]:
# Automated exploratory report for the training frame.
# NOTE(review): upstream renamed the pandas_profiling package to ydata-profiling — confirm which one the runtime image provides.
import pandas_profiling
# profile_report() is presumably the DataFrame accessor made available by the import above — verify.
train_data.profile_report()

# Preprocessing the Data

In [None]:
train_data['company_size']= train_data['company_size'].replace('10/49', '10-49')

In [None]:
exp_counts= train_data['experience'].value_counts()
exp_counts

**`experience` has many distinct values, so every value that occurs fewer than 600 times is grouped into a single `other` category to limit the number of levels produced by encoding**

In [None]:
mask= train_data['experience'].isin(exp_counts[exp_counts<600].index)
train_data['experience'][mask]= 'other'
train_data['experience'].value_counts()

In [None]:
train_data.head()

# Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for each categorical column in turn,
# so after this cell `le` only holds the classes of the last column.
# NOTE(review): this runs before the missing-value section, so any NaN in
# these columns reaches LabelEncoder directly — confirm that is intended.
le = LabelEncoder()
encoded_columns = {name: le.fit_transform(train_data[name]) for name in cat_cols}
train_data = train_data.assign(**encoded_columns)


In [None]:
train_data

In [None]:
train_data.head()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(train_data.corr(), annot=True)

# Dealing with Missing Values

In [None]:
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

# Model-based imputation: each feature is regressed (linear regression) on its
# 2 nearest features, visiting the columns in 'roman' order, for at most 10 rounds.
imputer = IterativeImputer(
    estimator=lr,
    max_iter=10,
    n_nearest_features=2,
    imputation_order='roman',
    random_state=42,
)

# The label is excluded from imputation; the imputer returns a bare array, so
# the column names are attached again afterwards.
feature_frame = train_data.drop('target', axis=1)
imputed_values = imputer.fit_transform(feature_frame)
train_data_final = pd.DataFrame(imputed_values, columns=feature_frame.columns)



In [None]:
train_data_final.isna().sum()

In [None]:
train_data_final.head()

In [None]:
train_data['target']= train_data['target'].map(lambda x: 1 if x==1.0 else 0)

In [None]:
# Pairwise correlations of the imputed feature frame, annotated with the coefficient values.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(train_data_final.corr(), annot=True, ax=ax)

# Final check of the data after preprocessing

In [None]:
# One count plot per categorical column of the imputed frame.
for col in cat_cols:
    fig, ax = plt.subplots()
    # Pass the column explicitly as `x=`: seaborn deprecated positional data
    # vectors, and newer releases interpret the first positional argument as
    # `data` rather than `x`.
    sns.countplot(x=train_data_final[col], ax=ax)
    ax.set_title(col)

plt.show()

In [None]:
# One histogram per numerical column of the imputed frame.
for col in num_cols:
    fig, ax = plt.subplots()
    sns.histplot(train_data_final[col], ax=ax)

plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

X= train_data_final
y= train_data['target']
X_train, X_test,y_train,y_test= train_test_split(X,y, test_size=0.2, stratify=y,random_state=42)



In [None]:
X_train

In [None]:
y_train

**The numerical columns need to be standardized**

In [None]:
ss= StandardScaler()
X_train[num_cols]= ss.fit_transform(X_train[num_cols])
X_test[num_cols]= ss.transform(X_test[num_cols])

In [None]:
X_train

# Training the Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [None]:
key= ['LogisticRegression', 'DecisionTreeRegressor', 'DecisionTreeClassifier',  'RandomForestClassifier', 'KNeighborsClassifier', 'GaussianNB', 'XGBClassifier', 'SVC']

value= [LogisticRegression(), DecisionTreeRegressor() , DecisionTreeClassifier() ,  RandomForestClassifier() ,  KNeighborsClassifier(), GaussianNB() , XGBClassifier(), SVC()]
models= dict(zip(key, value))

In [None]:
models

In [None]:
# 5-fold cross-validated mean absolute error of every candidate on the full
# feature frame. The scorer returns negated errors, hence the sign flip.
# NOTE(review): for hard 0/1 class predictions this MAE is presumably the
# misclassification rate — confirm that is the intended metric.
scores = []
for name, estimator in models.items():
    fold_errors = -cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_absolute_error')
    scores.append(fold_errors)
    print(name, fold_errors.mean())

In [None]:
# Fit every candidate on the training split and record its score on the hold-out split.
accuracy_scores = []
for name, estimator in models.items():
    estimator.fit(X_train, y_train)
    # score() predicts on X_test internally, so the separate predict() call
    # whose result was never used has been dropped (it doubled the prediction
    # cost of every model).
    # For classifiers score() is the mean accuracy; for any regressor present
    # in `models` it would be R^2, which is not comparable.
    accuracy = estimator.score(X_test, y_test)
    accuracy_scores.append(accuracy)
    print(name, accuracy)

**XGBClassifier and RandomForestClassifier are the best models**

In [None]:
# Seeded random forest, fitted on the training split and evaluated on the hold-out split.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)


In [None]:
# Seeded XGBoost classifier, fitted on the training split and evaluated on the hold-out split.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

**XGBClassifier is the best model for this data**

In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
params = dict(
    objective=['binary:logistic'],
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 5, 10, 12],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0.5, 1, 1.5, 2],
)

# Base estimator for the search: 600 boosting rounds, everything else left at its default.
xgb = XGBClassifier(n_estimators=600)

# Tuning the Hyperparameters

In [None]:
# Randomized search over `params`: 5 sampled configurations, 5-fold CV each.
# random_state makes the sampled configurations reproducible across runs.
grid = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=5,
    cv=5,
    verbose=3,
    random_state=42,
)

# Search on the training split only. Fitting on the full (X, y) would let the
# hold-out rows influence the chosen hyper-parameters, and would also use the
# unscaled features instead of the standardised ones the final model is trained on.
grid.fit(X_train, y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_estimator_

In [None]:
grid.best_estimator_.fit(X_train,y_train)
y_pred= grid.best_estimator_.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Test Data

In [None]:
test_data= pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
test_data

In [None]:
test_data.drop(['enrollee_id', 'city'], axis=1, inplace=True)
test_data

In [None]:
# Apply the grouping learned on the training data: experience levels that
# occurred fewer than 600 times there become 'other' in the test set as well.
rare_levels = exp_counts[exp_counts < 600].index
mask = test_data['experience'].isin(rare_levels)
# Assign through .loc on the frame itself: the chained form
# test_data['experience'][mask] = ... may write to a temporary copy and leave
# test_data unchanged, depending on the pandas version / copy-on-write mode.
test_data.loc[mask, 'experience'] = 'other'
test_data

In [None]:
# Encode the test set with the category -> integer mapping of the TRAINING
# data. Calling fit_transform on the test columns would learn a new mapping
# from the test values, so the same category could receive a different code
# than the one the model was trained on whenever the two sets of observed
# categories differ.
# The training frame has already been overwritten with integer codes, so the
# raw training categories are rebuilt here with the same preprocessing steps.
train_reference = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
train_reference['company_size'] = train_reference['company_size'].replace('10/49', '10-49')
rare_experience = exp_counts[exp_counts < 600].index
train_reference.loc[train_reference['experience'].isin(rare_experience), 'experience'] = 'other'

# Mirror the company_size normalisation that was applied to the training data,
# otherwise '10/49' would be an unknown label for the training-fitted encoder.
test_data['company_size'] = test_data['company_size'].replace('10/49', '10-49')

for col in cat_cols:
    # transform() raises on a category never seen in training instead of
    # silently assigning it a shifted code.
    le = LabelEncoder().fit(train_reference[col])
    test_data[col] = le.transform(test_data[col])
test_data

In [None]:
test_data_final = imputer.transform(test_data)

test_data_final = pd.DataFrame(test_data_final, columns = test_data.columns)

In [None]:
test_data_final[num_cols]= ss.transform(test_data_final[num_cols])

In [None]:
test_data_final

In [None]:
test_data_final.info()

In [None]:
predictions= grid.best_estimator_.predict(test_data_final)

# Final Predictions on Test Dataset

In [None]:
my_array = predictions

# First print uses NumPy's current print options.
print(my_array)

# Lift the summarisation threshold so the second print shows every element.
np.set_printoptions(threshold=np.inf)
print(my_array)

