<br>
<h1 style = "font-size:40px; font-family:Garamond ; font-weight : normal; background-color: #C66363 ; color : #E8D6D8; text-align: center; border-radius: 100px 100px;">INTRODUCTION </h1>
<br>

* #### [Add Libraries](#1)
* #### [Load and Examine Data](#2)
    * ##### [Examine Data](#21)
    * ##### [Visualize Data](#22)
* #### [Preprocess Data](#3)
    * ##### [Dropping Outliers](#31)
    * ##### [Concatenating Train and Test Data](#32)
    * ##### [Fill Missing Values](#34)
* #### [Feature Engineering](#4)
    * ##### [Changing The Distribution Of Categories](#41)
    * ##### [Dummy Encoding](#43)
* #### [Make Models](#5) 
    * ##### [Separating Test and Train Data](#51)



<a id="1"> </a>
# Add Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import statsmodels.api as sm
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



<a id="2"> </a>
# Load and Examine Data

<a id="21"> </a>
## Examine Data

In [None]:
# Load the train and test CSV files; both paths are relative to the
# notebook's working directory.
train = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_test.csv")

In [None]:
# Report (rows, columns) of both raw frames.
print(f"train shape : {train.shape} , test shape : {test.shape} ")

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info() 

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

<a id="22"> </a>
## Visualize Data

In [None]:
# Correlate the numeric columns only. `train` also holds string-valued
# columns, and DataFrame.corr() raises on those with pandas >= 2.0 instead
# of silently skipping them; selecting numeric dtypes works on every version.
corr = train.select_dtypes(include="number").corr()

# Boolean mask that hides the upper triangle (diagonal included) so each
# pair of columns is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(10, 8))

cmap = sns.color_palette("ch:s=-.2,r=.3", as_cmap=True)

# Draw on the axes created above; the trailing `;` suppresses the Axes repr.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.1, cbar_kws={"shrink": .3}, ax=ax);

In [None]:
# Scatter of training_hours against city_development_index, coloured by
# education_level.
f, ax = plt.subplots(figsize=(25, 10))
ax.tick_params(labelsize=15)
# `sizes=` was dropped: it only applies when a `size=` variable is mapped,
# and none is. The plot is drawn on the axes created above.
sns.scatterplot(data=train, x="city_development_index", y="training_hours",
                hue="education_level", palette="Paired", ax=ax)
f.subplots_adjust(top=0.9)
# The previous title named columns of a different dataset (SalePrice, YearBuilt, ...).
f.suptitle('training_hours compared to city_development_index and education_level', fontsize="28");


In [None]:
# 2 x 3 grid of stacked histograms of `target`, one panel per categorical
# feature; each panel is titled with, and coloured by, that feature.
fig = plt.figure(figsize=(25, 12))
layout = fig.add_gridspec(2, 3)

# Panel order is row-major: first three on the top row, last three below.
hue_features = [
    'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'last_new_job',
]

for position, feature in enumerate(hue_features):
    row, col = divmod(position, 3)
    panel = fig.add_subplot(layout[row, col])
    panel.set_title(feature, fontsize=20)
    panel.tick_params(labelsize=12)
    sns.histplot(data=train, x="target", kde=False, ax=panel,
                 hue=feature, palette="Set3", multiple="stack")

fig.subplots_adjust(top=0.92)
fig.suptitle('Features vs Target', fontsize="28");

In [None]:
# Frequency of each company_size category in the training frame.
train['company_size'].value_counts()

<a id="3"> </a>
# Preprocess Data

<a id="32"> </a>
## Concatenating Train and Test Data

In [None]:
# Stack test under train so both go through the same preprocessing.
# `train_len` remembers where the original training rows end so the frame
# can be split again later.
# NOTE: `train` is overwritten here, so re-running this cell appends `test`
# a second time.
print(f"train shape : {train.shape} , test shape : {test.shape} ")
train_len = train.shape[0]
train = pd.concat([train, test], axis=0, ignore_index=True)
print(f"concatenate shape : {train.shape}")

<a id="34"> </a>
## Fill Missing Values

In [None]:
def find_missing_value(data):
    """Print every column of ``data`` that has missing values.

    One line is printed per affected column, giving the column label and
    its number of nulls; columns without nulls are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    null_counts = data.isnull().sum()
    # Keep only the columns that actually have something missing.
    for column, count in null_counts[null_counts > 0].items():
        print(f"Index : {column}, Value : {count}")

In [None]:
# List the columns of the combined frame that still contain nulls.
find_missing_value(train)

In [None]:
# Bar chart of how often each company_size value occurs.
# `data` must be the frame that named columns are looked up in; the previous
# call passed the same Series as both `x` and `data`, which newer seaborn
# releases reject.
ax = sns.countplot(data=train, x="company_size")

In [None]:
# Same chart restricted to rows whose last_new_job is ">4".
# Pass the filtered frame as `data` and name the column, rather than
# handing one Series to both `x` and `data`.
long_gap_rows = train[train["last_new_job"] == ">4"]
ax = sns.countplot(data=long_gap_rows, x="company_size")


In [None]:
# Impute missing education_level from major_discipline / enrolled_university.
#
# Two defects of the previous row-by-row loop are fixed here:
#   * `type(value) == "str"` compares a type object with a string and is
#     always False, so every missing row was set to "Graduate". The intended
#     "is a string" test is expressed as "is not null".
#     (assumes non-null values in these columns are strings -- confirm)
#   * `train[col].iloc[i] = ...` is chained assignment, which is not
#     guaranteed to write back into `train`; `.loc` on the frame is.
missing_education = train["education_level"].isnull()
has_major = train["major_discipline"].notnull()
has_enrollment = train["enrolled_university"].notnull()
not_enrolled = train["enrolled_university"] == "no_enrollment"

# Same operator precedence as the original expression: (A & B) | C.
high_school_rule = (has_major & not_enrolled) | has_enrollment

train.loc[missing_education & high_school_rule, "education_level"] = "High School"
train.loc[missing_education & ~high_school_rule, "education_level"] = "Graduate"
 

In [None]:
# Impute missing last_new_job from education_level.
# Written as one vectorised `.loc` assignment: the previous
# `train[col].iloc[i] = ...` form is chained assignment, which is not
# guaranteed to write back into `train`.
last_job_by_education = {
    "Masters": "1",
    "Graduate": "1",
    "High School": "never",
    "Phd": ">4",
}
missing_last_job = train["last_new_job"].isnull()
# Education levels outside the mapping (including missing) fall back to "never".
imputed_last_job = train["education_level"].map(last_job_by_education).fillna("never")
train.loc[missing_last_job, "last_new_job"] = imputed_last_job[missing_last_job]

In [None]:
# Impute missing major_discipline: "N" for rows whose education_level is
# High School or Primary School, "STEM" for every other row.
# `.loc` on the frame replaces the chained `train[col].iloc[i] = ...`
# assignment, which is not guaranteed to write back into `train`.
missing_major = train["major_discipline"].isnull()
school_level = train["education_level"].isin(["High School", "Primary School"])

train.loc[missing_major & school_level, "major_discipline"] = "N"
train.loc[missing_major & ~school_level, "major_discipline"] = "STEM"

In [None]:
# Impute missing enrolled_university: "Full time course" for High School
# rows, "no_enrollment" for every other row.
# `.loc` on the frame replaces the chained `train[col].iloc[i] = ...`
# assignment, which is not guaranteed to write back into `train`.
missing_enrollment = train["enrolled_university"].isnull()
is_high_school = train["education_level"] == "High School"

train.loc[missing_enrollment & is_high_school, "enrolled_university"] = "Full time course"
train.loc[missing_enrollment & ~is_high_school, "enrolled_university"] = "no_enrollment"

In [None]:
# Impute missing company_size from last_new_job.
#
# The previous conditions were written as `value == "1" or "never"`, which
# Python reads as `(value == "1") or "never"`. The non-empty string is always
# truthy, so the first branch always ran and every missing row received
# "5000-9999". An explicit mapping restores the intended branches.
# `.loc` also replaces the chained `train[col].iloc[i] = ...` assignment.
company_size_by_last_job = {
    "1": "5000-9999",
    "never": "5000-9999",
    "2": "50-99",
    "3": "50-99",
    "4": "100-500",
}
missing_company_size = train["company_size"].isnull()
# Any other last_new_job value (the original `else` branch) maps to "10000+".
imputed_company_size = train["last_new_job"].map(company_size_by_last_job).fillna("10000+")
train.loc[missing_company_size, "company_size"] = imputed_company_size[missing_company_size]


In [None]:
# Fill the remaining gaps in these two columns with a fixed default each.
for column, default in {"experience": ">20", "gender": "Male"}.items():
    train[column] = train[column].fillna(default)

In [None]:
# Remove the columns that are not used as model features.
train = train.drop(columns=['company_type', 'city_development_index', 'enrollee_id'])

<a id="4"> </a>
# Feature Engineering

<a id="43"> </a>
## Dummy Encoding

In [None]:
def get_cat_idx(data=None):
    """Return the positional indices of the object-dtype columns.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``train`` frame
        is used, which keeps the original zero-argument call working.

    Returns
    -------
    list of int
        Column positions, in column order, whose dtype is ``object``.
    """
    if data is None:
        # Backward-compatible default: read the notebook-level frame.
        data = train
    return [position for position, dtype in enumerate(data.dtypes) if dtype == "O"]

In [None]:
# Positions of the object-dtype columns of `train`.
train_cat_idx = get_cat_idx() 

In [None]:
# Translate the categorical column positions into column labels.
col_cat_name = [train.columns[position] for position in train_cat_idx]

In [None]:
# Preview the frame before encoding; head() keeps the output to a few rows
# instead of rendering the whole frame.
train.head()

In [None]:
# One-hot encode every categorical column in a single call. Each listed
# column is replaced by its indicator columns, which are appended after the
# untouched columns in the order given by `col_cat_name`.
train = pd.get_dummies(train, columns=col_cat_name)

<a id="5"> </a>
# Make Models

<a id="51"> </a>
## Separating Test and Train Data

In [None]:
# Undo the earlier concatenation: rows from `train_len` onward are the test
# rows, everything before them is the training data.
# `drop` returns a new frame, so nothing is mutated in place on a slice of
# `train` (the previous `inplace=True` drop on a slice triggers
# SettingWithCopyWarning).
X_test = train.iloc[train_len:].drop(columns="target")
train = train.iloc[:train_len]

In [None]:
# Check the shapes after splitting the combined frame back apart.
print(f"train shape : {train.shape} , test shape : {X_test.shape} ")

In [None]:
# Separate the label from the features; the label is stored as a
# categorical Series.
y_train = train["target"].astype("category")
X_train = train.drop(columns="target")

In [None]:
#X_train = X_train.to_numpy()
#X_test = test.to_numpy()
#y_train = (np.array(y_train)).astype("str")

In [None]:
# Hold out 20% of the training rows for validation (fixed seed).
# `train_test_split` is already imported in the imports cell at the top of
# the notebook, so the duplicate import was removed from here.
# NOTE: X_train / y_train are overwritten, so re-running this cell splits
# the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
random_state = 42

# Candidate estimators and their hyper-parameter grids. The two lists are
# paired by position, so they must have the same length and order.
#
# LogisticRegression used to sit in `classifier` without a matching grid.
# That shifted the pairing: it was searched with the KNN grid (invalid
# parameters for it) and KNeighborsClassifier had no grid at all. It is
# removed so the estimators line up with the four grids below.
classifier = [DecisionTreeClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             KNeighborsClassifier()]

dt_param_grid = {"min_samples_split" : range(10,500,20),
                "max_depth": range(1,20,2)}

svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [0.05, 0.25, 1],
                 "C": [5,50,150,350]}

rf_param_grid = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}

classifier_param = [dt_param_grid,
                   svc_param_grid,
                   rf_param_grid,
                   knn_param_grid]

# Guard against the two lists drifting apart again.
assert len(classifier) == len(classifier_param), "each estimator needs exactly one parameter grid"

In [None]:
# Grid-search each estimator with 5-fold stratified CV on accuracy, keeping
# the best score and the refitted best estimator in list order.
cv_result, best_estimators = [], []
for idx, estimator in enumerate(classifier):
    folds = StratifiedKFold(n_splits = 5)
    search = GridSearchCV(estimator, param_grid=classifier_param[idx], cv = folds,
                          scoring = "accuracy", n_jobs = -1, verbose = 1)
    search.fit(X_train, y_train)
    cv_result.append(search.best_score_)
    best_estimators.append(search.best_estimator_)
    print(cv_result[-1])

In [None]:
# Best cross-validation accuracy per model, as a horizontal bar chart.
# The labels are listed by hand and must follow the order of `cv_result`.
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier",
             "KNeighborsClassifier"]})

# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally and raise TypeError for positional x/y.
g = sns.barplot(x="Cross Validation Means", y="ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores");

In [None]:
# Soft-voting ensemble of the tuned decision tree (index 0) and random
# forest (index 2) from `best_estimators`, scored on the validation split.
votingC = VotingClassifier(estimators = [("dt",best_estimators[0]),
                                        ("rfc",best_estimators[2])],
                                        voting = "soft", n_jobs = -1)
votingC = votingC.fit(X_train, y_train)
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print(accuracy_score(y_val, votingC.predict(X_val)))