In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#plotly
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
        
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files of the HR analytics dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv",
        "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv",
    )
)

In [None]:
# Side-by-side missingno bar charts: train on the left, test on the right.
fig, axarr = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
for frame, axis in zip((train, test), axarr):
    msno.bar(frame, ax=axis)
fig.show()

In [None]:
# Preview rows whose education level is Primary School or High School.
# Original note: these rows have no major_discipline, so the author wants to
# label them "Ps_Hs".
train[train.education_level.isin(['Primary School', 'High School'])].head()

In [None]:
# Set major_discipline to "Ps_Hs" for every row whose education level is
# Primary School or High School. A single boolean-mask assignment per frame
# does this without a Python-level loop over the index.
school_levels = ['Primary School', 'High School']
train.loc[train.education_level.isin(school_levels), "major_discipline"] = "Ps_Hs"
test.loc[test.education_level.isin(school_levels), "major_discipline"] = "Ps_Hs"

In [None]:
# Remove the enrollee_id and city columns from both frames.
dropped_cols = ['enrollee_id', 'city']
train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)

In [None]:
# Missing gender values become their own "Not_Answer" category.
train["gender"] = train["gender"].fillna("Not_Answer")
test["gender"] = test["gender"].fillna("Not_Answer")

In [None]:
# Original note: almost every row is "no_enrollment", so that value is used
# to fill the missing entries.
train["enrolled_university"].value_counts()

In [None]:
# Fill missing enrolled_university values with "no_enrollment".
train["enrolled_university"] = train["enrolled_university"].fillna("no_enrollment")
test["enrolled_university"] = test["enrolled_university"].fillna("no_enrollment")

In [None]:
# Original note: almost every row is "Graduate", so that value is used to
# fill the missing entries.
train["education_level"].value_counts()

In [None]:
# Fill missing education_level values with "Graduate".
train["education_level"] = train["education_level"].fillna("Graduate")
test["education_level"] = test["education_level"].fillna("Graduate")

In [None]:
# Original note: almost every row is "STEM", so that value is used to fill
# the missing entries.
train["major_discipline"].value_counts()

In [None]:
# Fill missing major_discipline values with "STEM".
train["major_discipline"] = train["major_discipline"].fillna("STEM")
test["major_discipline"] = test["major_discipline"].fillna("STEM")

In [None]:
# Frequency of each raw experience label in the training frame.
train["experience"].value_counts()

In [None]:
def make_int(experience):
    """Convert a raw ``experience`` label into an integer.

    Args:
        experience: A label such as '>20', '<1', a numeric string like '7',
            or a missing value (NaN/None).

    Returns:
        21 for '>20', 0 for '<1', ``int(experience)`` for any other
        non-missing label, and the input unchanged when it is missing so it
        can be imputed later.

    Raises:
        ValueError: If a non-missing label cannot be parsed by ``int()``.
    """
    if experience == '>20':
        return 21
    if experience == '<1':
        return 0
    # Leave missing values untouched; int() cannot parse NaN.
    if pd.isna(experience):
        return experience
    # Parse the remaining labels so the column is numeric rather than a mix
    # of str and int objects.
    return int(experience)

# Apply make_int to the experience column of both frames.
train["experience"] = train["experience"].map(make_int)
test["experience"] = test["experience"].map(make_int)

# Known (non-missing) experience values as integers, kept for the summary
# statistics that follow.
a = train["experience"].dropna().astype(int)
b = test["experience"].dropna().astype(int)

# Distribution of the known training values.
sns.displot(a)

In [None]:
# Mean and median of the known experience values (a: train, b: test).
# Original note: the median, 9, is the value chosen for imputation.
a.mean(), b.mean(), a.median(), b.median()

In [None]:
# Impute missing experience with 9 in both frames.
train["experience"] = train["experience"].fillna(9)
test["experience"] = test["experience"].fillna(9)

In [None]:
# Missing company_size values become their own "Not_Answer" category.
train["company_size"] = train["company_size"].fillna("Not_Answer")
test["company_size"] = test["company_size"].fillna("Not_Answer")

In [None]:
# Missing company_type values become their own "Not_Answer" category.
train["company_type"] = train["company_type"].fillna("Not_Answer")
test["company_type"] = test["company_type"].fillna("Not_Answer")

In [None]:
# Frequency of each raw last_new_job label in the training frame.
train["last_new_job"].value_counts()

In [None]:
def make_int2(last):
    """Convert a raw ``last_new_job`` label into an integer.

    Args:
        last: A label such as '>4', 'never', a numeric string like '2', or a
            missing value (NaN/None).

    Returns:
        5 for '>4', 0 for 'never', ``int(last)`` for any other non-missing
        label, and the input unchanged when it is missing so it can be
        imputed later.

    Raises:
        ValueError: If a non-missing label cannot be parsed by ``int()``.
    """
    if last == '>4':
        return 5
    if last == 'never':
        return 0
    # Leave missing values untouched; int() cannot parse NaN.
    if pd.isna(last):
        return last
    # Parse the remaining labels so the column is numeric rather than a mix
    # of str and int objects.
    return int(last)
    
# Apply make_int2 to the last_new_job column of both frames.
train["last_new_job"] = train["last_new_job"].map(make_int2)
test["last_new_job"] = test["last_new_job"].map(make_int2)

# Known (non-missing) last_new_job values as integers, kept for the summary
# statistics that follow.
a = train["last_new_job"].dropna().astype(int)
b = test["last_new_job"].dropna().astype(int)

# Distribution of the known training values.
sns.displot(a)

In [None]:
# Mean and median of the known last_new_job values (a: train, b: test).
# Original note: 1 is the value chosen for imputation.
a.mean(), a.median(), b.mean(), b.median()

In [None]:
# Impute missing last_new_job values with 1. The right-hand side must read
# from last_new_job itself; reading from experience would replace the whole
# column with a copy of the experience feature.
train.last_new_job = train.last_new_job.fillna(1)
test.last_new_job = test.last_new_job.fillna(1)

In [None]:
# One-hot encode the categorical columns of both frames, dropping the first
# level of each.
# NOTE(review): the frames are encoded independently, which assumes both
# contain the same set of categories per column — confirm.
categorical_cols = [
    "gender",
    "enrolled_university",
    "relevent_experience",
    "education_level",
    "major_discipline",
    "company_size",
    "company_type",
]
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)

In [None]:
# Separate the label column from the feature matrix.
y = train["target"]
X = train.drop(columns="target")

In [None]:
# Hold out 30% of the labelled rows; the fixed random_state keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to the hold-out split and to the test frame. Fitting a separate
# scaler per frame would put each one on a different scale from the data the
# model is trained on, and would use statistics from held-out data.
transformer = RobustScaler().fit(X_train)

X_test = pd.DataFrame(transformer.transform(X_test), columns=X_test.columns)
test = pd.DataFrame(transformer.transform(test), columns=test.columns)
# X_train is transformed last only for readability; the scaler is already fitted.
X_train = pd.DataFrame(transformer.transform(X_train), columns=X_train.columns)

In [None]:
X_train.describe()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Sweep max_depth from 1 to 30 and record each forest's accuracy on the
# hold-out split. Despite its name, error_list stores accuracy scores.
error_list = []
for i in range(1, 31):
    model = RandomForestClassifier(max_depth=i, random_state=100).fit(X_train, y_train)
    pred = model.predict(X_test)
    error_list.append(accuracy_score(y_test, pred))

In [None]:
# Hold-out accuracy for each max_depth from 1 to 30 (position 0 is depth 1).
error_list

In [None]:
# Zero-based position of the highest accuracy. The sweep starts at
# max_depth=1, so the matching depth is this value plus one.
error_list.index(max(error_list))

In [None]:
# Refit a forest with max_depth=12 and report its accuracy on the hold-out split.
model = RandomForestClassifier(max_depth=12, random_state=100).fit(X_train, y_train)
pred = model.predict(X_test)

print(accuracy_score(y_test, pred))

In [None]:
# Confusion matrix of the hold-out predictions.
confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Per-class precision, recall and F1 for the hold-out predictions.
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Load the sample submission file and predict labels for the test frame.
result = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv"
)
pred = model.predict(test)

In [None]:
# Write the predictions into the "target" column. Bracket assignment always
# sets a column, whereas attribute assignment would only create a plain
# Python attribute if the column did not already exist.
result["target"] = pred

In [None]:
# Count of each predicted label. Original observation: most rows are predicted 0.
result["target"].value_counts()