In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the training split (aug_train.csv) into `data`.
data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
# Read the test split (aug_test.csv) into `val_data`.
# NOTE(review): in the cells visible here `val_data` is only inspected (shape/head);
# none of the encoding or imputation steps below are applied to it.
val_data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

In [None]:
# (rows, columns) of the training frame and of the test frame, shown side by side.
data.shape , val_data.shape

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Preview the first rows of the test frame.
val_data.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the training frame.
data.describe()

In [None]:
# Number of missing entries in each column of the training frame.
data.isna().sum()

In [None]:
# Columns stored with object dtype are treated as the categorical features.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']
cat_columns

In [None]:
# Show the distinct values (NaN included) of every categorical column.
for column in cat_columns:
    distinct_values = data[column].unique()
    print(distinct_values)

In [None]:
# (rows, columns) of the training frame.
# NOTE(review): nothing has modified `data` since its shape was first displayed,
# so this cell repeats earlier output.
data.shape

In [None]:
# Recompute the list of object-dtype (categorical) columns.
# NOTE(review): `data` is unchanged since this list was first built, so the result is the same.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']

In [None]:
# Display the names of the categorical columns.
cat_columns

In [None]:
# Distinct values of each categorical column; these are the keys the
# encoding dictionaries in the next cell have to cover.
for column in cat_columns:
    distinct_values = data[column].unique()
    print(distinct_values)

In [None]:

# Integer encodings for the categorical columns.
#
# Each dictionary maps the raw string found in the CSV to an integer code.
# Values absent from a dictionary (including NaN) become NaN when the
# dictionary is applied with Series.map.
#
# Ordered categories (education level, experience, company size, last new job)
# use codes that follow the natural order of the category, because the codes
# are later averaged and rounded by the k-NN imputer: a mean of neighbouring
# codes must land on a category that actually lies "between" them.

# Nominal: the codes carry no order.
gender_map = {
    'Female': 2,
    'Male': 1,
    'Other': 0,
}

# Binary flag (spelling "relevent" matches the raw data values).
relevent_experience_map = {
    'Has relevent experience': 1,
    'No relevent experience': 0,
}

# NOTE(review): codes are kept as originally assigned; if an intensity order is
# wanted, 'Part time course' would sit between the other two -- confirm intent.
enrolled_university_map = {
    'no_enrollment': 0,
    'Full time course': 1,
    'Part time course': 2,
}

# Ordinal: lowest to highest attained level.
education_level_map = {
    'Primary School': 0,
    'High School': 1,
    'Graduate': 2,
    'Masters': 3,
    'Phd': 4,
}

# Nominal: the codes carry no order.
major_map = {
    'STEM': 0,
    'Business Degree': 1,
    'Arts': 2,
    'Humanities': 3,
    'No Major': 4,
    'Other': 5,
}

# Ordinal: years of experience; '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21.
experience_map = {
    '<1': 0,
    **{str(years): years for years in range(1, 21)},
    '>20': 21,
}

# Nominal: the codes carry no order.
company_type_map = {
    'Pvt Ltd': 0,
    'Funded Startup': 1,
    'Early Stage Startup': 2,
    'Other': 3,
    'Public Sector': 4,
    'NGO': 5,
}

# Ordinal: smallest to largest head-count band.
# (Previously the codes followed the lexicographic order of the labels, which
# put '50-99' above '10000+'.)
company_size_map = {
    '<10': 0,
    '10/49': 1,  # key spelled with '/' as in the original mapping (presumably the 10-49 band)
    '50-99': 2,
    '100-500': 3,
    '500-999': 4,
    '1000-4999': 5,
    '5000-9999': 6,
    '10000+': 7,
}

# Ordinal: years since the previous job change; 'never' -> 0, '>4' -> 5.
last_new_job_map = {
    'never': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '>4': 5,
}

In [None]:
# Transform the categorical features into numerical features.
#
# Each column is replaced by its integer code from the matching dictionary;
# raw values that are missing (or not present in the dictionary) become NaN
# and are filled in later by the imputation step.
column_encodings = {
    'education_level': education_level_map,
    'company_size': company_size_map,
    'company_type': company_type_map,
    'last_new_job': last_new_job_map,
    'major_discipline': major_map,
    'enrolled_university': enrolled_university_map,
    'relevent_experience': relevent_experience_map,
    'gender': gender_map,
    'experience': experience_map,
}

for column, mapping in column_encodings.items():
    # Already encoded (the cell was re-run): mapping the integer codes again
    # would turn every value into NaN, so leave the column untouched.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the codes. `data.loc[:, column] = ...` writes into the existing
    # object-dtype column on recent pandas and would leave it as object.
    data[column] = data[column].map(mapping)


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:

# Encode the city feature as integer labels with a LabelEncoder.
# NOTE(review): the codes are arbitrary (assigned from the sorted labels), and
# the encoder is fitted on the training frame only.
lb_en = LabelEncoder()

# Plain column assignment so the column takes the integer dtype of the codes;
# `data.loc[:, 'city'] = ...` would write into the existing object-dtype
# column on recent pandas and keep it as object.
data['city'] = lb_en.fit_transform(data['city'])

In [None]:
# Re-inspect column dtypes and non-null counts after the encoding cells.
data.info()

In [None]:
# Report, for every column, how many values are missing and what share of
# the rows that represents.

total_rows = data.shape[0]
for col, null_val in data.isnull().sum().items():
    null_prec = (null_val * 100) / total_rows
    print('> %s , Missing: %d (%.1f%%)' % (col, null_val, null_prec))

In [None]:
# Names of the columns that contain at least one missing value.

has_missing = data.isna().any()
missing_cols = has_missing[has_missing].index.tolist()
missing_cols

In [None]:
# Frame with the join key plus every feature that has missing values.
df_missing = data.loc[:, ['enrollee_id', *missing_cols]]

# Frame with the remaining columns (no missing values); it still contains
# 'enrollee_id', so the two frames can be joined back together later.
df_non_missing = data.drop(columns=missing_cols)

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# k-Nearest Neighbour imputation of the encoded categorical features.
#
# 'enrollee_id' is only the join key for the merge that follows. It is kept
# out of the matrix handed to the imputer: as an arbitrary identifier it
# would otherwise take part in the neighbour distance computation.
# NOTE(review): the imputer is fitted on all rows before the train/test
# split, so held-out rows influence the imputed values.

knn_imputer = KNNImputer(n_neighbors = 3)

id_col = 'enrollee_id'
enrollee_ids = df_missing[id_col]
impute_cols = [col for col in df_missing.columns if col != id_col]

if impute_cols:
    # Imputed entries are means of neighbours' integer codes; round them so
    # every value is a valid category code again.
    imputed = np.round(knn_imputer.fit_transform(df_missing[impute_cols]))
    df_missing = pd.DataFrame(imputed, columns = impute_cols, index = df_missing.index)
    # Put the untouched join key back (it keeps its original dtype).
    df_missing[id_col] = enrollee_ids

In [None]:
# Re-assemble the full table: imputed features joined with the columns that
# never had missing values, matched on the enrollee identifier.

data_fil = df_missing.merge(df_non_missing, on = 'enrollee_id')

In [None]:
# Preview the first rows of the merged, imputed frame.
data_fil.head()

In [None]:
# Target vector.
y = data_fil['target']
# Feature matrix: everything except the target and 'enrollee_id'. The id is
# a row identifier, not a predictor, so it is dropped along with the target.
X = data_fil.drop(columns=['target', 'enrollee_id'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded for
# reproducibility and stratified on the target so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=98, stratify=y
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without a fixed random_state the fitted tree (and the reported scores)
# can differ from run to run.
classifier = DecisionTreeClassifier(random_state=98)

In [None]:
# Fit the decision tree on the training portion.
classifier.fit(X_train,y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Confusion matrix on the held-out rows (rows: true class, columns: predicted class).
confusion_matrix(y_test,y_pred)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# NOTE(review): classification_report is imported above but not used in the visible cells.
accuracy_score(y_test,y_pred)