In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in files_here):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the two CSV files of the HR-analytics dataset.
train_csv = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
test_csv = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv'

# Load each file into its own DataFrame.
aug_train = pd.read_csv(train_csv)
aug_test = pd.read_csv(test_csv)

In [None]:
# Preview the first five rows of the training frame.
aug_train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
aug_test.head(n=5)

In [None]:
# Show the column labels of the training frame.
aug_train.columns

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
aug_train.describe().transpose()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw seaborn's pair-plot grid for the training frame and render it.
pair_grid = sns.pairplot(data=aug_train)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# At this point aug_train still holds its string-valued categorical columns
# (they are only encoded further down), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping it, so the
# numeric columns are selected explicitly before computing correlations.
numeric_corr = aug_train.select_dtypes(include='number').corr()

plt.figure(figsize=(15, 15))
sns.heatmap(numeric_corr)
plt.show()

In [None]:
# Count the missing values in each column of the training frame.
aug_train.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
aug_train = aug_train.dropna(axis=0, how='any')

In [None]:
# Lookup table translating the `gender` labels into integer codes.
gender_map = dict(Female=2, Male=1, Other=0)

# Binary code for the `relevent_experience` labels: 0 = no, 1 = yes.
# (The misspelling "relevent" is part of the label text and must be kept.)
relevent_experience_map = dict(zip(
    ('No relevent experience', 'Has relevent experience'),
    (0, 1),
))

# Integer codes for the `enrolled_university` labels; the code of each label
# is its position in the tuple below.
enrolled_university_map = {
    label: code
    for code, label in enumerate(
        ('no_enrollment', 'Full time course', 'Part time course')
    )
}
    
# Ordinal codes for the `education_level` labels, listed from the lowest
# code (0) to the highest (4); the code is the label's position in the tuple.
education_level_map = {
    label: rank
    for rank, label in enumerate(
        ('Primary School', 'High School', 'Graduate', 'Masters', 'Phd')
    )
}
    
# Integer codes for the `major_discipline` labels; the code of each label is
# its position in the tuple below.
major_map = {
    label: code
    for code, label in enumerate(
        ('STEM', 'Business Degree', 'Arts', 'Humanities', 'No Major', 'Other')
    )
}
    
# Integer codes for the `experience` labels: '<1' -> 0, the numeric strings
# '1'..'20' -> their own value, and '>20' -> 21.
experience_map = {
    '<1': 0,
    **{str(years): years for years in range(1, 21)},
    '>20': 21,
}
    
# Integer codes for the `company_type` labels, assigned in the order in which
# the labels are listed.
company_type_map = dict(zip(
    ('Pvt Ltd', 'Funded Startup', 'Early Stage Startup', 'Other', 'Public Sector', 'NGO'),
    range(6),
))

# Ordinal codes for the `company_size` labels, ordered by headcount so that a
# larger code always means a larger company. Numbering the labels in their
# lexicographic order instead would, for example, rank '50-99' above
# '10000+', which makes the encoded value meaningless as an ordinal feature.
# ('10/49' is spelled with a slash in the label text; keep it as is.)
company_size_map = {
    '<10'          :    0,
    '10/49'        :    1,
    '50-99'        :    2,
    '100-500'      :    3,
    '500-999'      :    4,
    '1000-4999'    :    5,
    '5000-9999'    :    6,
    '10000+'       :    7
}
    
# Integer codes for the `last_new_job` labels: 'never' -> 0, the numeric
# strings '1'..'4' -> their own value, and '>4' -> 5.
last_new_job_map = {
    'never': 0,
    **{str(gap): gap for gap in range(1, 5)},
    '>4': 5,
}

In [None]:
# Column name -> lookup table used to turn that column's string labels into
# integer codes.
categorical_encoders = {
    'education_level': education_level_map,
    'company_size': company_size_map,
    'company_type': company_type_map,
    'last_new_job': last_new_job_map,
    'major_discipline': major_map,
    'enrolled_university': enrolled_university_map,
    'relevent_experience': relevent_experience_map,
    'gender': gender_map,
    'experience': experience_map,
}

# aug_train was produced by dropna(); take an explicit copy so the column
# assignments below write to an independent frame.
aug_train = aug_train.copy()

for column, lookup in categorical_encoders.items():
    # A column that is already numeric has been encoded by a previous run of
    # this cell; mapping it again would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(aug_train[column]):
        continue
    # Replace the whole column (rather than `.loc[:, column] = ...`, which
    # writes in place on pandas >= 2.0 and leaves the column as `object`
    # dtype) so the encoded column gets a numeric dtype.
    aug_train[column] = aug_train[column].map(lookup)

#encoding city feature using label encoder
from sklearn.preprocessing import LabelEncoder
lb_en = LabelEncoder()

aug_train['city'] = lb_en.fit_transform(aug_train['city'])

In [None]:
# Columns used as model inputs.
feature_columns = [
    'city',
    'city_development_index',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
    'training_hours',
]

# Feature matrix: the selected columns of the encoded training frame.
train = aug_train[feature_columns]

In [None]:
# Labels as a 1-D Series. scikit-learn estimators expect `y` with shape
# (n_samples,); a single-column DataFrame (shape (n_samples, 1)) is treated
# as a column vector and triggers a DataConversionWarning at fit time.
target = aug_train['target']

In [None]:
import sklearn.model_selection as model_selection

# Hold out 35% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    train,
    target,
    random_state=101,
    test_size=0.35,
)

In [None]:
# Display the training labels produced by the split.
y_train

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

# Base learners for the voting ensemble.
# `multi_class` is intentionally not passed to LogisticRegression: the
# parameter is deprecated in current scikit-learn releases and scheduled for
# removal, so setting it makes the notebook fail on upgrade.
# max_iter is raised above the default because the features are fed in
# unscaled — presumably the default budget is not enough for the solver to
# converge; confirm by checking for ConvergenceWarning.
clf1 = LogisticRegression(max_iter=1000, random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1, n_jobs=-1)
clf3 = GaussianNB()
clf4 = XGBClassifier(random_state=1, learning_rate=0.01)
# NOTE(review): clf5 is constructed but is not part of the ensemble below.
clf5 = SVC(kernel="poly")

# Combine four of the learners with VotingClassifier's default voting rule.
clf_voting = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('gnb', clf3),
        ('xgb', clf4),
    ]
)

# Impute any remaining missing feature values (SimpleImputer defaults), then
# hand the result to the ensemble.
pipe = make_pipeline(
    SimpleImputer(),
    clf_voting,
)

# np.ravel flattens y to shape (n_samples,), which is what the estimators
# expect; it accepts both a Series and a single-column DataFrame.
pipe.fit(X_train, np.ravel(y_train))
y_pred = pipe.predict(X_test)

In [None]:
# Display the labels predicted for the held-out rows.
y_pred

In [None]:
# Score the fitted pipeline on the held-out split. For scikit-learn
# classifiers `score` is the mean accuracy.
# NOTE(review): if the target classes are imbalanced, accuracy alone can be
# misleading — consider also reporting ROC AUC or F1; confirm class balance.
pipe.score(X_test, y_test)