In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition files inside the read-only Kaggle input directory.
train_filepath = "/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
test_filepath = "/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv"
sub_filepath = "/kaggle/input/hr-analytics-job-change-of-data-scientists/sample_submission.csv"

# Read each CSV into its own DataFrame.
train = pd.read_csv(train_filepath)
test = pd.read_csv(test_filepath)
sub = pd.read_csv(sub_filepath)

# Preview the first rows of the training data.
train.head()

In [None]:
# Column index of the training frame; later cells iterate over it as well.
cols = train.columns
print("all columns = {}".format(cols))
print("total data points = {}".format(len(train)))

print("-------------Printing null count-------------")
# One pass over the frame gives the number of missing entries per column.
null_counts = train.isna().sum()
for column_name, n_missing in null_counts.items():
    print("{} =====> {}".format(column_name, n_missing))

In [None]:
# Print the distinct values of every column to see how each one is encoded.
for column_name in cols:
    print("---------------cols = {}---------------".format(column_name))
    distinct_values = train[column_name].unique()
    print("Unique values = {}".format(distinct_values))

<h2>Re-structuring relevant data</h2>

all columns = ['enrollee_id', 'city', 'city_development_index', 'gender',
               'relevent_experience', 'enrolled_university', 'education_level',
               'major_discipline', 'experience', 'company_size', 'company_type',
               'last_new_job', 'training_hours', 'target']

<h4> gender, company_size, major_discipline, company_type, education_level, relevent_experience can be converted into categorical data</h4>

<h4> city_development_index, 'training_hours', experience can be used as continuous data</h4>

<h4> enrollee_id, city are probably irrelevant</h4>

<h2> Feature extraction </h2>
<h4> 'stability-factor' of the candidate can be estimated from 'last_new_job' and 'experience' </h4>

<h4> a dictionary of city vs development index should be created. </h4>

<h2> Step 1: cleaning and restructuring the dataset</h2>

In [None]:
# Create a city -> development-index lookup.
# The value for each city is taken from the first row in which that city appears.
allCities = train.city.unique()

# drop_duplicates keeps the first row per city, so the whole map is built in a
# single pass instead of re-filtering the full frame once for every city.
first_row_per_city = train.drop_duplicates(subset="city")
city_devIndex_map = dict(
    zip(first_row_per_city["city"], first_row_per_city["city_development_index"])
)


In [None]:
def clean(dataframe):
    """Return a cleaned copy of ``dataframe``; the input frame is not modified.

    Two steps are applied:

    1. The ``enrollee_id`` and ``city`` columns are dropped.
    2. Every missing value is replaced by the string ``"none"``.

    The remaining column index is printed as a side effect.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains at least the ``enrollee_id`` and ``city`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns and without missing values.

    Raises
    ------
    KeyError
        If either of the columns to drop is absent from ``dataframe``.
    """
    # drop unwanted columns
    drop_cols = ['enrollee_id', 'city']
    trimmed = dataframe.drop(columns=drop_cols)

    print("columns = {}".format(trimmed.columns))

    # fill na with "none"
    # A scalar fill does not need an axis, so none is passed.
    # NOTE(review): on some pandas versions a scalar fill with axis=1 appears to
    # be routed through a transpose, which would turn numeric columns into
    # object dtype -- confirm against the installed version.
    return trimmed.fillna("none")

# Apply the cleaning step to the training frame and preview the first rows of the result.
cleaned_train = clean(train)
cleaned_train.head()

<h2> Filling missing values in dataset, and re-valuing certain values </h2>

<h4>
    If any value is missing, we assume that the candidate did not provide it, which we take to mean that they have nothing relevant to report for that field. The columns whose missing values need special handling are:<br>
    
    experience: 'none' should be mapped to 0. If exp is missing, we assume 0 exp <br>
    enrolled_university: 'no_enrollment' and 'none' should have same mapping, if univ enrollment details is missing, we assume no enrollment.<br>
    major_discipline: 'no_major' and 'none' should have same mapping, if major information is missing, we assume no major.
    
    
</h4>

In [None]:
# assuming 'none' exp  = 0 exp
def _non_integer_values(values):
    """Return, in order, the items of ``values`` that ``int()`` cannot parse."""
    rejected = []
    for value in values:
        try:
            int(value)
        except (TypeError, ValueError, OverflowError):
            # Only parse failures are of interest; anything else should surface.
            rejected.append(value)
    return rejected


def nonNumericContinuousData(dataframe):
    """Print the ``experience`` and ``last_new_job`` values that are not integers.

    For each of the two columns the distinct values are collected and every
    value that ``int()`` rejects is printed on one line, each followed by
    ``", "``. The full array of distinct ``last_new_job`` values is printed as
    well.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``experience`` and ``last_new_job`` columns.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    unique_exp = dataframe.experience.unique()
    print("non numeric exp data ====>", end=" ")
    for value in _non_integer_values(unique_exp):
        print(value, end=", ")
    print()
    unique_lnj = dataframe.last_new_job.unique()
    print(unique_lnj)
    print("non numeric lnj data ====>", end=" ")
    for value in _non_integer_values(unique_lnj):
        print(value, end=", ")

# Report which experience / last_new_job values in the cleaned frame cannot be parsed as integers.
nonNumericContinuousData(cleaned_train)

<h4>
    last_new_job has str values which are numeric, and str values indicating upper-range ">4". <br>
    The column will be converted to numeric values, and ">4" will be replaced by the number 5. This gives us a continuous numeric column.
</h4>

In [None]:
# Each column is re-assigned explicitly rather than updated through
# `frame.column.replace(..., inplace=True)`: that chained in-place form acts on
# a temporary Series and does not write back to the frame when pandas
# Copy-on-Write is active.

# experience: missing ('none') and '<1' count as 0 years, '>20' is capped at 21.
cleaned_train["experience"] = (
    cleaned_train["experience"]
    .replace({'none': '0', '>20': '21', '<1': '0'})
    .map(int)
)

# A missing enrolment / major is treated the same as the explicit "no" category.
cleaned_train["enrolled_university"] = cleaned_train["enrolled_university"].replace('none', 'no_enrollment')
cleaned_train["major_discipline"] = cleaned_train["major_discipline"].replace('none', 'no_major')

print(type(cleaned_train))
# last_new_job: '>4' becomes 5; missing ('none') and 'never' become 0.
cleaned_train["last_new_job"] = (
    cleaned_train["last_new_job"]
    .replace({'>4': '5', 'none': '0', 'never': '0'})
    .map(int)
)


cleaned_train.head()

In [None]:
# restructuring gender, company_size, major_discipline, 
#                     company_type, education_level, experience, 
#                     relevent_experience, enrolled_university
# CAUTION: using gender as a feature when assessing candidate fitness can introduce gender bias into the model

from sklearn.preprocessing import LabelEncoder

def encodeCols(dataframe):
    """Label-encode the categorical columns of ``dataframe`` on a deep copy.

    Each listed column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column, and the label -> code mapping is
    printed for every column. The input frame is left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing all of the categorical columns listed below.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``dataframe`` with the categorical columns encoded.
    """
    categorical_cols = ['gender', 'company_size', 'major_discipline',
                         'company_type', 'education_level','relevent_experience',
                        'enrolled_university' ]

    encoder = LabelEncoder()
    encoded_train = dataframe.copy(deep=True)

    print("---------Label mapping----------")
    for column_name in categorical_cols:
        # Refitting overwrites classes_, so the one encoder is reused per column.
        encoded_train[column_name] = encoder.fit_transform(encoded_train[column_name])
        label_to_code = dict(zip(encoder.classes_, range(len(encoder.classes_))))
        print("{} ====> {}".format(column_name, label_to_code))

    return encoded_train


# Label-encode the categorical columns of the cleaned frame; the mapping of each column is printed.
encoded_train = encodeCols(cleaned_train)

In [None]:
# Show which columns survived cleaning, then the frame's (rows, columns) shape.
print(f"Columns available after cleaning ===> {encoded_train.columns}")

encoded_train.shape

<h2> Step 2: Perform EDA on the dataset to gain insights </h2>

Trying to answer some questions:<br>
*     Which of them are dependent variables? (Perform KDE-plot)
*     What percentage of non-STEM candidates are interested in the job? (perform CDF)

In [None]:
# Preview the cleaned (not label-encoded) frame before plotting.
cleaned_train.head()

In [None]:
import seaborn as sns
# Draw one KDE per target class for each numeric feature, so the overlap between
# the classes can be compared feature by feature.
# Each entry is (source frame, column to plot).
kde_inputs = [
    (cleaned_train, 'training_hours'),
    (cleaned_train, 'city_development_index'),
    (encoded_train, 'last_new_job'),
    (encoded_train, 'experience'),
]
for frame, feature in kde_inputs:
    sns.FacetGrid(frame, hue = 'target').map(sns.kdeplot, feature).add_legend()

* city_development_index is a dependent variable
* experience has huge overlap. REVIEW
* training_hours and last_new_job are not dependent variables (huge overlap)

In [None]:
# Number of candidates per gender, split by target class.
# (Alternative view: x = 'target' with hue = 'gender'.)
sns.countplot(data = cleaned_train, x = 'gender', hue = 'target')

<h2> building a basic classification model using LR </h2>

<h3> why LR? Because this is a binary classification problem, and should be solvable using LR </h3>

In [None]:
from sklearn import preprocessing

def preprocessData(dataframe):
    """One-hot encode the categorical columns and min-max scale the numeric ones.

    The input frame is not modified. Each categorical column is replaced by
    indicator columns named ``<column>_<value>``, appended after the remaining
    columns. ``last_new_job``, ``training_hours`` and ``experience`` are scaled
    to the range [0, 1] using the minimum and maximum found in ``dataframe``
    itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing every categorical and numeric column listed below.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded and scaled columns.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    categorical_cols = ['gender', 'relevent_experience',
                       'enrolled_university', 'education_level', 'major_discipline',
                       'company_size', 'company_type']
    norm_cols = ['last_new_job', 'training_hours', 'experience']

    # With columns=..., get_dummies drops each listed column and appends its
    # indicator columns (prefixed with the column name) in the listed order.
    normed = pd.get_dummies(dataframe, columns=categorical_cols)

    # Normalize values
    for c in norm_cols:
        min_value = normed[c].min()
        max_value = normed[c].max()
        value_range = max_value - min_value
        if value_range == 0:
            # A constant column has no spread; use 0.0 rather than dividing by
            # zero, which would fill the column with NaN.
            normed[c] = 0.0
        else:
            normed[c] = (normed[c] - min_value) / value_range

    return normed

# One-hot encode and scale the label-encoded frame, then show the resulting columns and first rows.
normed = preprocessData(encoded_train)
print(normed.columns)
normed.head()

In [None]:
# Positional 90% / 10% split: the leading rows train the model, the rest evaluate it.
total_size = normed.shape[0]
split_at = int(0.9 * total_size)
to_train, to_test = normed.iloc[:split_at], normed.iloc[split_at:]

to_train.shape, to_test.shape

In [None]:
# Separate the features (every column except 'target') from the label column.
to_trainX = to_train.drop(columns='target')
to_trainY = to_train['target']
to_testX = to_test.drop(columns='target')
to_testY = to_test['target']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Fit a logistic-regression baseline on the training part and report the
# mean accuracy on the held-out part.
model = LogisticRegression(max_iter = 1000, random_state=0)
model.fit(to_trainX, to_trainY)
model.score(to_testX, to_testY)