#  <center> HR Analytics: Job change of Data scientists</center>
<center>Predict who will move to a new job.</center>
<a href=https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists>Kaggle link </a><br>
<b>Features</b>
<ul>
<li>enrollee_id : Unique ID for candidate</li>
<li>city: City code</li>
<li>city_development_index: Development index of the city (scaled)</li>
<li>gender: Gender of candidate</li>
<li>relevent_experience: Relevant experience of candidate</li>
<li>enrolled_university: Type of University course enrolled if any</li>
<li>education_level: Education level of candidate</li>
<li>major_discipline :Education major discipline of candidate</li>
<li>experience: Candidate total experience in years</li>
<li>company_size: No of employees in current employer's company</li>
<li>company_type : Type of current employer</li>
<li>last_new_job: Difference in years between previous job and current job</li>
<li>training_hours: training hours completed</li>
<li>target: 0 – Not looking for job change, 1 – Looking for a job change</li>
    </ul>

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from IPython.display import HTML,display
from wordcloud import WordCloud
from tqdm import tqdm

import mlxtend
from mlxtend.preprocessing import standardize

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV


In [None]:
#Visualization settings
sns.set_style(style='white')
sns.set(rc={
    'figure.figsize': (12,7),
    'axes.facecolor': 'white',
    'axes.grid': True,
    'grid.color': '.9',
    'axes.linewidth': 1.0,
    'grid.linestyle': u'-'},
    font_scale=1.5)
custom_colors=["#3498db", "#95a5a6","#34495e", "#2ecc71", "#e74c3c"]
sns.set_palette(custom_colors)

#  <center>Dataset description</center>

In [None]:
#For kaggle environment, enter the default input path here.
input_dir='../input/hr-analytics-job-change-of-data-scientists/'

In [None]:
df_train=pd.read_csv(input_dir+'aug_train.csv')
df_test=pd.read_csv(input_dir+'aug_test.csv')
#Store the enrollee id for the future use.
test_file_column_id=df_test.enrollee_id
print (f"Training data loaded with {df_train.shape[0]} rows and columns {df_train.shape[1]}")
print (f"Test data loaded with {df_test.shape[0]} rows and columns {df_test.shape[1]}")
print ("Sampledata training data frame:")
df_train.head(3)

In [None]:
print ("Sampledata test dataframe:")
df_test.head(3)

In [None]:
col_tags="<ol><b>" + "".join([f"<li>{col}</li>" for col in df_train.columns]) + "</b></ol>"
display(HTML("<b><u>Feature names</u></b>"))
display(HTML(col_tags))
print ("Dataset summary")
df_train.info()

<h4 style='background-color:yellow'>
2 float <br>
2 integer<br>
10 string features
</h4>

In [None]:
# Missing-value count per column, largest first.
val=df_train.isnull().sum().sort_values(ascending=False)
# Two-column summary restricted to the features that actually have gaps.
df=pd.DataFrame({"Features":val.index,"NullCount":val.values})
df=df.loc[df["NullCount"]!=0]
print (df)
sns.barplot(data=df,x="Features",y="NullCount")
plt.xticks(rotation=90)
plt.title("Features having missing values");

#  <center>Exploratory Data Analysis</center>

In [None]:
#Discrete feature analysis
def analyze_discrete_feature(fld):
    """Print a textual summary of a discrete feature and plot its distribution.

    Prints the null count, the unique values (and how many there are) and the
    value counts, then draws a pie chart and a bar chart of the value counts
    side by side and displays a "Discrete variable" banner.

    Parameters
    ----------
    fld : pandas.Series
        Column to analyse; its ``name`` is used in the figure title.
    """
    def _as_percent(share):
        # Label shown on each pie wedge
        return f'{share: .2f}%'

    # The value counts drive both the printed table and the two charts.
    counts=fld.value_counts()
    df=pd.DataFrame({"Value": counts.index,
                     "Count": counts.values})
    unique_list=fld.unique().tolist()

    print ("Null value count : ", fld.isnull().sum())
    print ("\nUnique values: ", unique_list)
    print ("\n Unique values count: ", len(unique_list))
    print ("\nValue counts:\n",    df)

    plt.subplots(figsize=(25,10))

    # Left panel: share of each value
    plt.subplot(2,2,1)
    plt.pie(counts,labels=counts.index,autopct=_as_percent)
    plt.xticks(rotation=90)

    # Right panel: absolute count of each value
    plt.subplot(2,2,2)
    sns.barplot(data=df, x="Value",y="Count")
    plt.xticks(rotation=90)

    plt.suptitle(fld.name + " -distribution")
    plt.show()
    plt.close()
    display(HTML("<h4 style='background-color:yellow'>Discrete variable</h4>"))

In [None]:
#Continuous feature analysis
def analyze_continuous_feature(fld):
    """Print summary statistics of a continuous feature and plot its distribution.

    Prints the null count and ``describe()`` output, then draws a histogram and
    a box plot side by side and displays a "Continuous variable" banner.

    Parameters
    ----------
    fld : pandas.Series
        Numeric column to analyse; its ``name`` is used in the figure title.
    """
    print ("Null value count : ", fld.isnull().sum())
    print ("\n", fld.describe())
    plt.subplots(figsize=(25,10))
    plt.subplot(2,2,1)
    plt.hist(fld)
    plt.subplot(2,2,2)
    sns.boxplot(fld)
    # Build the title from the column name (the whole expression used to be
    # quoted, so every figure was titled with the literal text).
    plt.suptitle(fld.name + " -distribution")
    plt.show()
    plt.close()
    display(HTML("<h4 style='background-color:yellow'>Continuous variable</h4>"))
    

### 1. enrollee Id

In [None]:
df_train.enrollee_id.describe()

<h4 style='background-color:yellow'>
Integer feature. <br>
Identifier of the rows.
</h4>

### 2. City

In [None]:
# str() of a large NumPy array is abbreviated with "..." (only the first and
# last few entries are kept), so a cloud built from it reflects just a handful
# of rows. Feed the real per-city counts instead.
# NOTE(review): assumes city codes are strings, as word-cloud keys must be.
city_counts=df_train.city.value_counts().to_dict()
wc=WordCloud(background_color='white').generate_from_frequencies(city_counts)
plt.figure(figsize=(10,10),facecolor=None)
plt.imshow(wc,interpolation='bilinear');
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
analyze_discrete_feature(df_train.city)

### 3. city_development_index

In [None]:
analyze_continuous_feature(df_train.city_development_index)

### 4. Gender

In [None]:
analyze_discrete_feature(df_train.gender)

<h4 style='background-color:yellow'>
    Majority is the male candidates.
</h4>

### 5. relevent_experience

In [None]:
analyze_discrete_feature(df_train.relevent_experience)

<h4 style='background-color:yellow'>
Most of the candidates have relevant experience.
</h4>


### 6. enrolled_university

In [None]:
analyze_discrete_feature(df_train.enrolled_university)

### 7. education_level

In [None]:
analyze_discrete_feature(df_train.education_level)

<h4 style='background-color:yellow'>
Most of the candidates are graduates.<br>
Very few candidates have a Ph.D.
</h4>

### 8. major_discipline

In [None]:
analyze_discrete_feature(df_train.major_discipline)

<h4 style='background-color:yellow'>
    Most of the candidates are from STEM (Science Technology Engineering and Mathematics) background.
</h4>


### 9. experience

In [None]:
analyze_discrete_feature(df_train.experience)

### 10. Company_Size

In [None]:
analyze_discrete_feature(df_train.company_size)

### 11. company_type

In [None]:
analyze_discrete_feature(df_train.company_type)

### 12. last_new_job

In [None]:
analyze_discrete_feature(df_train.last_new_job)

### 13. training_hours

In [None]:
analyze_continuous_feature(df_train.training_hours)

### 14. target (Target Feature)

In [None]:
analyze_discrete_feature(df_train.target)

<h4 style='background-color:yellow'>
The target is imbalanced: class 0 (not looking for a job change) is the majority class.
</h4>

#  <center> Feature relationships</center>

### 1. Gender preference for company types

In [None]:
# One count plot per categorical feature, each split by gender.
gender_palette=['#add8e6',"#FFC0CB","#808080"]
panels=[("company_type","In Companies"),
        ("education_level","In Education"),
        ("major_discipline","In Major Discipline"),
        ("company_size","Company Size")]

plt.subplots(figsize=(20,15))
plt.suptitle("Gender presence");
plt.subplots_adjust(hspace=1.5);

# Fill the 2x2 grid in reading order.
for position,(feature,panel_title) in enumerate(panels,start=1):
    plt.subplot(2,2,position)
    sns.countplot(data=df_train,x=feature,hue="gender",palette=gender_palette)
    plt.xticks(rotation=90)
    plt.title(panel_title)

<h4 style='background-color:yellow'>
    Male candidates are the majority in every company type.<br>
    Private companies have the largest male population.<br>
    Male candidates are the majority at every education level.<br>
    Female candidates tend to work at companies with 50-99 employees.<br>
</h4>

### 2. Educational qualifications held by employees across company types.

In [None]:
df=df_train.loc[:,["company_type","education_level"]]
df=pd.crosstab(df.company_type,df.education_level)

In [None]:
sns.heatmap(df,annot=True,fmt="g",cmap='Reds',cbar=False);
plt.title("Graduation levels vs. Company type");

<h4 style='background-color:yellow'>
Across all education levels, most people tend to join private firms.<br>
</h4>

### 3. Experience vs. chances of job switch.

In [None]:
del(df)
df=df_train.copy()

In [None]:
# Map the open-ended buckets to numbers ('<1' -> 0, '>20' -> 21) and convert the
# column to numeric in a single assignment. An in-place replace() on a column
# selection acts on a temporary and is not guaranteed to reach `df` (pandas
# Copy-on-Write); the two buckets would then be coerced to NaN below.
df["experience"]=pd.to_numeric(df["experience"].replace({'<1':0,'>20':21}),errors='coerce')

In [None]:
def assign_experience_bin (fld):
    """Return the experience bucket label for a number of years.

    Buckets (upper bounds inclusive): up to 5, 5-10, 10-15, 15-20, above 20.
    Returns None when no bucket matches, which is the case for NaN.
    """
    # (inclusive upper bound, label), checked from the lowest bound upwards
    upper_bounds=((5,"beginer"),
                  (10,"ascociate"),
                  (15,"senior"),
                  (20,"supersenior"))
    for limit,label in upper_bounds:
        if fld <= limit:
            return label
    # An explicit comparison keeps NaN out of the top bucket.
    return "experienced" if fld > 20 else None
            

In [None]:
df["experience_bin"]=df['experience'].apply(assign_experience_bin)
df1=df.loc[:,['experience_bin','target','gender']]

In [None]:
sns.catplot(data=df1,
           y="experience_bin",
           hue="target",
           kind="count",
           col="gender");

<h3 style='background-color:yellow'>
    People with 0-15 years of experience tend to switch jobs the most. <br>
    People with 15-20 years of experience are less likely to switch jobs than those in the other experience categories.
</h3>

In [None]:
del([df,df1])

#  <center> Data Imputation</center>

In [None]:
def process_imputation(df):
    """Fill missing values of the categorical features with their most frequent value.

    ``company_type`` is filled first with its overall mode. ``company_size`` is
    then filled per company type with the most frequent size inside that type.
    If a type has no observed size at all, the overall most frequent size is
    used instead. ``gender``, ``major_discipline``, ``education_level``,
    ``last_new_job``, ``enrolled_university`` and ``experience`` are each filled
    with their own overall mode. A line reporting the value used is printed for
    every step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    def _fill_with_mode(column):
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on a column selection acts on a temporary and is
        # not guaranteed to update `df` (pandas Copy-on-Write).
        mode_value=df[column].mode()[0]
        df[column]=df[column].fillna(mode_value)
        return mode_value

    cmpny_type_mode=_fill_with_mode('company_type')
    print (f"company_type missing values filled with: {cmpny_type_mode}\n")

    # Fallback for company types that have no observed company_size.
    overall_size_modes=df['company_size'].mode()
    for cmpny in df['company_type'].unique():
        in_group=df['company_type']==cmpny
        group_size_modes=df.loc[in_group,'company_size'].mode()
        if not group_size_modes.empty:
            company_size_mode=group_size_modes.iloc[0]
        elif not overall_size_modes.empty:
            company_size_mode=overall_size_modes.iloc[0]
        else:
            # company_size has no observed value anywhere: nothing to fill with.
            continue
        df.loc[in_group & df['company_size'].isnull(),"company_size"]=company_size_mode
        print (f"For company type {cmpny}, missing values for company size filled with: {company_size_mode}\n")

    gender_mod=_fill_with_mode("gender")
    print (f"Missing values for gender filled with: {gender_mod}\n")

    major_disciplie_mode=_fill_with_mode("major_discipline")
    print (f"Missing values for major_discipline filled with: {major_disciplie_mode}\n")

    education_lvl_mod=_fill_with_mode("education_level")
    print(f"Missing values for education level filled with: {education_lvl_mod}\n")

    last_new_job_mode=_fill_with_mode("last_new_job")
    print (f"Missing values for last_new_job filled with: {last_new_job_mode}\n")

    enrolled_university_mode=_fill_with_mode("enrolled_university")
    print(f"Missing values for enrolled_university filled with: {enrolled_university_mode}\n")

    experience_mode=_fill_with_mode("experience")
    print(f"Missing values for experience filled with {experience_mode}\n")
    return df

In [None]:
#Process imputation for training data
df_train=process_imputation(df_train)

In [None]:
print ("Is there any missing values in training data?")
df_train.isnull().any()

In [None]:
#Process imputation for testing data
df_test=process_imputation(df_test)

In [None]:
print ("Is there any missing values in testing data?")
df_test.isnull().any()

<h3 style="background-color:yellow">
Missing values have been imputed in the training and test datasets.
    </h3>

#  <center>Data wrangling    </center>

In [None]:
def wrangle_data(df):
    """Encode the imputed features for model training.

    Steps, in order: drop ``enrollee_id``; replace ``city_development_index``
    by a three-level bin (1, 2, 3); turn ``experience``, ``company_size`` and
    ``last_new_job`` into numbers; standardize ``training_hours``; one-hot
    encode what is left with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with no missing values in the columns listed above. The caller's
        frame is not modified; work is done on a new frame.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    #Drop below feature since its not contributing much to model training.
    drop_features=["enrollee_id"]
    df=df.drop(drop_features,axis=1)

    #Create 3 major bins for city_development_index using the city_development_index
    #bin names : poor,average,high
    # The bin edges are the same for every row, so they are computed once.
    bin_range=np.linspace(start=0,stop=1,num=4)
    def get_bin(inp):
        if inp <= bin_range[1]:
            return 1
        elif bin_range[2] >= inp > bin_range[1]:
            return 2
        elif inp <= bin_range[3]:
            return 3
        # Falls through (returns None) for values above 1 and for NaN.
    df["city_development_index_bin"]=df["city_development_index"].apply(get_bin)
    df=df.drop("city_development_index",axis=1)

    # Every replacement below is assigned back to the frame. replace(inplace=True)
    # on a column selection acts on a temporary and is not guaranteed to update
    # the frame (pandas Copy-on-Write).
    df['experience']=pd.to_numeric(df['experience'].replace(['>20','<1'],[21,0]))

    # infer_objects() makes the integer dtype explicit instead of relying on
    # replace() to downcast. Otherwise an object column of ints would be one-hot
    # encoded by get_dummies below.
    df['company_size']=df['company_size'].replace(
        ['<10','10/49','50-99','100-500','500-999','1000-4999','5000-9999','10000+'],
        [1,2,3,4,5,6,7,8]).infer_objects()

    df['last_new_job']=df['last_new_job'].replace(
        ['never','1','2','3','4','>4'],list(range(0,6))).infer_objects()

    #Standardize continuous features
    df['training_hours']=standardize(df['training_hours'])

    df=pd.get_dummies(df)

    return df
    

In [None]:
#Changing the index of df_test, for creating a total dataframe by appending to training data frame.
#It's better to include the test dataframe while wrangling data for considering label encoding for entire train + test data
# Test row labels continue where the training labels stop, so the two frames
# can be concatenated and later separated again by label.
start_index=df_train.index.max()+1
# Number of test rows. len() stays correct even if this cell is executed again
# after df_test has been re-labelled (index.max()+1 would then overshoot).
end_index=len(df_test)
new_index_list=list(range(start_index,start_index+end_index))

#Assigning new index, which is starting from end of training index.
df_test.index=new_index_list

In [None]:
#df_total=df_train + df_test
df_total=pd.concat([df_train,df_test],axis=0)
print (f"Shape total data frame:{df_total.shape}")

In [None]:
#Process wrangling on the total data frame.
df_total=wrangle_data(df_total)

In [None]:
print ("Column names in df_total:\n")
df_total.columns

In [None]:
#df_total.describe().to_csv("dummy.csv")

In [None]:
#Drop the test data frame from df_input
# Training rows: everything that does not carry a test label.
df_train=df_total.drop(index=new_index_list)
print (f"Shape of df_train:{df_train.shape}")
# Test rows: selected by label, without the target column.
df_test=df_total.loc[new_index_list].drop(columns="target")
print (f"Shape of df_test:{df_test.shape}")

In [None]:
x=df_train.drop('target',axis=1)
y=df_train.target
print (f"Shape of independent features {x.shape} \nShape of dependent feature {y.shape}")

#  <center>Model evaluation</center>

In [None]:
#Function for getting the classification report, confusion matrix Area under curve AOC,ROC details
def get_perfomance_details(y_pred,y_test):
    """Report classification quality for a set of predictions.

    Prints the classification report, draws the confusion matrix as a heatmap,
    prints the ROC AUC score together with the ROC curve points, and plots the
    ROC curve against the diagonal reference line.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels (note that this is the first argument).
    y_test : array-like
        True labels the predictions are compared against.

    Notes
    -----
    Calls ``sns.reset_defaults()`` before plotting, which also affects the
    styling of figures drawn afterwards.
    """
    #Classification report
    print ("Classification Report:\n")
    print (classification_report(y_true=y_test,y_pred=y_pred))

    #Draw confusion matrix
    sns.reset_defaults()
    plt.figure(figsize=(7,4));
    sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,cmap="GnBu",fmt="g",cbar=False);
    plt.title("Confusion Matrix");
    plt.show()

    #Find the Area under the curve
    print ("\n----------------\nAOC ROC details\n----------------\n")
    # roc_auc_score expects (y_true, y_score); the arguments used to be swapped,
    # so the printed score disagreed with the area of the plotted curve.
    rocauc_score=roc_auc_score(y_test,y_pred)
    #ROC curve
    fpr,tpr,_=roc_curve(y_test,y_pred)
    roc_aoc=auc(fpr,tpr)
    print (f"AUC score: {rocauc_score}\nTrue positive rate: {tpr}\nFalse postive rate: {fpr}")

    #Draw the ROC curve
    plt.figure(figsize=(4,4));
    lw=2
    plt.plot(fpr,tpr,
             color='green',
             lw=lw,
             label='ROC curve (area=%0.4f)' % roc_aoc);

    #plot diagonal line from (0,0) to (1,1), where fpr equals tpr
    plt.plot([0,1],[0,1],color='lightgrey',lw=lw,linestyle='--');
    plt.xlim([0.0,1.0]);
    plt.ylim([0.0,1.0]);
    plt.xlabel("False Positive Rate");
    plt.ylabel("True Postive Rate");
    plt.title("Reciever operating characteristic for training data");
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# Results table: one row per evaluated model.
df_model_results=pd.DataFrame(columns=["ModelName","TrainScore"])

def store_model_results(modl_name,train_score):
    """Append one (model name, score) row to the global ``df_model_results``.

    Rows are labelled 1, 2, 3, ... in insertion order.
    """
    global df_model_results
    # Next label is the current row count plus one.
    next_label=len(df_model_results)+1
    df_model_results.loc[next_label,["ModelName","TrainScore"]]=[modl_name,train_score]


In [None]:
# Start from an empty results table so re-running the cell does not add duplicate rows.
df_model_results=df_model_results.iloc[0:0]
#Since this training set takes some time for training, included pregress bar using tqdm library
candidate_models=[LogisticRegression(),
                  SVC(),
                  DecisionTreeClassifier(),
                  RandomForestClassifier(),
                  KNeighborsClassifier(),
                  XGBClassifier()]
for model in tqdm(candidate_models):
    # Mean score over 3 cross-validation folds
    mean_cv_score=cross_val_score(model,x,y,cv=3).mean()
    store_model_results(type(model).__name__,mean_cv_score)
# Best mean score first; the top row names the selected model.
df_model_results=df_model_results.sort_values("TrainScore",ascending=False)
selected_model=df_model_results["ModelName"].iloc[0]
display(HTML('<h3>Selected_model: '+ selected_model + '</h3>'))
df_model_results

In [None]:
print ("Train Test file splitting..")
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
print (f"File shapes \nx_train:{x_train.shape} y_train:{y_train.shape}\nx_test:{x_test.shape} y_test:{y_test.shape}")

In [None]:
model=SVC()
model.fit(x_train,y_train)
print ("Predicting..")
y_pred=model.predict(x_test)
print ("prediction completed.")

In [None]:
get_perfomance_details(y_pred,y_test)

<i>
Precision tp/(tp+fp) : measures the ability of a classifier to identify only the correct instances for each class.<br>
Recall tp/(tp+fn) : is the ability of the classifier to find all correct instances per class.<br>
F1 Score = 2 &times; (precision &times; recall) / (precision + recall)<br>
F1 score is considered a better indicator of the classifier's performance than the regular accuracy measure.<br>
Support is the number of actual occurrences of the class in the test data set.<br>
    </i>

#   Model tuning 

In [None]:
#Lets find best hyper parameter for SVC
param_grid={'C':[100,1000,10000],
           'gamma':[0.001,0.0001,0.00001]}


In [None]:
grid=GridSearchCV(SVC(),param_grid=param_grid,refit=True,verbose=True,n_jobs=10)

In [None]:
grid.fit(x_train,y_train)
print ("Best parameters from GridSearch: ",grid.best_params_)

In [None]:
y_pred=grid.predict(x_test)
print ("Prediction completed.")

In [None]:
get_perfomance_details(y_pred,y_test)

#### Since the Area under the curve is better than the initial model, we can use this model for further prediction.

#  <center> Test file prediction </center>

In [None]:
print (f"Shape of the test file {df_test.shape}")

In [None]:
x=df_test
print (f"Shape of independent features {x.shape}")

In [None]:
y_pred=grid.predict(x)

In [None]:
y_pred=pd.DataFrame({"enrollee_id":test_file_column_id,
                   "target":pd.Series(y_pred)})

In [None]:
y_pred.to_csv('submission.csv',index=False)
print ("Exported result")