In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization 


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the training split of the HR-analytics job-change dataset.
df = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
)
df.head(n=10)
df.shape
df.info()  # prints column names, dtypes and non-null counts

# Count missing values per column: several columns have many of them.
df.isnull().sum()

## Handling missing values

In [None]:

## Let's start by handling missing values.
# gender: the share of NaN values is high, so keep them as a separate
# 'Undefined' category instead of imputing one of the existing values.
# The column is assigned through the DataFrame itself; the chained form
# df['gender'].loc[mask] = ... writes to a temporary Series, which raises
# SettingWithCopyWarning and leaves df unchanged under pandas Copy-on-Write.
df['gender'] = df['gender'].fillna('Undefined')
df['gender'].value_counts()
df['gender'].isna().sum()



# enrolled_university: inspect the categories and the number of NaN values first.
df['enrolled_university'].value_counts()
df['enrolled_university'].isna().sum()
# Only a few values are missing, so fold them into the majority class.
# Assign the column directly rather than via chained df[col].loc[mask] = ...,
# which is not guaranteed to modify df (and never does under Copy-on-Write).
df['enrolled_university'] = df['enrolled_university'].fillna('no_enrollment')




# education_level: inspect the categories and the number of NaN values first.
df['education_level'].value_counts()
df['education_level'].isna().sum()
# Very few values are missing, so fold them into the 'Graduate' category.
# Direct column assignment avoids the chained df[col].loc[mask] = ... pattern,
# which may write to a temporary copy instead of df.
df['education_level'] = df['education_level'].fillna('Graduate')



# major_discipline: missing values are imputed with 'STEM'.
df['major_discipline'].value_counts()
# Direct column assignment avoids the chained df[col].loc[mask] = ... pattern,
# which may write to a temporary copy instead of df.
df['major_discipline'] = df['major_discipline'].fillna('STEM')
df['major_discipline'].isna().sum()




# experience: turn the open-ended labels into numeric strings ('>20' -> '22',
# '<1' -> '0') so the whole column can be parsed as numbers. The replacement is
# done on a new Series instead of df.experience.replace(..., inplace=True),
# which mutates a temporary object and is a no-op under pandas Copy-on-Write.
experience_years = pd.to_numeric(df['experience'].replace({'>20': '22', '<1': '0'}))

# Bin the years into three ordered levels: > 10 is senior, > 3 is intermediate,
# everything else is entry level. NaN compares False against both thresholds,
# so rows with missing experience also end up in 'Entry-Level'.
df['experience'] = np.where(
    experience_years > 10,
    'Senior-level',
    np.where(experience_years > 3, 'Intermediate-level', 'Entry-Level'),
)
df['experience'].value_counts().sum()



# company_size: map every size bracket to one representative number inside that
# bracket so the column can be parsed as numeric and thresholded below.
# A single dict-based replace on a new Series is used instead of repeated
# df.company_size.replace(..., inplace=True) calls, which mutate a temporary
# object and are a no-op under pandas Copy-on-Write.
company_size_values = {
    '<10': '9',
    '10/49': '20',
    '50-99': '55',
    '100-500': '300',
    '500-999': '600',
    '1000-4999': '3000',
    '5000-9999': '6000',
    '10000+': '10001',
}
company_size_num = pd.to_numeric(df['company_size'].replace(company_size_values))

# Collapse into classes: > 2000 is large, > 1 is small/medium. NaN compares
# False against both thresholds, so missing sizes become 'Undefined'.
df['company_size'] = np.where(
    company_size_num > 2000,
    'Large-org.',
    np.where(company_size_num > 1, 'Small & Medium-org.', 'Undefined'),
)
df['company_size'].value_counts()


# company_type: inspect the categories and the number of NaN values first.
df['company_type'].value_counts()
df['company_type'].isna().sum()
# Missing values are imputed with 'Pvt Ltd'. Direct column assignment avoids the
# chained df[col].loc[mask] = ... pattern, which may write to a temporary copy.
df['company_type'] = df['company_type'].fillna('Pvt Ltd')




# last_new_job: inspect the raw categories first.
df['last_new_job'].value_counts()
# Turn the labels into numeric strings ('>4' -> '5', 'never' -> '0'), fill missing
# values with '1' (the author's choice: the majority category, a one-year gap),
# then cast to int. The chain builds a new Series and assigns it back, instead of
# the inplace=True calls on df.last_new_job, which mutate a temporary object and
# are a no-op under pandas Copy-on-Write.
df['last_new_job'] = (
    df['last_new_job']
    .replace({'>4': '5', 'never': '0'})
    .fillna('1')
    .astype(int)
)

# Sanity check: every column should now report zero missing values.
df.isna().sum()

## Encoding Categorical variables

In [None]:

## Time to encode the categories

# Collect the names of all non-numeric columns; these still need encoding.
# select_dtypes is used instead of comparing each dtype with the strings
# 'int'/'float': those aliases resolve to platform-dependent widths and do not
# match other numeric dtypes such as int32 or float32.
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()
categorical_cols



# Work on a copy so the cleaned df stays untouched by the encoding steps.
new_df = df.copy()

# Binary feature: 1 = has relevant experience, 0 = none.
# Any value not listed in the mapping becomes NaN.
new_df['relevent_experience'] = new_df['relevent_experience'].map(
    {
        "Has relevent experience": 1,
        "No relevent experience": 0,
    }
)



# These three features have a natural rank (1st, 2nd, 3rd, ...), so they are
# ordinal-encoded. Each inner list gives the category order, lowest first, and
# the lists are given in the same order as the columns passed to fit_transform.
from sklearn.preprocessing import OrdinalEncoder

enc_ordered_cat = OrdinalEncoder(
    categories=[
        ['Undefined', "Small & Medium-org.", 'Large-org.'],                # company_size
        ["Entry-Level", "Intermediate-level", 'Senior-level'],             # experience
        ['Primary School', 'High School', "Graduate", 'Masters', 'Phd'],   # education_level
    ]
)
ordinal_feat = enc_ordered_cat.fit_transform(
    new_df[['company_size', 'experience', 'education_level']]
)
enc_ordered_cat.categories_



# These categories have no inherent order, so they are one-hot encoded.
one_hot_cat = new_df[['gender', 'major_discipline', 'company_type', 'enrolled_university']]
one_hot = pd.get_dummies(one_hot_cat)

# Remove the raw versions of every encoded column from new_df: first the
# one-hot encoded ones, then the ordinal-encoded ones.
new_df.drop(
    columns=[
        "company_type", "gender", "major_discipline", "enrolled_university",
        'company_size', 'experience', 'education_level',
    ],
    inplace=True,
)


## Combining all dataframes to get a final df

In [None]:

# Remaining original columns plus the one-hot encoded columns.
full_df = pd.concat([new_df, one_hot], axis=1)

# Wrap the ordinal-encoded NumPy array in a DataFrame. The index is taken from
# full_df explicitly: concat(axis=1) aligns rows by index label, so relying on a
# fresh RangeIndex would misalign rows whenever full_df's index is not 0..n-1.
ordinal_feat_df = pd.DataFrame(
    ordinal_feat,
    index=full_df.index,
    columns=['company_size', 'experience', 'education_level'],
    dtype='int32',
)

# Final dataframe used for the rest of the notebook.
all_df = pd.concat([full_df, ordinal_feat_df], axis=1)
all_df.head()

# Drop the enrollee_id and city columns before modelling.
all_df = all_df.drop(columns=['enrollee_id', 'city'])
all_df.head()

## Correlation of features with target class 

In [None]:
# training_hours has a much larger magnitude than the other features, so it is
# standardized (zero mean, unit variance) with sklearn's scale().
from sklearn.preprocessing import scale

# The standardized values are then rounded down to whole numbers.
# NOTE(review): flooring discards the fractional part of the standardized
# feature; confirm this binning is intended.
all_df['training_hours'] = np.floor(scale(all_df['training_hours']))

# Correlation of every feature with the target column.
all_df.corr()['target']

## Treating imbalanced targets


In [None]:
# Separate the target column from the feature matrix.
y = all_df['target']
X = all_df.drop(columns='target')

# Far fewer people change jobs than stay: the target is imbalanced.
y.value_counts()
# Pass the Series as x= explicitly; newer seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=y)

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Randomly duplicate minority-class rows until both classes are the same size.
# Note: x_ros / y_ros cover the whole dataset, so they contain repeated copies
# of the same rows. Evaluating a model on data split from them overstates its
# performance, because duplicates of a test row can also be in the training part.
rand = RandomOverSampler(random_state=42)
x_ros, y_ros = rand.fit_resample(X, y)
print(f"Imbalanced target class: {Counter(y)} Balanced target class:{Counter(y_ros)}")


## Training and testing data split

In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Split the ORIGINAL data first, then oversample only the training part.
# Splitting the already-oversampled x_ros / y_ros would place duplicates of the
# same minority row in both train and test and inflate every test metric.
# shuffle + stratify keep the class ratio the same in both parts, and
# random_state makes the split reproducible.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)

# Balance the classes in the training data only; the test set keeps the
# original class distribution.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train_raw, y_train_raw)

## Model building

In [None]:
# Author's rationale: a random forest suits this data because many features are
# yes/no style indicators that decision trees can split on directly.
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature choices so the fitted
# model and every score derived from it are reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.get_params()  # hyperparameters in use (library defaults except random_state)

rf.fit(X_train, y_train)

# Hard class predictions for the held-out test rows.
predictions = rf.predict(X_test)

## Let's see how well our model performed!

In [None]:

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report

# ROC-AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class rather than from hard 0/1 labels, which would reduce
# the ROC curve to a single operating point.
# Assumes a binary target whose positive class is rf.classes_[1].
positive_proba = rf.predict_proba(X_test)[:, 1]

# F1 and accuracy are threshold metrics and use the hard predictions.
print(f"Roc-Auc score: {roc_auc_score(y_test,positive_proba)},f1_score: {f1_score(y_test,predictions)},Accuracy: {accuracy_score(y_test,predictions)}")
print(classification_report(y_test, predictions))
# Treat these numbers as provisional until they are cross-checked with cross-validation.


## Cross validation

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Validate with stratified 3-fold cross-validation on the original data (X, y).
# Oversampling is placed inside an imblearn pipeline so that it is applied only
# to the training part of each fold; the validation part is never resampled and
# cannot share duplicated rows with the training part.
# cross_val_score clones the estimator for every fold, so the fitted rf itself
# is not modified; only its hyperparameters are reused.
skfold = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
cv_model = make_pipeline(RandomOverSampler(random_state=42), rf)
scores = cross_val_score(cv_model, X, y, cv=skfold)
# The printed value is the mean score across the folds.
print("best score:{:.3f}".format(np.mean(scores)))

## Testing on a new test set

In [None]:
# Write the full preprocessed dataframe to disk and read it back.
# index=False keeps the row index out of the file, so no spurious "Unnamed: 0"
# column appears on reload and has to be dropped again.
# NOTE(review): all_df is the same data X and y were taken from, so it includes
# rows the model was trained on; scores computed on it are not an estimate of
# performance on unseen data.
all_df.to_csv("all_test.csv", index=False)
new_test = pd.read_csv('./all_test.csv')
new_test_X = new_test.drop(columns='target')
new_test_y = new_test["target"]

In [None]:
# Use the already-fitted random forest to predict a class label for every row
# of new_test_X. No refitting happens here, and the result holds hard labels,
# not probabilities.
pred = rf.predict(new_test_X)
pred

In [None]:

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report

# ROC-AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class rather than from hard 0/1 labels.
# Assumes a binary target whose positive class is rf.classes_[1].
new_test_proba = rf.predict_proba(new_test_X)[:, 1]

# F1 and accuracy are threshold metrics and use the hard predictions in pred.
print(f"Roc-Auc score: {roc_auc_score(new_test_y,new_test_proba)},f1_score: {f1_score(new_test_y,pred)},Accuracy: {accuracy_score(new_test_y,pred)}")
print(classification_report(new_test_y, pred))

In [None]:
# Stratified 5-fold cross-validation of the random forest on the reloaded data.
from sklearn.model_selection import cross_val_score, StratifiedKFold

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, new_test_X, new_test_y, cv=skfold)
# The printed value is the mean score across the five folds.
print("best score:{:.3f}".format(scores.mean()))

In [None]:
import pickle

# Persist the fitted model to disk.
filename = 'finalized_model.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)
# Security note: only unpickle this file from a trusted location, because
# pickle.load can execute arbitrary code.

#### Please upvote this so I can make more of these. Cheers!!