In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score

In [2]:
#code for custom scaler
#for scaling selected columns in a dataframe
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardise only a chosen subset of DataFrame columns.

    The listed columns are passed through a ``StandardScaler``; every other
    column is returned unchanged, and the original column order is preserved.

    Parameters
    ----------
    columns : list of column labels to standardise.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : the underlying ``StandardScaler``.
    mean_ : per-column mean of the scaled columns (set by ``fit``).
    var_ : per-column population variance (ddof=0) of the scaled columns
        (set by ``fit``).
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Every constructor argument is stored under its own name because
        # BaseEstimator.get_params()/clone() read them back via getattr.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the inner scaler on ``X[self.columns]`` and record column statistics.

        Returns ``self`` so calls can be chained.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var on a DataFrame depend on
        # pandas' axis=None semantics, which differ between pandas versions.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardised.

        ``y`` and ``copy`` are accepted for signature compatibility and ignored.
        """
        init_col_order = X.columns
        # Re-attach X's index to the scaled values; without it the frame gets a
        # fresh RangeIndex and the concat below misaligns rows whenever X has a
        # non-default index (e.g. after train_test_split or filtering).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [3]:
#importing the required csv
raw_dataset=pd.read_csv('HR_train_set.csv')
#keep an untouched snapshot: a plain assignment would only alias raw_dataset,
#so the imputations below would overwrite the "saved" raw data as well
saved_raw_dataset=raw_dataset.copy()

#the mode and median of the education column is Bachelor's and hence we impute the missing values with Bachelor's
raw_dataset['education']=raw_dataset['education'].fillna("Bachelor's")

#for imputing previous years rating we used mode.
#mode of the dataset=3

#raw_dataset['previous_year_rating']=raw_dataset['previous_year_rating'].fillna(raw_dataset['previous_year_rating'].mean())
raw_dataset['previous_year_rating']=raw_dataset['previous_year_rating'].fillna(3.0)

#saving the dataset with no null values
df_nonull=raw_dataset

#one hot encoding for departments (first level is dropped and acts as the baseline)
dept_dummies=pd.get_dummies(df_nonull['department'], prefix='dept', drop_first=True)
df_nonull=pd.concat([df_nonull, dept_dummies], axis=1).drop(columns=['department'])

#binary label encoding for gender
df_nonull['gender']=df_nonull['gender'].map({'f':0, 'm':1})

#label encoding for education
df_nonull['education']=df_nonull['education'].map({"Master's & above":2, "Bachelor's":1, 'Below Secondary':0})

#one hot encoding for recruitment_channel; 'other' is dropped and acts as the baseline
recruit_dummies=pd.get_dummies(df_nonull['recruitment_channel'], prefix='recruit', drop_first=False)
df_nonull=pd.concat([df_nonull, recruit_dummies], axis=1).drop(columns=['recruitment_channel', 'recruit_other'])

#label encoding for region: 'region_<k>' -> k for k = 1..34
region_codes={'region_%d' % k: k for k in range(1, 35)}
df_nonull['region']=df_nonull['region'].map(region_codes)

#dropping employee id
df_nonull=df_nonull.drop(columns=['employee_id'])

#defining targets and inputs
targets=df_nonull['is_promoted']
df_nonull=df_nonull.drop(columns=['is_promoted'])
unscaled_inputs=df_nonull

#custom scaling the required features
#binary / one-hot / ordinal columns are left unscaled
columns_to_omit = ['gender', 'KPIs_met >80%', 'awards_won?', 'dept_Finance',
                   'dept_HR', 'dept_Legal', 'dept_Operations', 'dept_Procurement',
                   'dept_R&D', 'dept_Sales & Marketing', 'dept_Technology',
                   'recruit_referred', 'recruit_sourcing', 'education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

HR_scaler = CustomScaler(columns_to_scale)
HR_scaler.fit(unscaled_inputs)
scaled_inputs = HR_scaler.transform(unscaled_inputs)

#splitting the targets and features into train and test set
#(disabled: the model is trained on the full training file)
#x_train, x_test, y_train, y_test=train_test_split(scaled_inputs, targets, test_size=0.2, random_state=42)
x_train=scaled_inputs
y_train=targets

#applying logistic regression
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

#predicting values
y_hat_train = logisticRegr.predict(x_train)
#y_hat_test = logisticRegr.predict(x_test)

#scores
#note: this is accuracy on the training data itself, not a held-out estimate
#score_test = logisticRegr.score(x_test, y_test)
score_train = logisticRegr.score(x_train, y_train)

print(score_train)
#print(score_test)

#creating a summary table to know about the importances of each feature
    #feature_name = unscaled_inputs.columns.values
    #summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)
    #summary_table['Coefficient'] = np.transpose(logisticRegr.coef_)
    #summary_table.index = summary_table.index + 1
    #summary_table.loc[0] = ['Intercept', logisticRegr.intercept_[0]]
    #summary_table = summary_table.sort_index()
    #summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
    #summary_table=summary_table.sort_values('Odds_ratio', ascending=False)
    #summary_table

#assembling the prediction in the form of a dataframe
#results = pd.DataFrame (columns=['employee_id'], data = raw_dataset['employee_id'])
#results['is_promoted'] = np.transpose(y_hat_train)
#results.to_csv('results.csv', index=False)

In [4]:
#loading the test set and applying the same preprocessing as the training set
testing_data=pd.read_csv('HR_test_set.csv')

#imputing missing values with the same constants used for training
testing_data['education']=testing_data['education'].fillna("Bachelor's")
testing_data['previous_year_rating']=testing_data['previous_year_rating'].fillna(3.0)

#one hot encoding for departments
testing_data=pd.concat([testing_data,pd.get_dummies(testing_data['department'], prefix='dept', drop_first=True)], axis=1)
testing_data=testing_data.drop(columns=['department'])

#label encoding for gender and education
testing_data['gender']=testing_data['gender'].map({'f':0, 'm':1})
testing_data['education']=testing_data['education'].map({"Master's & above":2, "Bachelor's":1, 'Below Secondary':0})

#one hot encoding for recruitment_channel; 'other' is the dropped baseline
testing_data=pd.concat([testing_data,pd.get_dummies(testing_data['recruitment_channel'], prefix='recruit', drop_first=False)], axis=1)
testing_data=testing_data.drop(columns=['recruitment_channel', 'recruit_other'])

#label encoding for region: 'region_<k>' -> k for k = 1..34
testing_data['region']=testing_data['region'].map({'region_%d' % k: k for k in range(1, 35)})

unscaled_inputs_final=testing_data.drop(['employee_id'],axis=1)

#the dummies above are derived from the test file alone, so a category that is
#absent from it would leave a feature column missing (or in a different order)
#compared with what the model was fitted on; align to the training columns.
#only one-hot columns may be zero-filled - anything else missing is an error.
missing_features=[c for c in x_train.columns if c not in unscaled_inputs_final.columns]
not_dummy=[c for c in missing_features if not c.startswith(('dept_', 'recruit_'))]
if not_dummy:
    raise KeyError('test set is missing required feature columns: %s' % not_dummy)
unscaled_inputs_final=unscaled_inputs_final.reindex(columns=x_train.columns, fill_value=0)

#scaling with the scaler fitted on the training data (no re-fitting on test data)
scaled_inputs_final = HR_scaler.transform(unscaled_inputs_final)

#predicted is_promoted labels for the test set
y_test= logisticRegr.predict(scaled_inputs_final)


#assembling the prediction in the form of a dataframe
results = testing_data[['employee_id']].copy()
results['is_promoted'] = y_test
#results.to_csv('results.csv', index=False)
#results