#### Imports

In [85]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import math

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.utils import resample

In [2]:
df_main = pd.read_csv('Data.csv')

In [7]:
df_main.head()

Unnamed: 0,ID,Salary,DOJ,DOL,Original Designation,Designation,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,DomainScore,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience
0,42418,435000.0,5/1/2010 0:00,6/1/2014 0:00,electrical engineer,electrical engineer,Engineering,Bhopal,f,10/23/1987 0:00,65.0,cbse,2005,54.0,cbse,402,1,B.Tech/B.E.,biomedical engineering,65.9,402,0,Madhya Pradesh,2010,405,425,405,0.0,0,0,0,0,0,0,0,0.9737,-0.2793,-1.2148,-0.8778,0.5024
1,214964,315000.0,8/1/2011 0:00,7/1/2012 0:00,senior software engineer,senior software engineer,Engineering,Mumbai,f,12/25/1988 0:00,88.4,cbse,2006,79.2,cbse,332,1,B.Tech/B.E.,biomedical engineering,69.54,332,0,Punjab,2011,525,705,524,0.0,0,0,0,0,0,0,0,0.9737,-0.1232,-1.0697,1.5899,0.9763
2,900218,420000.0,7/1/2010 0:00,8/1/2011 0:00,management trainee,trainee manager,Management,NCR,m,1/11/1988 0:00,82.0,cbse,2005,75.8,cbse,443,1,M.Tech./M.E.,other,82.5,443,0,Uttar Pradesh,2014,625,520,705,0.0,0,0,0,0,0,0,0,-1.8825,0.5454,-0.9122,-0.1076,-1.4356
3,220655,400000.0,6/1/2011 0:00,3/1/2013 0:00,team leader,technical lead,Management,Chandigarh,m,11/16/1989 0:00,88.2,cbse,2007,77.8,cbse,285,1,B.Tech/B.E.,chemical engineering,62.4,285,0,Punjab,2011,595,695,575,0.0,0,0,0,0,0,0,0,-1.0355,0.9688,-0.6343,-0.4078,0.0284
4,962376,700000.0,7/1/2014 0:00,,product manager,product manager,Management,Jaipur,m,1/3/1992 0:00,87.4,cbse,2009,74.0,state board,436,1,M.Tech./M.E.,other,72.79,436,0,Uttarakhand,2014,625,450,655,0.0,0,0,0,0,0,0,0,-0.3027,-0.4536,-0.6048,-0.8682,0.0973


#### Data Cleansing

In [105]:
df = df_main.copy()
# Drop irrelevant columns
df.drop(columns=['ID', 'DOJ', 'DOL','Original Designation','Designation', 'DomainScore'], axis=1, inplace=True)

# Trim all string inputs
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Encoding Gender as Binary (0 = Male, 1 = Female)
df['Gender'] = df['Gender'].apply(lambda x: 1 if x.lower() == 'f' else 0)

today = datetime.now()

# Covering DOB to Age
df['Age'] = df['DOB'].apply(lambda x: np.round((today - datetime.strptime(x, '%m/%d/%Y %H:%M')).days / 365),0)

In [106]:
cols = ['ComputerProgramming', 'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg', 'CivilEngg']
for col in cols:
    df[col] = df[col].apply(lambda x: np.nan if x == 0 else x/900)

df['SecScore'] = df[cols].mean(axis=1)
df['SecScore'] = df['SecScore'].apply(lambda x: -1 if math.isnan(x) else x)

for col in cols:
    df[col] = df[col].apply(lambda x: 0 if math.isnan(x) else 1)


In [107]:
df = df.rename(columns={'conscientiousness': 'Conscientiousness',
                        'agreeableness': 'Agreeableness',
                        'extraversion': 'Extraversion',
                        'nueroticism': 'Nueroticism',
                        'openess_to_experience': 'OTE'})

df = pd.concat([df, pd.get_dummies(df['Degree'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['CollegeState'])], axis=1)

df = df[df.JobCity.isin(list(df.groupby(['JobCity']).size().nlargest(20).reset_index(drop=False)['JobCity']))]
df = pd.concat([df, pd.get_dummies(df['JobCity'])], axis=1)

In [109]:
df

Unnamed: 0,Salary,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,Conscientiousness,Agreeableness,Extraversion,Nueroticism,OTE,Age,SecScore,B.Tech/B.E.,M.Sc. (Tech.),M.Tech./M.E.,MCA,Andhra Pradesh,Assam,Bihar,Chhattisgarh,Delhi,Goa,Gujarat,Haryana,Himachal Pradesh,Jammu and Kashmir,Jharkhand,Karnataka,Kerala,Madhya Pradesh,Maharashtra,Meghalaya,Orissa,Punjab,Rajasthan,Sikkim,Tamil Nadu,Telangana,Union Territory,Uttar Pradesh,Uttarakhand,West Bengal,Ahmedabad,Banagalore,Bhopal,Bhubaneshwar,Chandigarh,Chennai,Cochin,Coimbatore,Dehradun,Hyderabad,Indore,Jaipur,Kolkata,Lucknow,Mangalore,Mumbai,Mysore,NCR,Pune,Trivandrum
0,435000.0,Engineering,Bhopal,1,10/23/1987 0:00,65.00,cbse,2005,54.00,cbse,402,1,B.Tech/B.E.,biomedical engineering,65.90,402,0,Madhya Pradesh,2010,405,425,405,0,0,0,0,0,0,0,0.9737,-0.2793,-1.2148,-0.8778,0.5024,30,-1.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,315000.0,Engineering,Mumbai,1,12/25/1988 0:00,88.40,cbse,2006,79.20,cbse,332,1,B.Tech/B.E.,biomedical engineering,69.54,332,0,Punjab,2011,525,705,524,0,0,0,0,0,0,0,0.9737,-0.1232,-1.0697,1.5899,0.9763,29,-1.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,420000.0,Management,NCR,0,1/11/1988 0:00,82.00,cbse,2005,75.80,cbse,443,1,M.Tech./M.E.,other,82.50,443,0,Uttar Pradesh,2014,625,520,705,0,0,0,0,0,0,0,-1.8825,0.5454,-0.9122,-0.1076,-1.4356,30,-1.000000,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,400000.0,Management,Chandigarh,0,11/16/1989 0:00,88.20,cbse,2007,77.80,cbse,285,1,B.Tech/B.E.,chemical engineering,62.40,285,0,Punjab,2011,595,695,575,0,0,0,0,0,0,0,-1.0355,0.9688,-0.6343,-0.4078,0.0284,28,-1.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,700000.0,Management,Jaipur,0,1/3/1992 0:00,87.40,cbse,2009,74.00,state board,436,1,M.Tech./M.E.,other,72.79,436,0,Uttarakhand,2014,625,450,655,0,0,0,0,0,0,0,-0.3027,-0.4536,-0.6048,-0.8682,0.0973,26,-1.000000,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,720000.0,Tech,NCR,0,8/24/1991 0:00,87.80,icse,2008,74.40,cbse,285,1,B.Tech/B.E.,civil engineering,74.70,285,0,Punjab,2013,545,545,590,0,0,0,0,0,0,1,-1.1644,0.3789,0.3174,1.1601,-0.2859,26,0.555556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,600000.0,Engineering,NCR,0,7/1/1992 0:00,92.00,cbse,2009,90.20,state board,436,1,B.Tech/B.E.,civil engineering,77.78,436,0,Uttarakhand,2013,730,655,780,0,0,0,0,0,0,1,0.8463,0.8784,0.4711,-2.0092,-0.2859,25,0.431111,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,315000.0,Engineering,Hyderabad,1,6/4/1993 0:00,87.00,state board,2010,89.00,state board,12949,1,B.Tech/B.E.,electronics engineering,80.00,12949,1,Telangana,2014,640,525,545,0,0,0,0,0,0,0,1.1336,1.0449,0.4711,-1.6289,0.8637,25,-1.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,300000.0,Management,Pune,1,5/10/1992 0:00,93.38,state board,2010,78.50,state board,11973,1,B.Tech/B.E.,civil engineering,65.70,11973,1,Maharashtra,2014,545,605,680,0,0,0,0,0,0,1,0.2718,-0.4536,0.4711,-1.1218,0.0973,26,0.431111,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
10,730000.0,Engineering,Pune,0,11/15/1989 0:00,84.67,,2006,86.46,,264,1,B.Tech/B.E.,chemical engineering,85.80,264,1,Rajasthan,2011,545,525,715,0,0,0,0,0,0,0,-1.3447,-1.0593,0.6720,1.0024,-1.7093,28,-1.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [111]:
df['Age']

0       30
1       29
2       30
3       28
4       26
6       26
7       25
8       25
9       26
10      28
11      26
13      27
14      31
15      24
17      28
18      28
20      26
21      26
22      29
23      25
24      32
25      28
27      27
29      29
30      26
31      27
34      28
35      25
36      30
37      26
        ..
3950    26
3952    28
3953    25
3954    26
3955    24
3956    26
3957    27
3958    27
3959    26
3960    25
3961    26
3962    27
3963    25
3966    25
3967    27
3968    26
3969    26
3970    25
3972    25
3973    27
3974    26
3975    25
3977    25
3978    26
3979    25
3980    25
3981    26
3982    27
3984    28
3987    24
Name: Age, Length: 3252, dtype: object

In [44]:
c.days/365

30.136986301369863