#### Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import math
import datetime
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.utils import resample

In [2]:
df_main = pd.read_csv('Data.csv')

In [3]:
df_main.head()

Unnamed: 0,ID,Salary,DOJ,DOL,Original Designation,Designation,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,DomainScore,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience
0,42418,435000.0,5/1/2010 0:00,6/1/2014 0:00,electrical engineer,electrical engineer,Engineering,Bhopal,f,10/23/1987 0:00,65.0,cbse,2005,54.0,cbse,402,1,B.Tech/B.E.,biomedical engineering,65.9,402,0,Madhya Pradesh,2010,405,425,405,0.0,0,0,0,0,0,0,0,0.9737,-0.2793,-1.2148,-0.8778,0.5024
1,214964,315000.0,8/1/2011 0:00,7/1/2012 0:00,senior software engineer,senior software engineer,Engineering,Mumbai,f,12/25/1988 0:00,88.4,cbse,2006,79.2,cbse,332,1,B.Tech/B.E.,biomedical engineering,69.54,332,0,Punjab,2011,525,705,524,0.0,0,0,0,0,0,0,0,0.9737,-0.1232,-1.0697,1.5899,0.9763
2,900218,420000.0,7/1/2010 0:00,8/1/2011 0:00,management trainee,trainee manager,Management,NCR,m,1/11/1988 0:00,82.0,cbse,2005,75.8,cbse,443,1,M.Tech./M.E.,other,82.5,443,0,Uttar Pradesh,2014,625,520,705,0.0,0,0,0,0,0,0,0,-1.8825,0.5454,-0.9122,-0.1076,-1.4356
3,220655,400000.0,6/1/2011 0:00,3/1/2013 0:00,team leader,technical lead,Management,Chandigarh,m,11/16/1989 0:00,88.2,cbse,2007,77.8,cbse,285,1,B.Tech/B.E.,chemical engineering,62.4,285,0,Punjab,2011,595,695,575,0.0,0,0,0,0,0,0,0,-1.0355,0.9688,-0.6343,-0.4078,0.0284
4,962376,700000.0,7/1/2014 0:00,,product manager,product manager,Management,Jaipur,m,1/3/1992 0:00,87.4,cbse,2009,74.0,state board,436,1,M.Tech./M.E.,other,72.79,436,0,Uttarakhand,2014,625,450,655,0.0,0,0,0,0,0,0,0,-0.3027,-0.4536,-0.6048,-0.8682,0.0973


#### Data Cleansing

In [5]:
df = df_main.copy()
# Drop irrelevant columns
df.drop(['ID', 'DOJ', 'DOL','Original Designation','Designation', 'DomainScore'], axis=1, inplace=True)

# Trim all string inputs
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Encoding Gender as Binary (0 = Male, 1 = Female)
df['Gender'] = df['Gender'].apply(lambda x: 1 if x.lower() == 'f' else 0)

today = datetime.now()

# Covering DOB to Age
df['Age'] = df['DOB'].apply(lambda x: np.round((today - datetime.strptime(x, '%m/%d/%Y %H:%M')).days / 365),0)

NameError: name 'datetime' is not defined

In [138]:
cols = ['ComputerProgramming', 'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg', 'CivilEngg']
for col in cols:
    df[col] = df[col].apply(lambda x: np.nan if x == 0 else x/900)

df['SecScore'] = df[cols].mean(axis=1)
df['SecScore'] = df['SecScore'].apply(lambda x: -1 if math.isnan(x) else x)

for col in cols:
    df[col] = df[col].apply(lambda x: 0 if math.isnan(x) else 1)


In [139]:
# Rename columns
df = df.rename(columns={'conscientiousness': 'Conscientiousness',
                        'agreeableness': 'Agreeableness',
                        'extraversion': 'Extraversion',
                        'nueroticism': 'Nueroticism',
                        'openess_to_experience': 'OTE'})

# Create 1-hot features for categorical variables
df = pd.concat([df, pd.get_dummies(df['Degree'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['CollegeState'])], axis=1)

# Filter for top 20 cities based on the number of records. Some cities have only 1 record and this is done to limit the outliers
df = df[df.JobCity.isin(list(df.groupby(['JobCity']).size().nlargest(20).reset_index(drop=False)['JobCity']))]
df = pd.concat([df, pd.get_dummies(df['JobCity'])], axis=1)

# Drop irrelevant columns
df.drop(columns=['JobCity', 'DOB', 'Degree', 'CollegeState', '10percentage', '10board', '12graduation', '12percentage', '12board', 'CollegeID', 'CollegeCityID', 'CollegeCityTier'], axis=1, inplace=True)

# Create EECS as a combined specialization
df['Specialization'] = df['Specialization'].apply(lambda x: 'Other Specialization' if x.lower() == 'other' else x)
df['Specialization'] = df['Specialization'].apply(lambda x: 'Electrical Engineering & Computer Science' if x in ['electronics engineering', 'electrical engineering', 'computer science'] else x)
df['Specialization'] = df['Specialization'].apply(lambda x: x.title())
df = pd.concat([df, pd.get_dummies(df['Specialization'])], axis=1)
df = pd.concat([df, pd.get_dummies(df['Domain'])], axis=1)

df.drop(columns=['Specialization', 'Domain'], axis=1, inplace=True)

In [147]:
# Create salary buckets and drop records for salary less than 1L
X = d
X = df[df.Salary >= 100000]
y = X['Salary']
X.drop('Salary', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


3170

In [44]:
c.days/365

30.136986301369863