In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.utils import resample

### Read in Dataset and Preprocess

In [3]:
df = pd.read_csv("train.csv", parse_dates=[2,3,9])

In [5]:
df.head()

Unnamed: 0,ID,Salary,DOJ,DOL,Original Designation,Designation,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,DomainScore,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience
0,42418,435000.0,2010-05-01,2014-06-01,electrical engineer,electrical engineer,Engineering,Bhopal,f,1987-10-23,65.0,cbse,2005,54.0,cbse,402,1,B.Tech/B.E.,biomedical engineering,65.9,402,0,Madhya Pradesh,2010,405,425,405,0.0,0,0,0,0,0,0,0,0.9737,-0.2793,-1.2148,-0.8778,0.5024
1,214964,315000.0,2011-08-01,2012-07-01,senior software engineer,senior software engineer,Engineering,Mumbai,f,1988-12-25,88.4,cbse,2006,79.2,cbse,332,1,B.Tech/B.E.,biomedical engineering,69.54,332,0,Punjab,2011,525,705,524,0.0,0,0,0,0,0,0,0,0.9737,-0.1232,-1.0697,1.5899,0.9763
2,900218,420000.0,2010-07-01,2011-08-01,management trainee,trainee manager,Management,NCR,m,1988-01-11,82.0,cbse,2005,75.8,cbse,443,1,M.Tech./M.E.,other,82.5,443,0,Uttar Pradesh,2014,625,520,705,0.0,0,0,0,0,0,0,0,-1.8825,0.5454,-0.9122,-0.1076,-1.4356
3,220655,400000.0,2011-06-01,2013-03-01,team leader,technical lead,Management,Chandigarh,m,1989-11-16,88.2,cbse,2007,77.8,cbse,285,1,B.Tech/B.E.,chemical engineering,62.4,285,0,Punjab,2011,595,695,575,0.0,0,0,0,0,0,0,0,-1.0355,0.9688,-0.6343,-0.4078,0.0284
4,962376,700000.0,2014-07-01,NaT,product manager,product manager,Management,Jaipur,m,1992-01-03,87.4,cbse,2009,74.0,state board,436,1,M.Tech./M.E.,other,72.79,436,0,Uttarakhand,2014,625,450,655,0.0,0,0,0,0,0,0,0,-0.3027,-0.4536,-0.6048,-0.8682,0.0973


Convert categorical data into codes for ML input

In [6]:
students_df['Specialization'] = students_df['Specialization'].astype('category')
students_df['SpecializationC'] = students_df['Specialization'].cat.codes

In [5]:
students_df['Degree'] = students_df['Degree'].astype('category')
students_df['DegreeC'] = students_df['Degree'].cat.codes

In [6]:
students_df['12board'] = students_df['12board'].astype('category')
students_df['12boardC'] = students_df['12board'].cat.codes

In [7]:
students_df['10board'] = students_df['10board'].astype('category')
students_df['10boardC'] = students_df['10board'].cat.codes

In [8]:
students_df['Domain'] = students_df['Domain'].astype('category')
students_df['DomainC'] = students_df['Domain'].cat.codes

In [8]:
students_df['SpecializationC']

0       0
1       0
2       9
3       1
4       9
5       1
6       2
7       2
8       6
9       2
10      1
11      4
12      2
13      6
14      7
15      4
16      5
17      3
18      6
19      8
20      2
21      6
22      6
23      2
24      8
25      0
26      4
27      4
28      0
29      4
       ..
3958    5
3959    6
3960    5
3961    6
3962    6
3963    6
3964    6
3965    6
3966    5
3967    6
3968    5
3969    6
3970    5
3971    6
3972    6
3973    6
3974    6
3975    6
3976    5
3977    6
3978    5
3979    5
3980    5
3981    5
3982    5
3983    6
3984    6
3985    6
3986    5
3987    5
Name: SpecializationC, Length: 3988, dtype: int8

Consider cities which have enough records to train (>10)

In [9]:
students_df.groupby(['JobCity']).size().nlargest(20).reset_index(drop=False)

Unnamed: 0,JobCity,0
0,NCR,909
1,Banagalore,687
2,Hyderabad,373
3,Pune,328
4,Chennai,315
5,Mumbai,161
6,Kolkata,120
7,Jaipur,53
8,Chandigarh,44
9,Lucknow,41


In [10]:
students_filtered_df = students_df[students_df.JobCity.isin(list(students_df.groupby(['JobCity']).size().nlargest(20).reset_index(drop=False)['JobCity']))]
                                

In [11]:
print("Total records: {} New total records after filtering on JobCity: {}".format(len(students_df), len(students_filtered_df)))

Total records: 3988 New total records after filtering on JobCity: 3249


In [12]:
students_filtered_df['JobCity'] = students_filtered_df['JobCity'].astype('category')
students_filtered_df['JobCityC'] = students_filtered_df['JobCity'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Feature Selection

In [13]:
students_filtered_corr = pd.DataFrame(students_filtered_df.corr()['Salary'])
students_filtered_corr.columns = ['Correlation']
students_filtered_corr['AbsCorrelation'] = students_filtered_corr['Correlation'].abs()
students_filtered_corr = students_filtered_corr.sort_values(by='AbsCorrelation', ascending=False)
students_filtered_corr.iloc[1:10, :1]
#students_filtered_corr.iloc[1:len(students_filtered_corr)-1, :1]

Unnamed: 0,Correlation
ID,-0.253713
Quant,0.218892
12percentage,0.194939
Logical,0.188345
CollegeTier,-0.18276
10percentage,0.179357
12graduation,-0.161261
English,0.156043
collegeGPA,0.152831


In [14]:
students_filtered_corr = pd.DataFrame(students_filtered_df.corr()['JobCityC'])
students_filtered_corr.columns = ['Correlation']
students_filtered_corr['AbsCorrelation'] = students_filtered_corr['Correlation'].abs()
students_filtered_corr = students_filtered_corr.sort_values(by='AbsCorrelation', ascending=False)
students_filtered_corr.iloc[1:10, :1]
#students_filtered_corr.iloc[1:len(students_filtered_corr)-1, :1]

Unnamed: 0,Correlation
12boardC,-0.176343
10percentage,-0.174741
10boardC,-0.160499
collegeGPA,-0.156092
CollegeCityTier,-0.155392
12percentage,-0.150196
CollegeTier,-0.08801
conscientiousness,-0.084081
ComputerProgramming,-0.081659


In [15]:
students_filtered_corr = pd.DataFrame(students_filtered_df.corr()['DomainC'])
students_filtered_corr.columns = ['Correlation']
students_filtered_corr['AbsCorrelation'] = students_filtered_corr['Correlation'].abs()
students_filtered_corr = students_filtered_corr.sort_values(by='AbsCorrelation', ascending=False)
students_filtered_corr.iloc[1:10, :1]
#students_filtered_corr.iloc[1:len(students_filtered_corr)-1, :1]

Unnamed: 0,Correlation
MechanicalEngg,0.079297
Quant,-0.079118
Salary,-0.075744
Logical,-0.071384
collegeGPA,-0.067256
SpecializationC,0.055716
ComputerProgramming,-0.046325
openess_to_experience,0.039064
CollegeCityTier,0.038137


In [24]:
X = students_filtered_df[['JobCityC', 'Salary', 'Quant', 'Logical', 'English', 'collegeGPA', '10percentage', '12percentage' , 'ComputerProgramming', 'openess_to_experience', 'conscientiousness', 'agreeableness', 'nueroticism', 'SpecializationC', 'DegreeC', 'CollegeTier']]
Y = students_filtered_df[['DomainC']]

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [26]:
X_train['DomainC'] = Y_train['DomainC']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Upsample based on Domain

In [27]:
X_train_ = list()
X_train_upsampled_ = list()
X_train_upsampled = pd.DataFrame()
max_samples = list(X_train.groupby(['DomainC']).size().nlargest(1))[0]
for i in set(X_train.DomainC):
    X_train_.append(X_train[X_train.DomainC==i])
    X_train_upsampled_.append(resample(X_train_[i], replace=True, n_samples=max_samples, random_state=123))
    X_train_upsampled = pd.concat([X_train_upsampled, X_train_upsampled_[i]])
# Display new class counts
X_train_upsampled.groupby(['DomainC']).size().nlargest(15)


DomainC
0     1556
1     1556
2     1556
3     1556
4     1556
5     1556
6     1556
7     1556
8     1556
9     1556
10    1556
11    1556
12    1556
13    1556
dtype: int64

In [28]:
Y_train = X_train_upsampled['DomainC']
X_train = X_train_upsampled.drop('DomainC', axis=1)

In [29]:
X_train.head()

Unnamed: 0,JobCityC,Salary,Quant,Logical,English,collegeGPA,10percentage,12percentage,ComputerProgramming,openess_to_experience,conscientiousness,agreeableness,nueroticism,SpecializationC,DegreeC,CollegeTier
3098,1,300000.0,575,580,580,79.0,79.0,71.0,565,-0.6692,1.2772,1.5444,-1.5021,4,0,2
1271,17,390000.0,495,655,485,82.7,82.0,70.0,525,-0.2859,0.99,1.3779,-0.2344,8,0,2
2528,17,500000.0,885,560,710,94.6,95.2,91.4,445,0.4805,0.1282,0.2124,-0.4879,5,0,1
2280,4,100000.0,380,590,500,63.0,82.0,59.0,0,-0.0943,-0.8772,0.2124,-0.7415,6,0,2
2002,18,305000.0,705,545,695,55.55,73.33,57.67,0,0.1864,-1.6538,0.6568,1.3549,6,0,1


### Modeling

In [30]:
#ovrc = OneVsRestClassifier(LinearSVC(random_state=0, class_weight='balanced'))
ovrc = OneVsRestClassifier(RandomForestClassifier())
Y_pred = ovrc.fit(X_train, Y_train).predict(X_test)

In [31]:
accuracy_score(Y_pred, Y_test)

0.5938461538461538

In [32]:
#roc_auc_score(Y_pred, Y_test)

In [33]:
pd.DataFrame(confusion_matrix(Y_pred, Y_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,23,3,570,1,15,21,3,10,42,9,4,19,82,31
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,2,0,0,0,0,0,1,0,0,0,0,0
5,2,0,5,0,0,0,0,0,1,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8,0,0,21,0,0,0,0,1,0,0,0,1,1,4
9,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [34]:
ovrc.predict_proba(X_test)[0]

array([ 0.        ,  0.        ,  0.71428571,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.14285714,  0.        ,
        0.        ,  0.        ,  0.14285714,  0.        ])

In [35]:
ovrc.decision_function(X_test)[0]

AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'

In [36]:
Y_pred

array([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  9, 12,  2, 12, 11,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  8,  2,  2,  2,  2,  2,  2, 12, 12,
        2,  2,  2,  2, 12,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2, 12,  2,  4,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2, 13,  2,  2,  2, 13,  2,  2,  0,  2,  2,  2,  2,  2,  2,  2, 12,
        2,  2, 12,  2,  2,  2,  2,  8,  2, 11,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  0,
        2,  2,  8,  2,  2,  2,  2,  2,  2,  2,  2, 11,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  5,  2,  2,  2,  2,  2,
        2,  2, 12,  2,  2,  2, 11,  2,  2,  2,  2,  8,  2,  2,  2,  2,  2,
        2,  2, 12,  2, 12,  2,  2, 12,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  8,  2,  2,  2,  8,  2,  2,
        2,  2,  2,  2,  2,  2,  5,  2,  2,  2,  2,  2,  2,  2,  2, 13,  2,
       13,  2,  2,  2,  2

In [37]:
dict( enumerate(students_filtered_df['Domain'].cat.categories) )

{0: 'Business',
 1: 'Education',
 2: 'Engineering',
 3: 'Finance',
 4: 'HR',
 5: 'Management',
 6: 'Marketing',
 7: 'Operations',
 8: 'QA',
 9: 'Research',
 10: 'Sales',
 11: 'Support',
 12: 'Tech',
 13: 'UX/UI Design'}