In [87]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.utils import resample

### Read in Dataset and Preprocess

In [5]:
students_df = pd.read_csv("train.csv", parse_dates=[2,3,9])

In [6]:
students_df.head()

Unnamed: 0,ID,Salary,DOJ,DOL,Original Designation,Designation,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,Domain.1,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience
0,308499,85000.0,2012-06-01,4/1/13 0:00,account executive,account executive,Business,Trivandrum,f,1988-08-02,77.0,cbse,2007,70.0,cbse,5442,2,B.Tech/B.E.,electronics engineering,69.34,5442,0,Kerala,2011,505,445,445,0.694479,365,366,-1,-1,-1,366,-1,0.9737,-0.9033,0.0914,-1.8179,-1.2354
1,622548,300000.0,2013-09-01,4/1/15 0:00,account executive,account executive,Business,NCR,f,1989-09-26,72.0,icse,2008,62.0,cbse,272,2,B.Tech/B.E.,computer science,72.18,272,0,Punjab,2012,500,470,445,-1.0,-1,-1,-1,-1,-1,-1,-1,0.1282,-0.4536,-0.1437,-0.6147,-0.4776
2,454921,320000.0,2013-08-01,present,account executive,account executive,Business,NCR,m,1988-10-30,82.0,,2008,60.0,,272,2,B.Tech/B.E.,information technology,80.0,272,0,Punjab,2012,585,525,475,0.735796,475,-1,-1,-1,-1,-1,-1,-0.1082,0.3448,-0.9245,-0.7603,-0.7615
3,302687,445000.0,2011-09-01,present,account executive,account executive,Business,Mumbai,m,1989-07-28,79.86,state board,2007,78.16,state board,1950,2,B.Tech/B.E.,electronics engineering,58.26,1950,0,Maharashtra,2011,455,445,455,0.229482,-1,266,-1,-1,-1,260,-1,-2.1175,-1.6833,1.1074,1.0024,-0.2875
4,669977,350000.0,2013-08-01,present,account manager,account manager,Management,NCR,m,1991-03-06,85.0,cbse,2009,76.0,cbse,9699,2,B.Tech/B.E.,computer science,71.0,9699,1,Rajasthan,2013,570,520,430,0.819417,505,-1,-1,-1,-1,-1,-1,1.5644,1.2114,0.6248,0.3995,0.8637


In [16]:
students_df['Specialization'] = students_df['Specialization'].astype('category')
students_df['SpecializationC'] = students_df['Specialization'].cat.codes

In [17]:
students_df['Degree'] = students_df['Degree'].astype('category')
students_df['DegreeC'] = students_df['Degree'].cat.codes

In [18]:
students_df['12board'] = students_df['12board'].astype('category')
students_df['12boardC'] = students_df['12board'].cat.codes

In [19]:
students_df['10board'] = students_df['10board'].astype('category')
students_df['10boardC'] = students_df['10board'].cat.codes

In [39]:
list(students_df.groupby(['JobCity']).size().nlargest(12).reset_index(drop=False)['JobCity'])

['NCR',
 'Bangalore',
 'Hyderabad',
 'Pune',
 'Chennai',
 'Mumbai',
 'Kolkata',
 'Jaipur',
 'Chandigarh',
 'Lucknow',
 'Bhubaneshwar',
 'Mysore']

In [42]:
students_filtered_df = students_df[students_df.JobCity.isin(list(students_df.groupby(['JobCity']).size().nlargest(12).reset_index(drop=False)['JobCity']))]
                                

In [43]:
students_filtered_df['JobCity'] = students_filtered_df['JobCity'].astype('category')
students_filtered_df['JobCityC'] = students_filtered_df['JobCity'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [44]:
students_filtered_df['Domain'] = students_filtered_df['Domain'].astype('category')
students_filtered_df['DomainC'] = students_filtered_df['Domain'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
students_filtered_df.head()

Unnamed: 0,ID,Salary,DOJ,DOL,Original Designation,Designation,Domain,JobCity,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,Degree,Specialization,collegeGPA,CollegeCityID,CollegeCityTier,CollegeState,GraduationYear,English,Logical,Quant,Domain.1,ComputerProgramming,ElectronicsAndSemicon,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,SpecializationC,DegreeC,12boardC,10boardC,JobCityC,DomainC
1,622548,300000.0,2013-09-01,4/1/15 0:00,account executive,account executive,Business,NCR,f,1989-09-26,72.0,icse,2008,62.0,cbse,272,2,B.Tech/B.E.,computer science,72.18,272,0,Punjab,2012,500,470,445,-1.0,-1,-1,-1,-1,-1,-1,-1,0.1282,-0.4536,-0.1437,-0.6147,-0.4776,4,0,0,2,10,1
2,454921,320000.0,2013-08-01,present,account executive,account executive,Business,NCR,m,1988-10-30,82.0,,2008,60.0,,272,2,B.Tech/B.E.,information technology,80.0,272,0,Punjab,2012,585,525,475,0.735796,475,-1,-1,-1,-1,-1,-1,-0.1082,0.3448,-0.9245,-0.7603,-0.7615,7,0,-1,-1,10,1
3,302687,445000.0,2011-09-01,present,account executive,account executive,Business,Mumbai,m,1989-07-28,79.86,state board,2007,78.16,state board,1950,2,B.Tech/B.E.,electronics engineering,58.26,1950,0,Maharashtra,2011,455,445,455,0.229482,-1,266,-1,-1,-1,260,-1,-2.1175,-1.6833,1.1074,1.0024,-0.2875,6,0,2,3,8,1
4,669977,350000.0,2013-08-01,present,account manager,account manager,Management,NCR,m,1991-03-06,85.0,cbse,2009,76.0,cbse,9699,2,B.Tech/B.E.,computer science,71.0,9699,1,Rajasthan,2013,570,520,430,0.819417,505,-1,-1,-1,-1,-1,-1,1.5644,1.2114,0.6248,0.3995,0.8637,4,0,0,0,10,6
8,59151,200000.0,2010-01-01,1/1/13 0:00,administrative support,admin assistant,Support,Bhubaneshwar,f,1985-06-01,68.0,state board,2002,45.0,state board,2198,2,MCA,computer application,79.0,2198,0,Maharashtra,2009,715,515,805,0.99925,745,-1,-1,-1,-1,-1,-1,0.9737,0.9688,1.5428,1.8249,1.2923,3,3,2,3,1,12


### Feature Selection

In [48]:
students_filtered_corr = pd.DataFrame(students_filtered_df.corr()['DomainC'])
students_filtered_corr.columns = ['Correlation']
students_filtered_corr['AbsCorrelation'] = students_filtered_corr['Correlation'].abs()
students_filtered_corr = students_filtered_corr.sort_values(by='AbsCorrelation', ascending=False)
students_filtered_corr.iloc[1:len(students_filtered_corr)-1, :1]

Unnamed: 0,Correlation
Quant,-0.090323
Salary,-0.085993
collegeGPA,-0.085591
MechanicalEngg,0.080048
Logical,-0.078186
10percentage,-0.07453
12percentage,-0.063702
ComputerProgramming,-0.051175
SpecializationC,0.042143
English,-0.041111


In [51]:
X = students_filtered_df[['JobCityC', 'Salary', 'Quant', 'Logical', 'English', 'collegeGPA', '10percentage', '12percentage' , 'ComputerProgramming', 'nueroticism', 'openess_to_experience', 'SpecializationC', 'DegreeC', 'CollegeTier', 'CollegeCityTier']]
Y = students_filtered_df[['DomainC']]

In [52]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [69]:
X_train['DomainC'] = Y_train['DomainC']
X_train.groupby(['DomainC']).size().nlargest(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


DomainC
3     1619
13     123
9      107
14      69
1       65
5       59
6       53
10      17
8       15
11      14
12      13
2       10
7        7
0        5
4        1
dtype: int64

In [72]:
X_train_0 = X_train[X_train.DomainC==0]
X_train_1 = X_train[X_train.DomainC==1]
X_train_2 = X_train[X_train.DomainC==2]
X_train_3 = X_train[X_train.DomainC==3]
X_train_4 = X_train[X_train.DomainC==4]
X_train_5 = X_train[X_train.DomainC==5]
X_train_6 = X_train[X_train.DomainC==6]
X_train_7 = X_train[X_train.DomainC==7]
X_train_8 = X_train[X_train.DomainC==8]
X_train_9 = X_train[X_train.DomainC==9]
X_train_10 = X_train[X_train.DomainC==10]
X_train_11 = X_train[X_train.DomainC==11]
X_train_12 = X_train[X_train.DomainC==12]
X_train_13 = X_train[X_train.DomainC==13]
X_train_14 = X_train[X_train.DomainC==14]
 
X_train_0_upsampled = resample(X_train_0, replace=True, n_samples=1619,random_state=123)
X_train_1_upsampled = resample(X_train_1, replace=True, n_samples=1619,random_state=123)
X_train_2_upsampled = resample(X_train_2, replace=True, n_samples=1619,random_state=123)
X_train_4_upsampled = resample(X_train_4, replace=True, n_samples=1619,random_state=123)
X_train_5_upsampled = resample(X_train_5, replace=True, n_samples=1619,random_state=123)
X_train_6_upsampled = resample(X_train_6, replace=True, n_samples=1619,random_state=123)
X_train_7_upsampled = resample(X_train_7, replace=True, n_samples=1619,random_state=123)
X_train_8_upsampled = resample(X_train_8, replace=True, n_samples=1619,random_state=123)
X_train_9_upsampled = resample(X_train_9, replace=True, n_samples=1619,random_state=123)
X_train_10_upsampled = resample(X_train_10, replace=True, n_samples=1619,random_state=123)
X_train_11_upsampled = resample(X_train_11, replace=True, n_samples=1619,random_state=123)
X_train_12_upsampled = resample(X_train_12, replace=True, n_samples=1619,random_state=123)
X_train_13_upsampled = resample(X_train_13, replace=True, n_samples=1619,random_state=123)
X_train_14_upsampled = resample(X_train_14, replace=True, n_samples=1619,random_state=123)
 
# Combine majority class with upsampled minority class
X_train_upsampled = pd.concat([X_train_0_upsampled, 
                               X_train_1_upsampled,
                               X_train_2_upsampled,
                               X_train_3,
                               X_train_4_upsampled,
                               X_train_5_upsampled,
                               X_train_6_upsampled,
                               X_train_7_upsampled,
                               X_train_8_upsampled,
                               X_train_9_upsampled,
                               X_train_10_upsampled,
                               X_train_11_upsampled,
                               X_train_12_upsampled,
                               X_train_13_upsampled,
                               X_train_14_upsampled])
 
# Display new class counts
X_train_upsampled.groupby(['DomainC']).size().nlargest(15)

DomainC
0     1619
1     1619
2     1619
3     1619
4     1619
5     1619
6     1619
7     1619
8     1619
9     1619
10    1619
11    1619
12    1619
13    1619
14    1619
dtype: int64

In [73]:
Y_train = X_train_upsampled['DomainC']
X_train = X_train_upsampled.drop('DomainC', axis=1)

### Modeling

In [88]:
#ovrc = OneVsRestClassifier(LinearSVC(random_state=0, class_weight='balanced'))
ovrc = OneVsRestClassifier(RandomForestClassifier())
Y_pred = ovrc.fit(X_train, Y_train).predict(X_test)

In [89]:
accuracy_score(Y_pred, Y_test)

0.67880085653104927

In [103]:
#roc_auc_score(Y_pred, Y_test)

In [90]:
pd.DataFrame(confusion_matrix(Y_pred, Y_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,25,8,629,1,34,25,2,13,57,9,6,4,54,28
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,4,0,1,1,0,0,2,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,5,0,0,1,0,1,1,0,0,0,1,0


In [92]:
ovrc.predict_proba(X_test)[0]

array([ 0.        ,  0.        ,  0.        ,  0.66666667,  0.        ,
        0.        ,  0.08333333,  0.        ,  0.        ,  0.16666667,
        0.        ,  0.        ,  0.08333333,  0.        ,  0.        ])

In [101]:
ovrc.decision_function(X_test)[0]

AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'

In [100]:
Y_pred

array([ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3, 12,  3,  3,  3,  3,  3,  3,  3,
        9,  3,  3,  3,  3,  3,  3,  3,  3,  3, 13,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3, 14,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3, 13,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  5,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  5,  3,  3,  3, 14,  3,  3,  3,  3,  3,  3,  3,  3,  3, 11,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3

In [107]:
dict( enumerate(students_filtered_df['Domain'].cat.categories) )

{0: '0',
 1: 'Business',
 2: 'Education',
 3: 'Engineering',
 4: 'Finance',
 5: 'HR',
 6: 'Management',
 7: 'Marketing',
 8: 'Operations',
 9: 'QA',
 10: 'Research',
 11: 'Sales',
 12: 'Support',
 13: 'Tech',
 14: 'UX/UI Design'}