In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [24]:
# Load the labelled training set and the unlabelled test set, then report
# the (rows, columns) shape of each.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
for frame in (train_data, test_data):
    print(frame.shape)

(18359, 14)
(15021, 13)


In [25]:
# Preview the first five training rows (5 is the default for head()).
train_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1


In [26]:
# Number of missing values in each training column.
train_data.isnull().sum(axis=0)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [27]:
# Stack train and test rows so the feature engineering on tot_data is applied
# to both. test.csv has no 'target' column, so 'target' is NaN for its rows.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own 0..k labels, which keeps label-based alignment
# (Series assignment, .loc) unambiguous.
tot_data=pd.concat([train_data,test_data],axis=0,ignore_index=True)

In [28]:
#New column - count_nas - number of NANs among the feature columns of each row
#
# 'target' is excluded from the count: it is NaN for every row that came from
# test.csv (that file has no target column), so counting it would add 1 to
# count_nas for all test rows and make the feature differ between train and test.

feature_frame=tot_data.drop('target',axis=1,errors='ignore')
tot_data['count_nas']=feature_frame.isnull().sum(axis=1)

In [29]:
#New column - exp_new_col - 1 when experience and gender are both NAN, otherwise 0.

both_missing=tot_data[['experience','gender']].isnull().all(axis=1)

# .values assigns positionally, exactly like the plain list used before.
tot_data['exp_new_col']=both_missing.astype('int64').values

In [30]:
#New column - last_new_col - 1 when last_new_job and gender are both NAN, otherwise 0.

job_and_gender_missing=tot_data[['last_new_job','gender']].isnull().all(axis=1)

# .values assigns positionally, exactly like the plain list used before.
tot_data['last_new_col']=job_and_gender_missing.astype('int64').values

In [31]:
#Fill NANs in gender as Male
# NOTE(review): presumably 'Male' is the most frequent gender value - confirm.

gender_missing=tot_data['gender'].isnull()
tot_data.loc[gender_missing,'gender']='Male'

In [32]:
#Mean encoding of City variable
#
# city_new = 1 - P(target == 0 | city), estimated from the labelled training
# rows only and then applied to every row of tot_data (train and test).
# NOTE(review): the rate is computed on the same training rows it is attached
# to, so it contains each row's own label - consider out-of-fold encoding.

is_target_zero=train_data['target'].eq(0)
city_rate=1-is_target_zero.groupby(train_data['city']).mean()

# A city that never occurs in train.csv has no entry in city_rate; give it the
# overall training rate instead of failing. A city whose training rows are all
# target != 0 is handled by the groupby itself (rate 1.0).
overall_rate=1-is_target_zero.mean()
city_new=tot_data['city'].map(city_rate).fillna(overall_rate)

tot_data.drop('city',axis=1,inplace=True)
# .values assigns positionally, matching the list-based assignment used before.
tot_data['city_new']=city_new.values

In [33]:
#Fill NAN value in enrolled_university as no_enrollment

enrolled=tot_data['enrolled_university']
tot_data['enrolled_university']=enrolled.where(enrolled.notnull(),"no_enrollment")

In [34]:
#Set education_level to 'High School' wherever major_discipline is NAN.
# NOTE(review): this also overwrites education levels that were present
# (e.g. 'Graduate') on rows whose major is missing - confirm that is intended.
major_missing=tot_data['major_discipline'].isnull()
tot_data.loc[major_missing,'education_level']='High School'

#Treat 'Primary School' as 'High School', then default any education_level
#that is still NAN to 'Graduate'.
education=tot_data['education_level'].replace('Primary School','High School')
tot_data['education_level']=education.fillna('Graduate')

In [35]:
#Replacing NAN in major_discipline with 'No Major'
#
# Filled directly; the previous version first wrote the placeholder string
# 'NaN' and then replaced it in a Python loop, which produced the same values.

tot_data['major_discipline']=tot_data['major_discipline'].fillna('No Major')

In [36]:
# Convert the open-ended experience buckets to numbers ('<1' -> 0, '>20' -> 21)
# and make the whole column numeric.
tot_data['experience']=tot_data['experience'].replace({'<1': 0, '>20': 21})
tot_data['experience']=pd.to_numeric(tot_data['experience'])

#Replacing NANs in experience with mean of the variable (rounded to whole years)
#
# The fill value is derived from the computed mean instead of a hard-coded 11,
# so it stays correct if the input data changes. The mean is taken over the
# combined train and test rows in tot_data.
experience_mean=np.mean(tot_data['experience'])
print(experience_mean)
tot_data['experience']=tot_data['experience'].fillna(int(round(experience_mean)))

10.563963097635003


In [37]:
#New column - 'company_nas' - Number of NAN values in company_size and company_type combined
# (0, 1 or 2 per row). Must run before those two columns are filled with 'unk'.

company_cols=['company_size','company_type']
tot_data['company_nas']=tot_data[company_cols].isnull().sum(axis=1)

In [38]:
#Replacing NAN in company_size and company_type with 'unk'

for company_col in ('company_size','company_type'):
    tot_data[company_col]=tot_data[company_col].fillna('unk')

In [39]:
#Impute missing last_new_job from experience:
#  experience == 0 -> 'never'
#  experience == 1 -> '1'
#  any other missing value -> '1'

job_missing=tot_data['last_new_job'].isnull()
tot_data.loc[job_missing & (tot_data['experience']==0),'last_new_job']='never'
tot_data.loc[job_missing & (tot_data['experience']==1),'last_new_job']='1'
tot_data['last_new_job']=tot_data['last_new_job'].fillna('1')

#Encoding last_new_job to numeric; a value not listed here would become NaN
last_new_job_codes={'never':0,'1':1,'2':2,'3':3,'4':4,'>4':5}
tot_data['last_new_job']=tot_data['last_new_job'].map(last_new_job_codes)

In [40]:
# Preview the first five rows of the combined frame.
tot_data.head(5)

Unnamed: 0,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,gender,last_new_job,major_discipline,relevent_experience,target,training_hours,count_nas,exp_new_col,last_new_col,city_new,company_nas
0,0.689,100-500,Pvt Ltd,Graduate,no_enrollment,23798,3.0,Male,1,STEM,Has relevent experience,0.0,106,0,0,0,0.12381,0
1,0.923,<10,Funded Startup,Graduate,no_enrollment,29166,14.0,Male,1,STEM,Has relevent experience,0.0,69,0,0,0,0.098592,0
2,0.91,50-99,Public Sector,Graduate,no_enrollment,46,6.0,Male,2,STEM,Has relevent experience,0.0,4,1,0,0,0.093108,0
3,0.666,50-99,Pvt Ltd,Graduate,no_enrollment,18527,14.0,Male,1,STEM,Has relevent experience,0.0,26,0,0,0,0.10084,0
4,0.887,unk,unk,Masters,no_enrollment,21751,8.0,Male,2,STEM,No relevent experience,1.0,88,3,0,0,0.162362,2


In [41]:
#Encoding ordinal variables company_size and education_level
# Series.map turns any value missing from the dictionary into NaN.

# The 'unk' placeholder gets its own code after the largest size bucket.
company_size_codes={
    '<10':0,
    '10/49':1,
    '50-99':2,
    '100-500':3,
    '500-999':4,
    '1000-4999':5,
    '5000-9999':6,
    '10000+':7,
    'unk':8,
}
education_level_codes={
    'High School':1,
    'Graduate':2,
    'Masters':3,
    'Phd':4,
}

tot_data['company_size']=tot_data['company_size'].map(company_size_codes)
tot_data['education_level']=tot_data['education_level'].map(education_level_codes)


In [42]:
# Preview the first five rows of the combined frame.
tot_data.head(5)

Unnamed: 0,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,gender,last_new_job,major_discipline,relevent_experience,target,training_hours,count_nas,exp_new_col,last_new_col,city_new,company_nas
0,0.689,3,Pvt Ltd,2,no_enrollment,23798,3.0,Male,1,STEM,Has relevent experience,0.0,106,0,0,0,0.12381,0
1,0.923,0,Funded Startup,2,no_enrollment,29166,14.0,Male,1,STEM,Has relevent experience,0.0,69,0,0,0,0.098592,0
2,0.91,2,Public Sector,2,no_enrollment,46,6.0,Male,2,STEM,Has relevent experience,0.0,4,1,0,0,0.093108,0
3,0.666,2,Pvt Ltd,2,no_enrollment,18527,14.0,Male,1,STEM,Has relevent experience,0.0,26,0,0,0,0.10084,0
4,0.887,8,unk,3,no_enrollment,21751,8.0,Male,2,STEM,No relevent experience,1.0,88,3,0,0,0.162362,2


In [43]:
#Label Encoding for all nominal variables
#
# Each column gets its own fitted LabelEncoder (enc1..enc8, there is no enc5)
# so the integer codes can be mapped back with inverse_transform if needed.

enc1=LabelEncoder()
tot_data['gender']=enc1.fit_transform(tot_data['gender'])

enc2=LabelEncoder()
tot_data['relevent_experience']=enc2.fit_transform(tot_data['relevent_experience'])

enc3=LabelEncoder()
tot_data['enrolled_university']=enc3.fit_transform(tot_data['enrolled_university'])

enc4=LabelEncoder()
tot_data['major_discipline']=enc4.fit_transform(tot_data['major_discipline'])

enc6=LabelEncoder()
tot_data['company_type']=enc6.fit_transform(tot_data['company_type'])

enc7=LabelEncoder()
tot_data['exp_new_col']=enc7.fit_transform(tot_data['exp_new_col'])

# Previously this encoder was applied to 'exp_new_col' a second time
# (copy-paste slip), which left 'last_new_col' unencoded.
enc8=LabelEncoder()
tot_data['last_new_col']=enc8.fit_transform(tot_data['last_new_col'])



In [44]:
# Preview the first five rows of the combined frame.
tot_data.head(5)

Unnamed: 0,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,gender,last_new_job,major_discipline,relevent_experience,target,training_hours,count_nas,exp_new_col,last_new_col,city_new,company_nas
0,0.689,3,5,2,2,23798,3.0,1,1,5,0,0.0,106,0,0,0,0.12381,0
1,0.923,0,1,2,2,29166,14.0,1,1,5,0,0.0,69,0,0,0,0.098592,0
2,0.91,2,4,2,2,46,6.0,1,2,5,0,0.0,4,1,0,0,0.093108,0
3,0.666,2,5,2,2,18527,14.0,1,1,5,0,0.0,26,0,0,0,0.10084,0
4,0.887,8,6,3,2,21751,8.0,1,2,5,1,1.0,88,3,0,0,0.162362,2


In [45]:
#Total Label Encoded data to CSV (train and test rows together, index not written)

label_encoded_path="final_label_encoding.csv"
tot_data.to_csv(label_encoded_path,index=False)

In [155]:
#One hot of categorical variables
# NOTE(review): education_level is one-hot encoded here although it was given
# ordinal codes earlier - confirm that is intended.

one_hot=[
    'company_type',
    'education_level',
    'enrolled_university',
    'gender',
    'major_discipline',
    'relevent_experience',
]
tot_data=pd.get_dummies(tot_data,columns=one_hot)

#One hot encoding for nominal variables and Label Encoding for ordinal variables data to CSV
one_hot_label_path="one_hot+label.csv"
tot_data.to_csv(one_hot_label_path,index=False)

In [160]:
#Label encoded variables to One Hot encoding
# Expands the remaining integer-coded columns into indicator columns and
# writes the fully one-hot encoded frame to disk.

label_encoded_cols=['company_size','last_new_job','count_nas','company_nas']
tot_data=pd.get_dummies(tot_data,columns=label_encoded_cols)
tot_data.to_csv("final_one_hot.csv",index=False)