In [53]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [54]:
import numpy as np

In [55]:
from sklearn.base import TransformerMixin

In [56]:
import pandas as pd

In [57]:
# Read the training data from Data/train.csv into a DataFrame.
strat_train_set = pd.read_csv("Data/train.csv")

In [58]:
strat_train_set.head(10)

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,6578,22505,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,8,100-500,Pvt Ltd,2,47,1.0
1,2189,14580,city_28,0.939,Other,No relevent experience,Full time course,High School,,3,100-500,Pvt Ltd,3,17,0.0
2,575,24981,city_45,0.89,,Has relevent experience,no_enrollment,Graduate,STEM,9,50-99,Pvt Ltd,,62,0.0
3,16167,495,city_103,0.92,Male,No relevent experience,no_enrollment,Phd,STEM,15,5000-9999,Pvt Ltd,1,31,0.0
4,11764,4883,city_103,0.92,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,16,0.0
5,16604,2833,city_103,0.92,Male,Has relevent experience,Full time course,Masters,STEM,11,10000+,Pvt Ltd,2,34,0.0
6,18490,30204,city_103,0.92,Female,No relevent experience,Full time course,Graduate,STEM,4,,,1,52,1.0
7,12349,21007,city_103,0.92,,Has relevent experience,no_enrollment,Graduate,STEM,9,100-500,Pvt Ltd,1,135,0.0
8,16504,4760,city_103,0.92,Male,No relevent experience,Full time course,High School,,8,,,1,118,1.0
9,4968,27737,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,No Major,>20,,,>4,110,1.0


In [59]:
# Separate the predictors from the label: `job_change` keeps every column
# except "target", `job_change_labels` is an independent copy of "target".
job_change = strat_train_set.drop(columns=["target"])
job_change_labels = strat_train_set["target"].copy()

In [60]:
job_change.head()

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,6578,22505,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,8,100-500,Pvt Ltd,2,47
1,2189,14580,city_28,0.939,Other,No relevent experience,Full time course,High School,,3,100-500,Pvt Ltd,3,17
2,575,24981,city_45,0.89,,Has relevent experience,no_enrollment,Graduate,STEM,9,50-99,Pvt Ltd,,62
3,16167,495,city_103,0.92,Male,No relevent experience,no_enrollment,Phd,STEM,15,5000-9999,Pvt Ltd,1,31
4,11764,4883,city_103,0.92,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,16


In [61]:
# Column groups; each group is routed to its own preprocessing pipeline.

# Numeric columns.
num_attribs = [
    "city_development_index",
    "training_hours",
]

# Categorical columns whose categories have a natural order.
ord_cat_attribs = [
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "experience",
    "company_size",
    "last_new_job",
]

# Categorical columns without an order.
nom_cat_attribs = [
    "city",
    "gender",
    "major_discipline",
    "company_type",
]

In [62]:
# Print the distinct raw values of each ordinal column (NaN included) so the
# category-to-number mappings can be checked against the data.
for ordinal_column in ord_cat_attribs:
    distinct_values = job_change[ordinal_column].unique()
    print(distinct_values)

['Has relevent experience' 'No relevent experience']
['no_enrollment' 'Full time course' 'Part time course' nan]
['Graduate' 'High School' 'Phd' 'Primary School' 'Masters' nan]
['8' '3' '9' '15' '11' '4' '>20' '19' '2' '6' '17' '1' '10' '7' '16' '13'
 '14' '5' '12' nan '<1' '18' '20']
['100-500' '50-99' '5000-9999' nan '10000+' '<10' '10/49' '1000-4999'
 '500-999']
['2' '3' nan '1' 'never' '>4' '4']


In [63]:
# Numeric features: replace NaN with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

In [64]:
class OrdToNumeric(TransformerMixin):
    """Encode the ordinal categorical columns as ordered numeric codes.

    Stateless transformer: ``fit`` learns nothing, and ``transform`` replaces
    each of the six ordinal columns (``relevent_experience``,
    ``enrolled_university``, ``education_level``, ``experience``,
    ``company_size``, ``last_new_job``) with a number that preserves the order
    of its categories. Missing entries (NaN) are encoded as ``None`` so that a
    later imputation step can fill them in.
    """

    # Category -> code lookup tables, listed in increasing order.
    _ENROLLED_UNIVERSITY_CODES = {
        'no_enrollment': 0,
        'Part time course': 1,
        'Full time course': 2,
    }
    _EDUCATION_LEVEL_CODES = {
        "Primary School": 1,
        "High School": 2,
        "Graduate": 3,
        "Masters": 4,
        "Phd": 5,
    }
    _COMPANY_SIZE_CODES = {
        '<10': 0,
        '10/49': 1,
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    }
    # Open-ended buckets that cannot be parsed with int().
    _EXPERIENCE_SPECIAL_CODES = {'>20': 21, '<1': 0}
    _LAST_NEW_JOB_SPECIAL_CODES = {'>4': 5, 'never': 0}

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ordinal columns encoded as numbers.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the six ordinal columns named in the class docstring.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` itself is left untouched.

        Raises
        ------
        ValueError
            If ``experience`` or ``last_new_job`` holds a string that is
            neither a known bucket nor an integer literal.
        """
        column_encoders = {
            'relevent_experience': self.relevent_experience_to_numeric,
            'enrolled_university': self.enrolled_university_to_numeric,
            'education_level': self.education_level_to_numeric,
            'experience': self.experience_to_numeric,
            'company_size': self.company_size_to_numeric,
            'last_new_job': self.last_new_job_to_numeric,
        }
        # Encode into a copy. Assigning into X would modify the caller's
        # frame, and transforming that frame a second time would then see the
        # already-encoded numbers and mis-encode them (for example every
        # relevent_experience value would become 0).
        encoded = X.copy()
        for column, encode in column_encoders.items():
            encoded[column] = encoded[column].apply(encode)
        return encoded

    def relevent_experience_to_numeric(self, experience):
        """Return 1 for "Has relevent experience", 0 otherwise, None for NaN."""
        if experience != experience:  # NaN is the only value unequal to itself
            return None
        return 1 if experience == "Has relevent experience" else 0

    def enrolled_university_to_numeric(self, x):
        """Return 0/1/2 for none/part-time/full-time; None for NaN or unknown."""
        if x != x:  # NaN
            return None
        return self._ENROLLED_UNIVERSITY_CODES.get(x)

    def education_level_to_numeric(self, x):
        """Return 1 (Primary School) .. 5 (Phd); None for NaN or unknown."""
        if x != x:  # NaN
            return None
        return self._EDUCATION_LEVEL_CODES.get(x)

    def experience_to_numeric(self, x):
        """Return the experience value as an int ('<1' -> 0, '>20' -> 21); None for NaN."""
        if x != x:  # NaN
            return None
        if x in self._EXPERIENCE_SPECIAL_CODES:
            return self._EXPERIENCE_SPECIAL_CODES[x]
        return int(x)

    def company_size_to_numeric(self, x):
        """Return 0 ('<10') .. 7 ('10000+'); None for NaN or unknown."""
        if x != x:  # NaN
            return None
        return self._COMPANY_SIZE_CODES.get(x)

    def last_new_job_to_numeric(self, x):
        """Return the value as an int ('never' -> 0, '>4' -> 5); None for NaN."""
        if x != x:  # NaN
            return None
        if x in self._LAST_NEW_JOB_SPECIAL_CODES:
            return self._LAST_NEW_JOB_SPECIAL_CODES[x]
        return int(x)

In [65]:
# Ordinal features: map categories to ordered numbers, fill missing values
# with the most frequent value of each column, then standardise.
ord_pipeline = Pipeline(
    steps=[
        ("ord_to_numeric", OrdToNumeric()),
        ("ord_imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ("ord_std_scaler", StandardScaler()),
    ]
)

In [66]:
class DenseTransformer(TransformerMixin):
    """Turn the output of a previous step into a dense array.

    Stateless: ``fit`` learns nothing. ``transform`` densifies inputs that
    expose ``toarray()`` (e.g. SciPy sparse matrices) and passes inputs that
    are already dense through as an array instead of failing.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array.

        Parameters
        ----------
        X : sparse matrix or array-like
        y : ignored
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already dense (e.g. the encoder was configured for dense output):
        # calling .toarray() here would raise AttributeError.
        return np.asarray(X)

In [67]:
# Nominal (unordered) categorical features: fill missing values with a
# constant placeholder, one-hot encode, densify, then standardise each
# indicator column.
nom_pipeline = Pipeline([
    ('nom_imputer', SimpleImputer(strategy="constant")),
    # handle_unknown="ignore": a category that was not present when the
    # encoder was fitted (e.g. a city that only appears in later data) is
    # encoded as all zeros instead of raising during transform. Categories
    # seen during fit are encoded exactly as before.
    ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore")),
    ('to_array', DenseTransformer()),
    ('nom_std_scaler', StandardScaler())
])

In [68]:
# Route each column group to its own pipeline and concatenate the results in
# the order listed: numeric, ordinal, nominal.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("ord", ord_pipeline, ord_cat_attribs),
        ("nom", nom_pipeline, nom_cat_attribs),
    ]
)

In [69]:
# Fit every sub-pipeline on the training features and transform them in one
# call; the fitted `full_pipeline` is reused below to transform the test set.
job_change_prepared = full_pipeline.fit_transform(job_change)

In [70]:
job_change_prepared

array([[-1.65340347, -0.30513529,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.89589291, -0.804638  , -1.60333219, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.4993357 , -0.05538393,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       ...,
       [ 0.74212583,  2.54203017,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.74212583, -0.93783872,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.74212583, -0.95448881,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073]])

In [71]:
job_change_prepared.shape

(11494, 149)

In [72]:
job_change_prepared[0]

array([-1.65340347, -0.30513529,  0.62370107, -0.56939694, -0.19797429,
       -0.32181227,  0.06228766,  0.01453306, -0.03614878, -0.07059621,
       -0.12506073, -0.06543199, -0.12252495, -0.53987098, -0.12541906,
       -0.06338903, -0.02799341, -0.00932789, -0.02468571, -0.11342708,
       -0.0131922 , -0.27479502, -0.05526633, -0.07883863, -0.02799341,
       -0.03733592, -0.02639129, -0.01865821, -0.01615778, -0.06199028,
       -0.03848666, -0.02086142, -0.06675982, -0.0131922 , -0.05283781,
       -0.02285354, -0.02285354, -0.05029351, -0.17903997, -0.07714492,
       -0.01615778, -0.03364975, -0.00932789, -0.03960418, -0.04941662,
       -0.0520034 , -0.03614878, -0.05446872, -0.01865821, -0.07423832,
       -0.05526633, -0.05115552, -0.02950893, -0.03364975, -0.05365939,
       -0.07364355, -0.29181948, -0.20944027, -0.08476685, -0.06199028,
       -0.01615778, -0.02468571, -0.00932789, -0.09177434, -0.02950893,
       -0.03848666, -0.01615778, -0.01615778, -0.01615778, -0.07

In [73]:
# Write the prepared training features and their labels as comma-separated text.
np.savetxt(fname="Data/X_train.csv", X=job_change_prepared, delimiter=",")
np.savetxt(fname="Data/y_train.csv", X=job_change_labels.to_numpy(), delimiter=",")

# Process test set

In [74]:
# Read the test data from Data/test.csv into a DataFrame.
test_set = pd.read_csv("Data/test.csv")

In [75]:
test_set.head()

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,14215,6188,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,12,500-999,Pvt Ltd,1,72,0.0
1,5963,2387,city_67,0.855,Male,Has relevent experience,,Graduate,STEM,14,50-99,Pvt Ltd,2,27,0.0
2,15724,12279,city_160,0.92,Male,Has relevent experience,no_enrollment,Masters,STEM,3,50-99,,2,28,0.0
3,4423,29267,city_100,0.887,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,90,0.0
4,13675,7332,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,4,,,1,14,0.0


In [76]:
# Separate the test predictors from the label, mirroring the training split:
# every column except "target" versus an independent copy of "target".
job_change_test = test_set.drop(columns=["target"])
job_change_test_labels = test_set["target"].copy()

In [77]:
# Apply the already-fitted preprocessing to the test features. Only
# `transform` is called here (no refit), so the parameters fitted on the
# training features are reused.
job_change_test_prepared = full_pipeline.transform(job_change_test)

In [78]:
job_change_test_prepared.shape

(7664, 149)

In [79]:
job_change_test_prepared

array([[ 0.74212583,  0.11111697,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.21608054, -0.6381371 ,  0.62370107, ..., -0.23115431,
         0.96125128, -0.68064073],
       [ 0.74212583, -0.621487  ,  0.62370107, ..., -0.23115431,
        -1.04031071,  1.46920388],
       ...,
       [-1.65340347, -0.30513529,  0.62370107, ..., -0.23115431,
        -1.04031071, -0.68064073],
       [ 0.74212583,  1.80942619, -1.60333219, ..., -0.23115431,
        -1.04031071,  1.46920388],
       [ 0.74212583, -0.65478719, -1.60333219, ..., -0.23115431,
        -1.04031071, -0.68064073]])

In [80]:
# Write the prepared test features as comma-separated text.
np.savetxt(fname="Data/X_test.csv", X=job_change_test_prepared, delimiter=",")

In [81]:
# Write the test labels as comma-separated text.
np.savetxt(fname="Data/y_test.csv", X=job_change_test_labels.to_numpy(), delimiter=",")