In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [27]:
import numpy as np

In [28]:
from sklearn.base import TransformerMixin

In [29]:
import pandas as pd

In [30]:
strat_train_set = pd.read_csv("Data/train.csv")

In [31]:
strat_train_set.head(10)

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8253,20667,city_21,0.624,,Has relevent experience,no_enrollment,Masters,STEM,7,<10,Early Stage Startup,1,40,0.0
1,10067,4093,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,19,100-500,Pvt Ltd,2,48,0.0
2,1851,24146,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,5,,,1,135,0.0
3,4139,28323,city_19,0.682,Male,No relevent experience,Full time course,Graduate,STEM,2,,,never,45,1.0
4,2473,16720,city_21,0.624,Male,No relevent experience,Full time course,High School,,10,,,never,68,0.0
5,4592,12719,city_100,0.887,Male,Has relevent experience,no_enrollment,High School,,4,<10,Pvt Ltd,1,45,0.0
6,18339,3143,city_16,0.91,Male,Has relevent experience,no_enrollment,Masters,STEM,17,50-99,Pvt Ltd,3,33,1.0
7,1922,31830,city_61,0.913,Male,Has relevent experience,no_enrollment,Graduate,Other,10,5000-9999,Public Sector,1,7,0.0
8,19040,6023,city_100,0.887,Male,Has relevent experience,no_enrollment,Masters,STEM,15,,Pvt Ltd,>4,250,0.0
9,12428,12597,city_28,0.939,Male,No relevent experience,no_enrollment,Graduate,Humanities,12,50-99,Public Sector,>4,22,0.0


In [32]:
job_change = strat_train_set.drop("target", axis=1)
job_change_labels = strat_train_set["target"].copy()

In [33]:
job_change.head()

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,8253,20667,city_21,0.624,,Has relevent experience,no_enrollment,Masters,STEM,7,<10,Early Stage Startup,1,40
1,10067,4093,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,19,100-500,Pvt Ltd,2,48
2,1851,24146,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,5,,,1,135
3,4139,28323,city_19,0.682,Male,No relevent experience,Full time course,Graduate,STEM,2,,,never,45
4,2473,16720,city_21,0.624,Male,No relevent experience,Full time course,High School,,10,,,never,68


In [34]:
num_attribs = ['city_development_index', 'training_hours']
ord_cat_attribs = ['relevent_experience', 'enrolled_university', 'education_level', 'experience', 'company_size', 'last_new_job']
nom_cat_attribs = ['city', 'gender', 'major_discipline', 'company_type']

In [35]:
for attribs in ord_cat_attribs:
    print(job_change[attribs].unique())

['Has relevent experience' 'No relevent experience']
['no_enrollment' 'Full time course' nan 'Part time course']
['Masters' 'Graduate' 'High School' nan 'Phd' 'Primary School']
['7' '19' '5' '2' '10' '4' '17' '15' '12' '8' '>20' '6' '13' '11' '16'
 '14' '20' '3' '1' '9' '18' '<1' nan]
['<10' '100-500' nan '50-99' '5000-9999' '10/49' '10000+' '1000-4999'
 '500-999']
['1' '2' 'never' '3' '>4' '4' nan]


In [36]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy= "median")),
    ('std_scaler', StandardScaler())
])

In [37]:
class OrdToNumeric(TransformerMixin):
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X['relevent_experience'] = X['relevent_experience'].apply(self.relevent_experience_to_numeric)
        X['enrolled_university'] = X['enrolled_university'].apply(self.enrolled_university_to_numeric)
        X['education_level'] = X['education_level'].apply(self.education_level_to_numeric)
        X['experience'] = X['experience'].apply(self.experience_to_numeric)
        X['company_size'] = X['company_size'].apply(self.company_size_to_numeric)
        X['last_new_job'] = X['last_new_job'].apply(self.last_new_job_to_numeric)
        return X

    def relevent_experience_to_numeric(self, experience):
        if experience != experience:
            return None
        if experience == "Has relevent experience":
            return 1
        return 0

    def enrolled_university_to_numeric(self, x):
        if x != x:
            return None
        if x == 'no_enrollment':
            return 0
        if x == 'Part time course':
            return 1
        if x == 'Full time course':
            return 2

    def education_level_to_numeric(self, x):
        if x != x:
            return None
        if x == "Primary School":
            return 1
        if x == "High School":
            return 2
        if x == "Graduate":
            return 3
        if x == "Masters":
            return 4
        if x == "Phd":
            return 5

    def experience_to_numeric(self, x):
        if x != x:
            return None
        if x == '>20':
            return 21
        elif x == '<1':
            return 0
        else:
            return int(x)

    def company_size_to_numeric(self, x):
        if x != x:
            return None
        if x == '<10' :
            return 0
        if x == '10/49' :
            return 1
        if x == '50-99' :
            return 2
        if x == '100-500' :
            return 3
        if x == '500-999' :
            return 4
        if x == '1000-4999' :
            return 5
        if x == '5000-9999':
            return 6
        if x == '10000+':
            return 7

    def last_new_job_to_numeric(self, x):
        if x != x:
            return None
        if x == '>4' :
            return 5
        if x == 'never' :
            return 0
        else:
            return int(x)

In [38]:
ord_pipeline = Pipeline([
    ('ord_to_numeric', OrdToNumeric()),
    ('ord_imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('ord_std_scaler', StandardScaler())
])

In [39]:
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.toarray()

In [40]:
nom_pipeline = Pipeline([
    ('nom_imputer', SimpleImputer(strategy="constant")),
    ('one_hot_encoder', OneHotEncoder()),
    ('to_array', DenseTransformer()),
    ('nom_std_scaler', StandardScaler())
])

In [41]:
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("ord", ord_pipeline, ord_cat_attribs),
    ("nom", nom_pipeline, nom_cat_attribs)
])

In [42]:
job_change_prepared = full_pipeline.fit_transform(job_change)

In [43]:
job_change_prepared

array([[-1.66129974, -0.42229418,  0.62190287, ..., -0.22697421,
        -1.02890375, -0.68855816],
       [ 0.74080339, -0.28876006,  0.62190287, ..., -0.22697421,
         0.97190821, -0.68855816],
       [-1.66129974,  1.16342347,  0.62190287, ..., -0.22697421,
        -1.02890375,  1.45231014],
       ...,
       [ 0.52169263, -0.13853418,  0.62190287, ..., -0.22697421,
        -1.02890375, -0.68855816],
       [-1.66129974, -0.82289653, -1.60796813, ..., -0.22697421,
        -1.02890375,  1.45231014],
       [ 0.30258187, -0.25537653,  0.62190287, ..., -0.22697421,
         0.97190821, -0.68855816]])

In [44]:
job_change_prepared[0]

array([-1.66129974, -0.42229418,  0.62190287, -0.56997915,  1.26429189,
       -0.46703132, -1.50308819, -0.58755418, -0.03666178, -0.06704015,
       -0.11988396, -0.06055898, -0.12855025, -0.53521429, -0.12366795,
       -0.06590841, -0.02115721, -0.0172735 , -0.02115721, -0.11565315,
       -0.27908355, -0.05188246, -0.08219949, -0.02865234, -0.03562751,
       -0.02992751, -0.01931308, -0.0122133 , -0.0546971 , -0.03960368,
       -0.02591509, -0.07243878, -0.01495873, -0.04890794, -0.02443207,
       -0.02115721, -0.04491642, -0.18069462, -0.07891741, -0.01495873,
       -0.03346372, -0.03562751, -0.04966811, -0.04574235, -0.0376678 ,
       -0.05260008, -0.0172735 , -0.07698076, -0.05930577, -0.05115489,
       -0.02731792, -0.03346372, -0.0546971 , -0.07449087, -0.29580913,
       -0.21452702, -0.08401891, -0.06815343, -0.01495873, -0.02115721,
       -0.00863578, -0.08968444, -0.02285325, -0.03562751, -0.01495873,
       -0.00863578, -0.01931308, -0.08080879, -0.02115721, -0.03

# Process test set

In [46]:
test_set = pd.read_csv("Data/test.csv")

In [47]:
test_set.head()

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,16144,6992,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,500-999,Pvt Ltd,1,21,0.0
1,13661,8637,city_103,0.92,Female,Has relevent experience,no_enrollment,Masters,Humanities,>20,100-500,Funded Startup,2,74,0.0
2,344,24729,city_104,0.924,,Has relevent experience,no_enrollment,Graduate,STEM,9,10/49,Pvt Ltd,1,94,0.0
3,1034,10933,city_21,0.624,Male,Has relevent experience,no_enrollment,Masters,STEM,15,10000+,Pvt Ltd,1,75,0.0
4,8109,28023,city_134,0.698,Male,No relevent experience,no_enrollment,Masters,STEM,12,500-999,NGO,1,157,0.0


In [51]:
job_change_test = test_set.drop("target", axis=1)
job_change_test_labels = test_set["target"].copy()

In [52]:
job_change_test_prepared = full_pipeline.fit_transform(job_change_test)

In [53]:
job_change_test_prepared

array([[ 0.65368449, -0.73717176,  0.62806744, ..., -0.2338352 ,
         0.98377804, -0.68260718],
       [ 0.73454606,  0.14033466,  0.62806744, ..., -0.2338352 ,
        -1.01648945, -0.68260718],
       [ 0.76689069,  0.47146916,  0.62806744, ..., -0.2338352 ,
         0.98377804, -0.68260718],
       ...,
       [ 0.65368449, -0.05834604,  0.62806744, ..., -0.2338352 ,
        -1.01648945,  1.4649714 ],
       [-0.04172502, -0.62127468,  0.62806744, ..., -0.2338352 ,
         0.98377804, -0.68260718],
       [-1.06058081, -0.53849106,  0.62806744, ..., -0.2338352 ,
         0.98377804, -0.68260718]])

In [54]:
np.savetxt("Data/X_test.csv", job_change_test_prepared, delimiter=',')

In [55]:
np.savetxt("Data/y_test.csv", job_change_test_labels.to_numpy(), delimiter=',')