In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [25]:
import numpy as np

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

In [27]:
import pandas as pd

In [28]:
# Load the training split from disk (path is relative to the notebook's working directory).
strat_train_set = pd.read_csv(filepath_or_buffer="Data/train.csv")

In [29]:
# Preview the first ten rows of the raw training frame.
strat_train_set.head(n=10)

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8253,20667,city_21,0.624,,Has relevent experience,no_enrollment,Masters,STEM,7,<10,Early Stage Startup,1,40,0.0
1,10067,4093,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,19,100-500,Pvt Ltd,2,48,0.0
2,1851,24146,city_21,0.624,,Has relevent experience,no_enrollment,Graduate,STEM,5,,,1,135,0.0
3,4139,28323,city_19,0.682,Male,No relevent experience,Full time course,Graduate,STEM,2,,,never,45,1.0
4,2473,16720,city_21,0.624,Male,No relevent experience,Full time course,High School,,10,,,never,68,0.0
5,4592,12719,city_100,0.887,Male,Has relevent experience,no_enrollment,High School,,4,<10,Pvt Ltd,1,45,0.0
6,18339,3143,city_16,0.91,Male,Has relevent experience,no_enrollment,Masters,STEM,17,50-99,Pvt Ltd,3,33,1.0
7,1922,31830,city_61,0.913,Male,Has relevent experience,no_enrollment,Graduate,Other,10,5000-9999,Public Sector,1,7,0.0
8,19040,6023,city_100,0.887,Male,Has relevent experience,no_enrollment,Masters,STEM,15,,Pvt Ltd,>4,250,0.0
9,12428,12597,city_28,0.939,Male,No relevent experience,no_enrollment,Graduate,Humanities,12,50-99,Public Sector,>4,22,0.0


In [30]:
# Separate the predictors from the label column; strat_train_set itself is left unmodified.
job_change = strat_train_set.drop(columns="target")
job_change_labels = strat_train_set.loc[:, "target"].copy()

In [31]:
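# TODO: unfinished stub. Presumably meant to wrap the ordinal-to-numeric
# conversions in a scikit-learn transformer so they can run inside a Pipeline;
# complete it or remove this cell.
# class TransformOrdinalToNumerical(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         return
#     def 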

In [32]:
def relevent_experience_to_numeric(experience):
    """Encode the ``relevent_experience`` label as a binary flag.

    Returns ``None`` for a missing (NaN) value, ``1`` for the exact label
    "Has relevent experience" and ``0`` for any other value.
    """
    # NaN is the only value that compares unequal to itself.
    is_missing = experience != experience
    if is_missing:
        return None
    return 1 if experience == "Has relevent experience" else 0

def enrolled_university_to_numeric(x):
    """Encode the ``enrolled_university`` label as an ordinal code.

    'no_enrollment' -> 0, 'Part time course' -> 1, 'Full time course' -> 2.
    Returns ``None`` for a missing (NaN) value or an unrecognised label.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return None
    # Position in this tuple is the ordinal code.
    enrollment_order = ('no_enrollment', 'Part time course', 'Full time course')
    for code, label in enumerate(enrollment_order):
        if x == label:
            return code
    return None

def education_level_to_numeric(x):
    """Encode the ``education_level`` label as an ordinal rank from 1 to 5.

    "Primary School" -> 1, "High School" -> 2, "Graduate" -> 3,
    "Masters" -> 4, "Phd" -> 5. Returns ``None`` for a missing (NaN) value
    or an unrecognised label.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return None
    ordered_levels = ("Primary School", "High School", "Graduate", "Masters", "Phd")
    # Ranks start at 1, so the lowest level is 1 rather than 0.
    for rank, level in enumerate(ordered_levels, start=1):
        if x == level:
            return rank
    return None

def experience_to_numeric(x):
    """Convert the ``experience`` label to an integer.

    The open-ended buckets map to sentinel values ('>20' -> 21, '<1' -> 0);
    every other value is passed to ``int``. Returns ``None`` for a missing
    (NaN) value.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return None
    # Open-ended buckets that cannot be parsed by int().
    for bucket, years in (('>20', 21), ('<1', 0)):
        if x == bucket:
            return years
    return int(x)

def company_size_to_numeric(x):
    """Encode the ``company_size`` bucket as an ordinal code from 0 to 7.

    Buckets are ordered from smallest ('<10' -> 0) to largest ('10000+' -> 7).
    Returns ``None`` for a missing (NaN) value or an unrecognised bucket.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return None
    # Position in this tuple is the ordinal code; '10/49' is spelled with a
    # slash on purpose, exactly as the original mapping compares it.
    size_buckets = (
        '<10',
        '10/49',
        '50-99',
        '100-500',
        '500-999',
        '1000-4999',
        '5000-9999',
        '10000+',
    )
    for code, bucket in enumerate(size_buckets):
        if x == bucket:
            return code
    return None

def last_new_job_to_numeric(x):
    """Convert the ``last_new_job`` label to an integer.

    '>4' -> 5 and 'never' -> 0; every other value is passed to ``int``.
    Returns ``None`` for a missing (NaN) value.
    """
    # NaN is the only value that compares unequal to itself.
    if x != x:
        return None
    # Non-numeric labels that cannot be parsed by int().
    for label, code in (('>4', 5), ('never', 0)):
        if x == label:
            return code
    return int(x)

In [33]:
# Encode the ordinal text columns as numbers.
# The raw strings are read from strat_train_set, which is never modified, rather
# than from job_change itself. That keeps this cell idempotent: re-applying the
# converters to already-encoded values would corrupt them (for example every
# relevent_experience value would become 0, and enrolled_university would become
# all missing).
job_change['relevent_experience'] = strat_train_set['relevent_experience'].apply(relevent_experience_to_numeric)
job_change['enrolled_university'] = strat_train_set['enrolled_university'].apply(enrolled_university_to_numeric)
job_change['education_level'] = strat_train_set['education_level'].apply(education_level_to_numeric)
job_change['experience'] = strat_train_set['experience'].apply(experience_to_numeric)
job_change['company_size'] = strat_train_set['company_size'].apply(company_size_to_numeric)
job_change['last_new_job'] = strat_train_set['last_new_job'].apply(last_new_job_to_numeric)

In [34]:
# Check the encoded ordinal columns on the first five rows.
job_change.head(n=5)

Unnamed: 0.1,Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,8253,20667,city_21,0.624,,1,0.0,4.0,STEM,7.0,0.0,Early Stage Startup,1.0,40
1,10067,4093,city_103,0.92,Male,1,0.0,3.0,STEM,19.0,3.0,Pvt Ltd,2.0,48
2,1851,24146,city_21,0.624,,1,0.0,3.0,STEM,5.0,,,1.0,135
3,4139,28323,city_19,0.682,Male,0,2.0,3.0,STEM,2.0,,,0.0,45
4,2473,16720,city_21,0.624,Male,0,2.0,2.0,,10.0,,,0.0,68


In [35]:
# Column groups, one per preprocessing branch.
# Continuous numeric features.
num_attribs = [
    'city_development_index',
    'training_hours',
]
# Ordinal categories, already converted to numeric codes.
ord_cat_attribs = [
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]
# Nominal (unordered) categories, still stored as text.
nom_cat_attribs = [
    'city',
    'gender',
    'major_discipline',
    'company_type',
]

In [40]:
# Sanity check: list the distinct codes (including NaN) present in each ordinal column.
for attribs in ord_cat_attribs:
    print(job_change.loc[:, attribs].unique())

[1 0]
[ 0.  2. nan  1.]
[ 4.  3.  2. nan  5.  1.]
[ 7. 19.  5.  2. 10.  4. 17. 15. 12.  8. 21.  6. 13. 11. 16. 14. 20.  3.
  1.  9. 18.  0. nan]
[ 0.  3. nan  2.  6.  1.  7.  5.  4.]
[ 1.  2.  0.  3.  5.  4. nan]


In [36]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

In [37]:
# Fit the numeric branch on its own columns and keep the transformed result.
job_change_num = num_pipeline.fit_transform(job_change.loc[:, num_attribs])

In [41]:
# Ordinal branch: fill missing codes with the most frequent code, then standardise.
ord_pipeline = Pipeline(
    steps=[
        ("ord_imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ("ord_std_scaler", StandardScaler()),
    ]
)

In [42]:
# Fit the ordinal branch on its own columns and keep the transformed result.
job_change_ord = ord_pipeline.fit_transform(job_change.loc[:, ord_cat_attribs])

In [57]:
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that turns a sparse matrix into a dense array.

    It sits between ``OneHotEncoder`` (sparse output) and ``StandardScaler``
    in the nominal pipeline. Inheriting from ``BaseEstimator`` as well as
    ``TransformerMixin`` gives the step ``get_params``/``set_params``, so it
    follows the scikit-learn estimator contract like the other steps.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the step has no state to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray`` (for example SciPy sparse matrices) are
        densified with it; anything else is passed through ``np.asarray``, so
        already-dense input no longer raises ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [59]:
# Nominal branch: fill gaps with a constant placeholder category, one-hot
# encode, densify, then standardise.
nom_pipeline = Pipeline([
    # With strategy="constant" and no fill_value, SimpleImputer fills string
    # columns with the placeholder "missing_value".
    ('nom_imputer', SimpleImputer(strategy="constant")),
    # handle_unknown="ignore": a category never seen during fit (for example a
    # new city in later data) is encoded as all zeros instead of raising.
    ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore")),
    ('to_array', DenseTransformer()),
    ('nom_std_scaler', StandardScaler())
])

In [60]:
# Fit the nominal branch on its own columns and keep the transformed result.
job_change_nom = nom_pipeline.fit_transform(job_change.loc[:, nom_cat_attribs])

In [61]:
# Combine the three branches; each entry is (name, transformer, columns it receives).
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("ord", ord_pipeline, ord_cat_attribs),
        ("nom", nom_pipeline, nom_cat_attribs),
    ]
)

In [62]:
# Fit every branch on the training features and build the final feature matrix.
job_change_prepared = full_pipeline.fit_transform(X=job_change)

In [63]:
# Display the preprocessed feature matrix returned by full_pipeline.fit_transform.
job_change_prepared

array([[-1.66129974, -0.42229418,  0.62190287, ..., -0.22697421,
        -1.02890375, -0.68855816],
       [ 0.74080339, -0.28876006,  0.62190287, ..., -0.22697421,
         0.97190821, -0.68855816],
       [-1.66129974,  1.16342347,  0.62190287, ..., -0.22697421,
        -1.02890375,  1.45231014],
       ...,
       [ 0.52169263, -0.13853418,  0.62190287, ..., -0.22697421,
        -1.02890375, -0.68855816],
       [-1.66129974, -0.82289653, -1.60796813, ..., -0.22697421,
        -1.02890375,  1.45231014],
       [ 0.30258187, -0.25537653,  0.62190287, ..., -0.22697421,
         0.97190821, -0.68855816]])