In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Disabled alternative inputs; the next cell loads the strat_* files instead.
# train_set = pd.read_csv("Data/train.csv")
# cv_set = pd.read_csv("Data/cv.csv")
# test_set = pd.read_csv("Data/test.csv")

In [4]:
# Read the train / CV / test splits from their CSV files.
train_set, cv_set, test_set = (
    pd.read_csv("Data/strat_train.csv"),
    pd.read_csv("Data/strat_cv.csv"),
    pd.read_csv("Data/strat_test.csv"),
)

In [5]:
# Preview the first 10 rows of the training split.
train_set.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8156,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,3,,,never,8,0.0
1,7227,city_23,0.899,Male,Has relevent experience,no_enrollment,Graduate,Other,>20,<10,Pvt Ltd,3,49,0.0
2,7302,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,100-500,Pvt Ltd,3,15,0.0
3,27795,city_103,0.92,Male,No relevent experience,no_enrollment,Primary School,,6,,,never,336,0.0
4,6961,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,10000+,Pvt Ltd,2,161,0.0
5,1902,city_162,0.767,,Has relevent experience,Full time course,Graduate,STEM,6,500-999,NGO,1,155,0.0
6,19458,city_127,0.745,Male,No relevent experience,no_enrollment,Masters,STEM,6,,,1,80,1.0
7,15163,city_160,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,10,,,2,28,0.0
8,28089,city_28,0.939,Female,No relevent experience,Part time course,Graduate,STEM,4,50-99,Pvt Ltd,1,24,0.0
9,12350,city_103,0.92,Male,No relevent experience,no_enrollment,Graduate,No Major,>20,50-99,Pvt Ltd,1,9,0.0


In [6]:
# Features: everything except the row identifier and the label column.
job_change = train_set.drop(columns=["enrollee_id", "target"])
# Labels: an independent copy of the target column.
job_change_labels = train_set.loc[:, "target"].copy()

In [7]:
# Check the feature frame after dropping the id and target columns.
job_change.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,3,,,never,8
1,city_23,0.899,Male,Has relevent experience,no_enrollment,Graduate,Other,>20,<10,Pvt Ltd,3,49
2,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,100-500,Pvt Ltd,3,15
3,city_103,0.92,Male,No relevent experience,no_enrollment,Primary School,,6,,,never,336
4,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,10000+,Pvt Ltd,2,161


In [8]:
# Column groups; each group is routed to its own preprocessing pipeline.

# Numeric columns.
num_attribs = [
    'city_development_index',
    'training_hours',
]

# Ordinal categorical columns (categories have a natural order).
ord_cat_attribs = [
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]

# Nominal categorical columns (no order between categories).
nom_cat_attribs = [
    'city',
    'gender',
    'major_discipline',
    'company_type',
]

In [9]:
# List the distinct raw values of every ordinal column (NaN included) so the
# numeric encoding can cover each category.
for ordinal_column in ord_cat_attribs:
    distinct_values = job_change[ordinal_column].unique()
    print(distinct_values)

['No relevent experience' 'Has relevent experience']
['Full time course' 'no_enrollment' 'Part time course' nan]
['Graduate' 'Primary School' 'Masters' nan 'Phd' 'High School']
['3' '>20' '5' '6' '10' '4' '1' '16' '2' '13' '8' '<1' '17' '11' '9' '15'
 '12' '7' '18' '19' '14' '20' nan]
[nan '<10' '100-500' '10000+' '500-999' '50-99' '1000-4999' '5000-9999'
 '10/49']
['never' '3' '2' '1' '>4' '4' nan]


In [10]:
# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median", missing_values=np.nan)),
        ('std_scaler', StandardScaler()),
    ]
)

In [11]:
class OrdToNumeric(TransformerMixin, BaseEstimator):
    """Replace the six ordinal categorical columns with numeric ranks.

    Expects a pandas DataFrame containing the columns ``relevent_experience``,
    ``enrolled_university``, ``education_level``, ``experience``,
    ``company_size`` and ``last_new_job`` and returns a converted copy.
    Missing values (NaN) stay missing so that a later imputation step can
    fill them.

    ``BaseEstimator`` is inherited next to ``TransformerMixin`` so that the
    transformer exposes ``get_params``/``set_params`` and can be cloned by
    scikit-learn utilities.
    """

    # Ordered category -> rank tables. A label that is absent from a table is
    # encoded as None (i.e. treated like a missing value).
    ENROLLED_UNIVERSITY_RANKS = {
        'no_enrollment': 0,
        'Part time course': 1,
        'Full time course': 2,
    }
    EDUCATION_LEVEL_RANKS = {
        "Primary School": 1,
        "High School": 2,
        "Graduate": 3,
        "Masters": 4,
        "Phd": 5,
    }
    COMPANY_SIZE_RANKS = {
        '<10': 0,
        '10/49': 1,
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    }

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ordinal columns encoded as floats.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the six ordinal columns listed in the class docstring.
        y : ignored
            Accepted only for API compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` itself is left untouched. The encoded columns
            are always ``float64`` so that NaN can mark missing entries.

        Raises
        ------
        KeyError
            If one of the expected columns is absent.
        ValueError
            If ``experience`` or ``last_new_job`` holds a label that is neither
            a known special case nor an integer string.
        """
        converters = {
            'relevent_experience': self.relevent_experience_to_numeric,
            'enrolled_university': self.enrolled_university_to_numeric,
            'education_level': self.education_level_to_numeric,
            'experience': self.experience_to_numeric,
            'company_size': self.company_size_to_numeric,
            'last_new_job': self.last_new_job_to_numeric,
        }
        # Work on a copy: assigning into the caller's frame would silently
        # change it (and warns when X is a slice of a larger frame).
        encoded = X.copy()
        for column, to_numeric in converters.items():
            # astype turns the None returned for missing values into NaN and
            # keeps the dtype independent of whether a column has gaps.
            encoded[column] = encoded[column].apply(to_numeric).astype("float64")
        return encoded

    def relevent_experience_to_numeric(self, experience):
        """Return 1 for "Has relevent experience", 0 for any other label, None for NaN."""
        if experience != experience:  # only NaN is unequal to itself
            return None
        return 1 if experience == "Has relevent experience" else 0

    def enrolled_university_to_numeric(self, x):
        """Return the enrollment rank (0-2); None for NaN or an unknown label."""
        if x != x:  # NaN
            return None
        return self.ENROLLED_UNIVERSITY_RANKS.get(x)

    def education_level_to_numeric(self, x):
        """Return the education rank (1-5); None for NaN or an unknown label."""
        if x != x:  # NaN
            return None
        return self.EDUCATION_LEVEL_RANKS.get(x)

    def experience_to_numeric(self, x):
        """Return the experience value as an int ('>20' -> 21, '<1' -> 0); None for NaN."""
        if x != x:  # NaN
            return None
        if x == '>20':
            return 21
        if x == '<1':
            return 0
        return int(x)

    def company_size_to_numeric(self, x):
        """Return the company-size rank (0-7); None for NaN or an unknown label."""
        if x != x:  # NaN
            return None
        return self.COMPANY_SIZE_RANKS.get(x)

    def last_new_job_to_numeric(self, x):
        """Return the last-new-job value as an int ('>4' -> 5, 'never' -> 0); None for NaN."""
        if x != x:  # NaN
            return None
        if x == '>4':
            return 5
        if x == 'never':
            return 0
        return int(x)

In [12]:
# Ordinal columns: encode labels as numbers, fill gaps with the most frequent
# value of each column, then standardise.
ord_pipeline = Pipeline(
    steps=[
        ('ord_to_numeric', OrdToNumeric()),
        ('ord_imputer', SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
        ('ord_std_scaler', StandardScaler()),
    ]
)

In [14]:
# OneHotEncoder renamed its ``sparse`` keyword to ``sparse_output`` (scikit-learn
# 1.2) and later removed the old name, so ``sparse=False`` raises TypeError on
# current releases. Use whichever keyword the installed version accepts.
_dense_output_kwarg = (
    'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'
)

# Nominal columns: fill gaps with a constant placeholder category, one-hot
# encode into a dense array (categories unseen during fit are ignored at
# transform time), then standardise the indicator columns.
nom_pipeline = Pipeline([
    ('nom_imputer', SimpleImputer(strategy="constant")),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', **{_dense_output_kwarg: False})),
    ('nom_std_scaler', StandardScaler())
])

In [15]:
# Route each column group through its own pipeline; the outputs are
# concatenated in the order listed here.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("ord", ord_pipeline, ord_cat_attribs),
        ("nom", nom_pipeline, nom_cat_attribs),
    ]
)

In [16]:
# Fit every sub-pipeline on the training features and transform them in one pass.
job_change_prepared = full_pipeline.fit_transform(job_change)

In [17]:
# Inspect the prepared training matrix (NumPy abbreviates the repr).
job_change_prepared

array([[-1.67239838, -0.9568655 , -1.60575708, ..., -0.23136308,
        -1.02039233,  1.45427135],
       [ 0.56402217, -0.26671559,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958],
       [ 0.78359801, -0.83903503,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958],
       ...,
       [ 0.73480338,  2.00572921,  0.6227592 , ..., -0.23136308,
        -1.02039233, -0.68762958],
       [-1.67239838, -0.41821192,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958],
       [ 0.65347899, -0.31721437,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958]])

In [18]:
# (rows, encoded feature columns) of the prepared training matrix.
job_change_prepared.shape

(11494, 148)

In [19]:
# First prepared row, to eyeball the scaled feature values.
job_change_prepared[0]

array([-1.67239838, -0.9568655 , -1.60575708,  1.93018573, -0.18968192,
       -1.04974153, -0.45156447, -1.19284062, -0.03614878, -0.06870429,
       -0.12142295, -0.06338903, -0.13033857, -0.54202083, -0.13341153,
       -0.06407714, -0.02285354, -0.02086142, -0.01615778, -0.11144337,
       -0.0131922 , -0.27332729, -0.05029351, -0.08049758, -0.02799341,
       -0.03614878, -0.02950893, -0.00932789, -0.01615778, -0.06055974,
       -0.03733592, -0.02639129, -0.06407714, -0.01615778, -0.04852404,
       -0.02639129, -0.02639129, -0.0476149 , -0.17955691, -0.07482847,
       -0.0131922 , -0.03364975, -0.00932789, -0.0406912 , -0.05029351,
       -0.04175011, -0.03733592, -0.05834918, -0.01865821, -0.0777135 ,
       -0.05605274, -0.05283781, -0.02799341, -0.03733592, -0.0520034 ,
       -0.06934056, -0.29566403, -0.21214168, -0.07994832, -0.06199028,
       -0.00932789, -0.02639129, -0.00932789, -0.08528589, -0.03095057,
       -0.03492151, -0.01865821, -0.01615778, -0.02086142, -0.07

In [20]:
# Persist the prepared training features and their labels as comma-separated text.
np.savetxt(fname="Data/X_train.csv", X=job_change_prepared, delimiter=',')
np.savetxt(fname="Data/y_train.csv", X=job_change_labels.to_numpy(), delimiter=',')

# Process the CV and test sets

In [21]:
# Preview the first rows of the CV split.
cv_set.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,14569,city_21,0.624,Male,Has relevent experience,Full time course,Graduate,STEM,2,50-99,Pvt Ltd,2,68,1.0
1,14732,city_28,0.939,Male,No relevent experience,no_enrollment,Phd,STEM,4,,Public Sector,1,13,0.0
2,6007,city_102,0.804,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,100-500,Pvt Ltd,>4,8,0.0
3,26203,city_46,0.762,,No relevent experience,no_enrollment,Graduate,STEM,3,10/49,Pvt Ltd,2,110,1.0
4,436,city_11,0.55,,No relevent experience,Part time course,Graduate,STEM,4,,,,4,0.0


In [22]:
# CV features: everything except the row identifier and the label column.
job_change_cv = cv_set.drop(columns=["enrollee_id", "target"])
# CV labels: an independent copy of the target column.
job_change_cv_labels = cv_set.loc[:, "target"].copy()

In [23]:
# transform only (no fit): reuse the pipeline as fitted on the training features.
job_change_cv_prepared = full_pipeline.transform(job_change_cv)

In [24]:
# (rows, encoded feature columns) of the prepared CV matrix.
job_change_cv_prepared.shape

(3832, 148)

In [25]:
# Inspect the prepared CV matrix (NumPy abbreviates the repr).
job_change_cv_prepared

array([[-1.67239838,  0.05310997,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958],
       [ 0.8893197 , -0.87270088, -1.60575708, ...,  4.32221088,
        -1.02039233, -0.68762958],
       [-0.20855947, -0.9568655 ,  0.6227592 , ..., -0.23136308,
         0.98001521, -0.68762958],
       ...,
       [ 0.75920069, -0.68753871, -1.60575708, ..., -0.23136308,
        -1.02039233,  1.45427135],
       [ 0.46643291, -0.92319965, -1.60575708, ..., -0.23136308,
        -1.02039233,  1.45427135],
       [ 0.73480338, -0.51920946,  0.6227592 , ..., -0.23136308,
        -1.02039233, -0.68762958]])

In [26]:
# Persist the prepared CV features and their labels as comma-separated text.
np.savetxt(fname="Data/X_cv.csv", X=job_change_cv_prepared, delimiter=',')
np.savetxt(fname="Data/y_cv.csv", X=job_change_cv_labels.to_numpy(), delimiter=',')

In [27]:
# Test features: everything except the row identifier and the label column.
job_change_test = test_set.drop(columns=["enrollee_id", "target"])
# Test labels: an independent copy of the target column.
job_change_test_labels = test_set.loc[:, "target"].copy()

In [28]:
# transform only (no fit): reuse the pipeline as fitted on the training features.
job_change_test_prepared = full_pipeline.transform(job_change_test)

In [29]:
# Persist the prepared test features and their labels as comma-separated text.
np.savetxt(fname="Data/X_test.csv", X=job_change_test_prepared, delimiter=',')
np.savetxt(fname="Data/y_test.csv", X=job_change_test_labels.to_numpy(), delimiter=',')