In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
from sklearn.base import TransformerMixin

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
# Load the training data from disk.
strat_train_set = pd.read_csv("Data/train.csv")

In [6]:
# Preview the first ten rows of the training data.
strat_train_set.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,22601,city_61,0.913,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Funded Startup,2,304,0.0
1,32926,city_104,0.924,Female,No relevent experience,Part time course,Graduate,STEM,19,10000+,Pvt Ltd,4,81,0.0
2,11499,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,10,<10,Pvt Ltd,2,76,0.0
3,23859,city_100,0.887,Male,Has relevent experience,Part time course,Graduate,STEM,>20,,Pvt Ltd,2,53,0.0
4,4916,city_128,0.527,Male,No relevent experience,no_enrollment,Graduate,STEM,5,,,1,92,0.0
5,20823,city_114,0.926,Male,Has relevent experience,no_enrollment,Masters,STEM,16,100-500,Pvt Ltd,>4,37,0.0
6,2363,city_75,0.939,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,1,302,1.0
7,6066,city_160,0.92,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Pvt Ltd,>4,96,0.0
8,18130,city_67,0.855,Female,No relevent experience,no_enrollment,Masters,Humanities,3,50-99,Pvt Ltd,2,9,0.0
9,14848,city_101,0.558,Male,No relevent experience,Part time course,Graduate,STEM,2,10/49,Pvt Ltd,never,71,1.0


In [7]:
# Split the training frame into the feature columns and the `target` labels.
job_change = strat_train_set.drop(columns=["target"])
job_change_labels = strat_train_set["target"].copy()

In [8]:
# Preview the feature frame now that `target` has been removed.
job_change.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,22601,city_61,0.913,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Funded Startup,2,304
1,32926,city_104,0.924,Female,No relevent experience,Part time course,Graduate,STEM,19,10000+,Pvt Ltd,4,81
2,11499,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,10,<10,Pvt Ltd,2,76
3,23859,city_100,0.887,Male,Has relevent experience,Part time course,Graduate,STEM,>20,,Pvt Ltd,2,53
4,4916,city_128,0.527,Male,No relevent experience,no_enrollment,Graduate,STEM,5,,,1,92


In [9]:
# Column names grouped by the kind of preprocessing each group receives:
# numeric, ordinal categorical, and nominal categorical.
num_attribs = [
    'city_development_index',
    'training_hours',
]
ord_cat_attribs = [
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]
nom_cat_attribs = [
    'city',
    'gender',
    'major_discipline',
    'company_type',
]

In [10]:
# Print the distinct raw values of every ordinal column, one array per column.
for column in ord_cat_attribs:
    levels = job_change[column].unique()
    print(levels)

['Has relevent experience' 'No relevent experience']
['no_enrollment' 'Part time course' 'Full time course' nan]
['Graduate' 'Masters' 'High School' nan 'Phd' 'Primary School']
['6' '19' '10' '>20' '5' '16' '3' '2' '11' '9' '12' '8' '4' '7' '14' '<1'
 '15' '1' '18' '13' '17' '20' nan]
['50-99' '10000+' '<10' nan '100-500' '1000-4999' '10/49' '500-999'
 '5000-9999']
['2' '4' '1' '>4' 'never' '3' nan]


In [11]:
# Numeric columns: replace NaN with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [12]:
class OrdToNumeric(TransformerMixin):
    """Encode the ordinal categorical columns as numbers.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    replaces the string levels of the six ordinal columns
    (``relevent_experience``, ``enrolled_university``, ``education_level``,
    ``experience``, ``company_size``, ``last_new_job``) with numeric ranks.
    Missing values (``None`` or NaN) are encoded as ``None`` so that a later
    imputation step can fill them in.
    """

    # (column name, encoder method name) pairs applied by ``transform``.
    _COLUMN_ENCODERS = (
        ('relevent_experience', 'relevent_experience_to_numeric'),
        ('enrolled_university', 'enrolled_university_to_numeric'),
        ('education_level', 'education_level_to_numeric'),
        ('experience', 'experience_to_numeric'),
        ('company_size', 'company_size_to_numeric'),
        ('last_new_job', 'last_new_job_to_numeric'),
    )

    # Level -> rank tables, listed from the lowest level to the highest.
    _ENROLLED_UNIVERSITY_RANKS = {
        'no_enrollment': 0,
        'Part time course': 1,
        'Full time course': 2,
    }
    _EDUCATION_LEVEL_RANKS = {
        'Primary School': 1,
        'High School': 2,
        'Graduate': 3,
        'Masters': 4,
        'Phd': 5,
    }
    _COMPANY_SIZE_RANKS = {
        '<10': 0,
        '10/49': 1,  # spelled with a slash in the data
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    }

    # Open-ended levels that cannot be parsed with int().
    _EXPERIENCE_OPEN_ENDED = {'>20': 21, '<1': 0}
    _LAST_NEW_JOB_OPEN_ENDED = {'>4': 5, 'never': 0}

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the encoding has no learned state."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ordinal columns encoded numerically.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the six ordinal columns. It is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame with the same columns, the ordinal ones encoded.
        """
        # Work on a copy: assigning into X would overwrite the caller's frame,
        # and transforming that frame a second time would then re-encode the
        # already-numeric values into wrong ranks.
        X = X.copy()
        for column, method_name in self._COLUMN_ENCODERS:
            X[column] = X[column].apply(getattr(self, method_name))
        return X

    @staticmethod
    def _is_missing(value):
        """Return True for ``None`` or NaN (NaN is never equal to itself)."""
        return value is None or value != value

    def relevent_experience_to_numeric(self, experience):
        """Return 1 for "Has relevent experience", 0 otherwise, None if missing."""
        if self._is_missing(experience):
            return None
        return 1 if experience == "Has relevent experience" else 0

    def enrolled_university_to_numeric(self, x):
        """Return the enrollment rank (0-2); None if missing or unrecognised."""
        if self._is_missing(x):
            return None
        return self._ENROLLED_UNIVERSITY_RANKS.get(x)

    def education_level_to_numeric(self, x):
        """Return the education rank (1-5); None if missing or unrecognised."""
        if self._is_missing(x):
            return None
        return self._EDUCATION_LEVEL_RANKS.get(x)

    def experience_to_numeric(self, x):
        """Return the experience value as an int; '>20' -> 21 and '<1' -> 0.

        Returns None if missing. Any other value goes through ``int()``, which
        raises ``ValueError`` for a string that is not a whole number.
        """
        if self._is_missing(x):
            return None
        open_ended = self._EXPERIENCE_OPEN_ENDED.get(x)
        return int(x) if open_ended is None else open_ended

    def company_size_to_numeric(self, x):
        """Return the company-size rank (0-7); None if missing or unrecognised."""
        if self._is_missing(x):
            return None
        return self._COMPANY_SIZE_RANKS.get(x)

    def last_new_job_to_numeric(self, x):
        """Return the last-new-job value as an int; '>4' -> 5 and 'never' -> 0.

        Returns None if missing. Any other value goes through ``int()``, which
        raises ``ValueError`` for a string that is not a whole number.
        """
        if self._is_missing(x):
            return None
        open_ended = self._LAST_NEW_JOB_OPEN_ENDED.get(x)
        return int(x) if open_ended is None else open_ended

In [13]:
# Ordinal columns: encode the levels as numbers, fill the gaps with the most
# frequent value of each column, then standardise.
ord_pipeline = Pipeline(steps=[
    ('ord_to_numeric', OrdToNumeric()),
    ('ord_imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('ord_std_scaler', StandardScaler()),
])

In [14]:
class DenseTransformer(TransformerMixin):
    """Convert a sparse matrix into a dense NumPy array.

    Stateless pipeline step: ``fit`` learns nothing. In ``nom_pipeline`` it is
    placed directly after the one-hot encoder.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (such as SciPy sparse matrices) are
        densified with it. Anything else is treated as already dense and
        passed through ``np.asarray`` instead of raising ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [15]:
# Nominal columns: fill the gaps with a constant placeholder (SimpleImputer's
# default fill value), one-hot encode while ignoring categories unseen during
# fit, densify, then standardise.
nom_pipeline = Pipeline(steps=[
    ('nom_imputer', SimpleImputer(strategy="constant")),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ('to_array', DenseTransformer()),
    ('nom_std_scaler', StandardScaler()),
])

In [16]:
# Route each column group through its own sub-pipeline. Columns that belong to
# no group are dropped (ColumnTransformer's default `remainder`).
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("ord", ord_pipeline, ord_cat_attribs),
    ("nom", nom_pipeline, nom_cat_attribs),
])

In [17]:
# Fit the preprocessing pipeline on the training features and transform them.
job_change_prepared = full_pipeline.fit_transform(job_change)

In [18]:
# Display the prepared training matrix.
job_change_prepared

array([[ 0.68455576,  3.96026626,  0.62706628, ..., -0.22545945,
        -1.02519997, -0.69725532],
       [ 0.77366348,  0.25806456, -1.59472775, ..., -0.22545945,
         0.97541946, -0.69725532],
       [ 0.78986488,  0.17505555,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532],
       ...,
       [-2.19119315,  0.35767537, -1.59472775, ..., -0.22545945,
        -1.02519997, -0.69725532],
       [ 0.74126067, -0.05736967,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532],
       [ 0.47393753,  0.65650779,  0.62706628, ..., -0.22545945,
        -1.02519997, -0.69725532]])

In [19]:
# Shape of the prepared training matrix.
job_change_prepared.shape

(11494, 146)

In [20]:
# Inspect the first prepared row.
job_change_prepared[0]

array([ 6.84555765e-01,  3.96026626e+00,  6.27066281e-01, -5.69227755e-01,
       -1.90202618e-01, -6.08606696e-01, -4.58221359e-01,  2.54777696e-02,
       -3.23282244e-02, -6.67598172e-02, -1.22158656e-01, -6.54319919e-02,
       -1.25776422e-01, -5.35434218e-01, -1.24701446e-01, -6.93405552e-02,
       -2.08614210e-02, -1.86582102e-02, -2.46857149e-02, -1.19938795e-01,
       -1.31921989e-02, -2.76804005e-01, -5.20033954e-02, -7.93953688e-02,
       -2.79934068e-02, -3.73359220e-02, -2.63912881e-02, -1.86582102e-02,
       -6.12791201e-02, -3.84866641e-02, -2.63912881e-02, -6.74140937e-02,
       -1.31921989e-02, -4.76148951e-02, -2.63912881e-02, -1.86582102e-02,
       -4.66882105e-02, -1.75384073e-01, -7.82780364e-02, -1.61577809e-02,
       -3.96041761e-02, -3.61487764e-02, -4.47778542e-02, -4.17501053e-02,
       -3.49215148e-02, -5.52663347e-02, -1.86582102e-02, -7.82780364e-02,
       -6.26934947e-02, -5.11555241e-02, -2.63912881e-02, -3.36497481e-02,
       -5.20033954e-02, -

In [21]:
# Write the prepared training features and the labels to comma-separated files.
np.savetxt("Data/X_train.csv", job_change_prepared, delimiter=',')
np.savetxt("Data/y_train.csv", job_change_labels.to_numpy(), delimiter=',')

In [22]:
from imblearn.under_sampling import RandomUnderSampler

In [23]:
# Random undersampler with a fixed seed so the resampling is reproducible;
# sampling_strategy="majority" resamples only the majority class.
undersample = RandomUnderSampler(sampling_strategy="majority", random_state=42)

In [24]:
# Fit the undersampler on the prepared training data and resample it in one step.
X_under, y_under = undersample.fit_resample(job_change_prepared, job_change_labels.to_numpy())

In [25]:
# Shape before undersampling, for comparison with X_under.
job_change_prepared.shape

(11494, 146)

In [26]:
# Shape after undersampling.
X_under.shape

(5790, 146)

In [27]:
# Write the undersampled training features and labels to comma-separated files.
np.savetxt("Data/X_train_under.csv", X_under, delimiter=',')
np.savetxt("Data/y_train_under.csv", y_under, delimiter=',')

# Process test set

In [28]:
# Load the test data from disk.
test_set = pd.read_csv("Data/test.csv")

In [29]:
# Preview the first rows of the test data.
test_set.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,6992,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,500-999,Pvt Ltd,1,21,0.0
1,8637,city_103,0.92,Female,Has relevent experience,no_enrollment,Masters,Humanities,>20,100-500,Funded Startup,2,74,0.0
2,24729,city_104,0.924,,Has relevent experience,no_enrollment,Graduate,STEM,9,10/49,Pvt Ltd,1,94,0.0
3,10933,city_21,0.624,Male,Has relevent experience,no_enrollment,Masters,STEM,15,10000+,Pvt Ltd,1,75,0.0
4,28023,city_134,0.698,Male,No relevent experience,no_enrollment,Masters,STEM,12,500-999,NGO,1,157,0.0


In [30]:
# Split the test frame into the feature columns and the `target` labels.
job_change_test = test_set.drop(columns=["target"])
job_change_test_labels = test_set["target"].copy()

In [31]:
# Apply the pipeline fitted on the training data; transform only, no refitting.
job_change_test_prepared = full_pipeline.transform(job_change_test)

In [32]:
# Shape of the prepared test matrix.
job_change_test_prepared.shape

(3832, 146)

In [33]:
# Display the prepared test matrix.
job_change_test_prepared

array([[ 0.66025366, -0.73804352,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532],
       [ 0.74126067,  0.14185195,  0.62706628, ..., -0.22545945,
        -1.02519997, -0.69725532],
       [ 0.77366348,  0.47388798,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532],
       ...,
       [-1.65654688, -1.02027414,  0.62706628, ..., -0.22545945,
        -1.02519997, -0.69725532],
       [-2.25599876, -1.05347775,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532],
       [ 0.55494455, -1.00367234,  0.62706628, ..., -0.22545945,
         0.97541946, -0.69725532]])

In [34]:
# Write the prepared test features to a comma-separated file.
np.savetxt("Data/X_test.csv", job_change_test_prepared, delimiter=',')

In [35]:
# Write the test labels to a comma-separated file.
np.savetxt("Data/y_test.csv", job_change_test_labels.to_numpy(), delimiter=',')

In [36]:
# Load the data stored in Data/cv.csv.
cv_set = pd.read_csv("Data/cv.csv")

In [37]:
# Split the cv frame into the feature columns and the `target` labels.
job_change_cv = cv_set.drop(columns=["target"])
job_change_cv_labels = cv_set["target"].copy()

In [38]:
# Apply the pipeline fitted on the training data; transform only, no refitting.
job_change_cv_prepared = full_pipeline.transform(job_change_cv)

In [39]:
# Write the prepared cv features and labels to comma-separated files.
np.savetxt("Data/X_cv.csv", job_change_cv_prepared, delimiter=',')
np.savetxt("Data/y_cv.csv", job_change_cv_labels.to_numpy(), delimiter=',')