In [1]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
import numpy as np
import pandas as pd

In [3]:
# train_set = pd.read_csv("Data/train.csv")
# cv_set = pd.read_csv("Data/cv.csv")
# test_set = pd.read_csv("Data/test.csv")

In [4]:
# Load the train / cv / test splits in one pass; the files are read in the
# same order as the names on the left-hand side.
train_set, cv_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/strat_train.csv",
        "Data/strat_cv.csv",
        "Data/strat_test.csv",
    )
)

In [5]:
# Preview the first 10 rows of the training frame to check the columns and raw value formats.
train_set.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,27633,city_104,0.924,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,1000-4999,Pvt Ltd,3,81,0.0
1,13681,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10/49,Pvt Ltd,>4,21,0.0
2,5164,city_114,0.926,Male,Has relevent experience,Full time course,Graduate,STEM,>20,1000-4999,NGO,>4,28,0.0
3,19363,city_75,0.939,Male,Has relevent experience,no_enrollment,Graduate,STEM,17,,,>4,46,0.0
4,30516,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,10/49,Pvt Ltd,>4,94,0.0
5,17774,city_21,0.624,Male,Has relevent experience,,Graduate,STEM,4,,,1,16,0.0
6,1444,city_103,0.92,Male,No relevent experience,no_enrollment,High School,,6,,,never,66,0.0
7,12113,city_11,0.55,Male,Has relevent experience,Part time course,Graduate,STEM,3,,,1,13,0.0
8,3708,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,10000+,Pvt Ltd,>4,290,0.0
9,5248,city_21,0.624,Male,No relevent experience,Full time course,High School,,5,,Pvt Ltd,never,53,0.0


In [6]:
# Separate the label column from the features. The identifier column is
# removed from the feature frame as well.
job_change_labels = train_set["target"].copy()
job_change = train_set.drop(columns=["enrollee_id", "target"])

In [7]:
# Check that enrollee_id and target are no longer part of the feature frame.
job_change.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,city_104,0.924,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,1000-4999,Pvt Ltd,3,81
1,city_114,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10/49,Pvt Ltd,>4,21
2,city_114,0.926,Male,Has relevent experience,Full time course,Graduate,STEM,>20,1000-4999,NGO,>4,28
3,city_75,0.939,Male,Has relevent experience,no_enrollment,Graduate,STEM,17,,,>4,46
4,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,10/49,Pvt Ltd,>4,94


In [8]:
# Column groups; each group is routed to its own preprocessing pipeline.

# Numeric columns.
num_attribs = [
    "city_development_index",
    "training_hours",
]

# Categorical columns whose labels have a natural order.
ord_cat_attribs = [
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "experience",
    "company_size",
    "last_new_job",
]

# Categorical columns without an order.
nom_cat_attribs = [
    "city",
    "gender",
    "major_discipline",
    "company_type",
]

In [9]:
# List the distinct raw labels of every ordinal column (NaN included) so the
# label-to-number mappings can be written against the real values.
for ord_col in ord_cat_attribs:
    print(job_change[ord_col].unique())

['Has relevent experience' 'No relevent experience']
['no_enrollment' 'Full time course' nan 'Part time course']
['Graduate' 'High School' nan 'Masters' 'Primary School' 'Phd']
['3' '>20' '17' '5' '4' '6' '14' '11' '9' '12' '10' '2' '18' '15' '13' '8'
 '19' '7' '<1' '1' '16' '20' nan]
['1000-4999' '10/49' nan '10000+' '100-500' '50-99' '5000-9999' '<10'
 '500-999']
['3' '>4' '1' 'never' '2' '4' nan]


In [10]:
# Numeric features: fill missing values with the column median, then
# standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
    ("std_scaler", StandardScaler()),
])

In [11]:
class OrdToNumeric(TransformerMixin, BaseEstimator):
    """Replace the ordinal categorical columns with integer ranks.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    applies fixed label-to-rank rules to the columns ``relevent_experience``,
    ``enrolled_university``, ``education_level``, ``experience``,
    ``company_size`` and ``last_new_job``.

    Missing entries (NaN or None) are encoded as ``None`` so they stay missing
    in the output and can be filled by an imputer placed after this step.

    ``BaseEstimator`` is inherited so the class exposes ``get_params`` /
    ``set_params`` like any other scikit-learn estimator.
    """

    # Label -> rank tables. A label that is absent from its table encodes to
    # None, i.e. it is treated as missing.
    _ENROLLED_UNIVERSITY_RANKS = {
        'no_enrollment': 0,
        'Part time course': 1,
        'Full time course': 2,
    }
    _EDUCATION_LEVEL_RANKS = {
        "Primary School": 1,
        "High School": 2,
        "Graduate": 3,
        "Masters": 4,
        "Phd": 5,
    }
    _COMPANY_SIZE_RANKS = {
        '<10': 0,
        '10/49': 1,
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    }
    # Non-numeric tokens of the otherwise integer-valued columns.
    _EXPERIENCE_TOKENS = {'>20': 21, '<1': 0}
    _LAST_NEW_JOB_TOKENS = {'>4': 5, 'never': 0}

    def fit(self, X, y=None):
        """No-op required by the estimator API; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the six ordinal columns encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain all six ordinal columns. Any other column is left
            as it is.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X`` is not modified.

        Raises
        ------
        KeyError
            If one of the six expected columns is absent from ``X``.
        ValueError
            If ``experience`` or ``last_new_job`` holds a label that is
            neither one of the special tokens nor an integer string.
        """
        # Work on a copy: assigning into the caller's frame would silently
        # overwrite the raw labels (and can raise SettingWithCopyWarning when
        # X is a slice of another frame).
        encoded = X.copy()
        column_encoders = {
            'relevent_experience': self.relevent_experience_to_numeric,
            'enrolled_university': self.enrolled_university_to_numeric,
            'education_level': self.education_level_to_numeric,
            'experience': self.experience_to_numeric,
            'company_size': self.company_size_to_numeric,
            'last_new_job': self.last_new_job_to_numeric,
        }
        for column, encoder in column_encoders.items():
            encoded[column] = encoded[column].apply(encoder)
        return encoded

    @staticmethod
    def _is_missing(x):
        """True for None and for NaN (NaN is the only value unequal to itself)."""
        return x is None or x != x

    def relevent_experience_to_numeric(self, experience):
        """1 for "Has relevent experience", 0 for any other label, None if missing."""
        if self._is_missing(experience):
            return None
        return 1 if experience == "Has relevent experience" else 0

    def enrolled_university_to_numeric(self, x):
        """Rank 0-2 by enrolment intensity; None if missing or unknown."""
        if self._is_missing(x):
            return None
        return self._ENROLLED_UNIVERSITY_RANKS.get(x)

    def education_level_to_numeric(self, x):
        """Rank 1 (Primary School) to 5 (Phd); None if missing or unknown."""
        if self._is_missing(x):
            return None
        return self._EDUCATION_LEVEL_RANKS.get(x)

    def experience_to_numeric(self, x):
        """Experience label as an int; '>20' -> 21, '<1' -> 0, None if missing."""
        if self._is_missing(x):
            return None
        if x in self._EXPERIENCE_TOKENS:
            return self._EXPERIENCE_TOKENS[x]
        return int(x)

    def company_size_to_numeric(self, x):
        """Rank 0 ('<10') to 7 ('10000+'); None if missing or unknown."""
        if self._is_missing(x):
            return None
        return self._COMPANY_SIZE_RANKS.get(x)

    def last_new_job_to_numeric(self, x):
        """Last-new-job label as an int; '>4' -> 5, 'never' -> 0, None if missing."""
        if self._is_missing(x):
            return None
        if x in self._LAST_NEW_JOB_TOKENS:
            return self._LAST_NEW_JOB_TOKENS[x]
        return int(x)

In [12]:
# Ordinal features: encode labels as ranks, fill missing values with the most
# frequent value, then standardise.
ord_pipeline = Pipeline(steps=[
    ("ord_to_numeric", OrdToNumeric()),
    ("ord_imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ("ord_std_scaler", StandardScaler()),
])

In [13]:
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Convert a sparse matrix into a dense NumPy array.

    Stateless: ``fit`` learns nothing. ``BaseEstimator`` is inherited so the
    class exposes ``get_params`` / ``set_params`` like any other scikit-learn
    estimator.
    """

    def fit(self, X, y=None):
        """No-op required by the estimator API; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array.

        Sparse inputs (anything exposing ``toarray``) are densified. Input
        that is already dense is passed through ``np.asarray`` instead of
        failing with ``AttributeError``, so the step keeps working if the
        preceding encoder is configured to emit dense output.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [14]:
# Nominal features: fill missing values with a constant placeholder, one-hot
# encode (categories not seen during fit are ignored instead of raising),
# densify the sparse result, then standardise.
nom_pipeline = Pipeline(steps=[
    ("nom_imputer", SimpleImputer(strategy="constant")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
    ("to_array", DenseTransformer()),
    ("nom_std_scaler", StandardScaler()),
])

In [15]:
# Route each column group to its pipeline; the outputs are concatenated in the
# order num, ord, nom.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("ord", ord_pipeline, ord_cat_attribs),
    ("nom", nom_pipeline, nom_cat_attribs),
])

In [16]:
# Fit the preprocessing on the training features and transform them in the same call.
job_change_prepared = full_pipeline.fit_transform(job_change)

In [17]:
# Display the prepared training matrix.
job_change_prepared

array([[ 7.73871691e-01,  2.67937771e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 7.90027924e-01, -7.39113423e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 7.90027924e-01, -6.21624117e-01,  6.20986207e-01, ...,
        -2.27637444e-01, -1.01412917e+00, -6.93824169e-01],
       ...,
       [ 7.41559225e-01, -2.52372013e-01,  6.20986207e-01, ...,
         4.39295040e+00, -1.01412917e+00, -6.93824169e-01],
       [ 8.95043437e-01, -6.09213950e-04,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 2.16481659e-01, -8.73386916e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01]])

In [18]:
# (rows, columns) of the prepared training matrix.
job_change_prepared.shape

(12260, 147)

In [19]:
# Inspect the first prepared row.
job_change_prepared[0]

array([ 0.77387169,  0.26793777,  0.62098621, -0.57110826, -0.19537633,
       -1.043521  ,  1.11621768,  0.61924501, -0.03499985, -0.06136913,
       -0.11998676, -0.05932694, -0.12646752, -0.54253223,  7.82518387,
       -0.06834459, -0.02212772, -0.01564475, -0.01806573, -0.11499191,
       -0.27637437, -0.05195137, -0.08205762, -0.02857143, -0.03614917,
       -0.02857143, -0.01277336, -0.01277336, -0.06399222, -0.04142254,
       -0.02390165, -0.06953898, -0.01564475, -0.04869317, -0.02555298,
       -0.02212772, -0.04335373, -0.17541898, -0.07300721, -0.01564475,
       -0.03381169, -0.00903176, -0.03939747, -0.05195137, -0.04239905,
       -0.03726323, -0.05575973, -0.02019893, -0.07244047, -0.06069592,
       -0.0527348 , -0.02857143, -0.03726323, -0.05115608, -0.06589212,
       -0.29213001, -0.21193407, -0.08053267, -0.06136913, -0.00903176,
       -0.02390165, -0.09068453, -0.02857143, -0.0404226 , -0.01806573,
       -0.01277336, -0.01277336, -0.07468218, -0.02212772, -0.04

In [20]:
# Persist the prepared training features and their labels as comma-separated text.
np.savetxt("Data/X_train.csv", job_change_prepared, delimiter=',')
np.savetxt("Data/y_train.csv", job_change_labels.to_numpy(), delimiter=',')

### Undersampling

In [21]:
from imblearn.under_sampling import RandomUnderSampler

In [22]:
# Random under-sampler with a fixed random_state so the resampling is repeatable.
# NOTE(review): sampling_strategy="majority" is expected to resample only the
# majority class — confirm against the imbalanced-learn documentation.
undersample = RandomUnderSampler(sampling_strategy="majority", random_state=42)

In [23]:
# Fit the sampler on the prepared training matrix and resample it; the labels
# are passed as a NumPy array.
X_under, y_under = undersample.fit_resample(job_change_prepared, job_change_labels.to_numpy())

In [24]:
# Shape of the full prepared training matrix, for comparison with the under-sampled one.
job_change_prepared.shape

(12260, 147)

In [25]:
# Shape of the under-sampled training matrix.
X_under.shape

(6134, 147)

In [26]:
# Persist the under-sampled training features and labels as comma-separated text.
np.savetxt("Data/X_train_under.csv", X_under, delimiter=',')
np.savetxt("Data/y_train_under.csv", y_under, delimiter=',')

# Process test set

In [27]:
# Preview the first rows of the test frame.
test_set.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,17840,city_65,0.802,Male,Has relevent experience,Full time course,High School,,7,50-99,Pvt Ltd,1,17,0.0
1,16510,city_173,0.878,Male,Has relevent experience,no_enrollment,Graduate,STEM,10,<10,Pvt Ltd,>4,218,0.0
2,25453,city_160,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,>4,72,0.0
3,29151,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,100-500,Pvt Ltd,>4,276,0.0
4,27705,city_114,0.926,,Has relevent experience,no_enrollment,Masters,STEM,>20,10000+,Pvt Ltd,never,57,0.0


In [28]:
# Separate the label column from the test features. The identifier column is
# removed from the feature frame as well.
job_change_test_labels = test_set["target"].copy()
job_change_test = test_set.drop(columns=["enrollee_id", "target"])

In [29]:
# Transform only (no re-fitting): the test features go through the preprocessing
# that was fitted on the training features.
job_change_test_prepared = full_pipeline.transform(job_change_test)

In [30]:
# (rows, columns) of the prepared test matrix.
job_change_test_prepared.shape

(3832, 147)

In [31]:
# Display the prepared test matrix.
job_change_test_prepared

array([[-2.11658510e-01, -8.06250170e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 4.02278337e-01,  2.56737133e+00,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 7.41559225e-01,  1.16880092e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       ...,
       [ 7.90027924e-01, -1.18098520e-01, -1.61034172e+00, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [ 6.60778061e-01, -4.53782251e-01,  6.20986207e-01, ...,
        -2.27637444e-01,  9.86067681e-01, -6.93824169e-01],
       [-1.95502277e-01, -6.09213950e-04,  6.20986207e-01, ...,
        -2.27637444e-01, -1.01412917e+00,  1.44128735e+00]])

In [32]:
# Persist the prepared test features and their labels as comma-separated text.
np.savetxt("Data/X_test.csv", job_change_test_prepared, delimiter=',')
np.savetxt("Data/y_test.csv", job_change_test_labels.to_numpy(), delimiter=',')

In [33]:
# Separate the label column from the cv features. The identifier column is
# removed from the feature frame as well.
job_change_cv_labels = cv_set["target"].copy()
job_change_cv = cv_set.drop(columns=["enrollee_id", "target"])

In [34]:
# Transform only (no re-fitting): the cv features go through the preprocessing
# that was fitted on the training features.
job_change_cv_prepared = full_pipeline.transform(job_change_cv)

In [35]:
# Persist the prepared cv features and their labels as comma-separated text.
np.savetxt("Data/X_cv.csv", job_change_cv_prepared, delimiter=',')
np.savetxt("Data/y_cv.csv", job_change_cv_labels.to_numpy(), delimiter=',')