In [1]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
import numpy as np
import pandas as pd

In [3]:
# train_set = pd.read_csv("Data/train.csv")
# cv_set = pd.read_csv("Data/cv.csv")
# test_set = pd.read_csv("Data/test.csv")

In [4]:
# Load the train / cross-validation / test splits from the `strat_*` CSV files.
train_set, cv_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/strat_train.csv",
        "Data/strat_cv.csv",
        "Data/strat_test.csv",
    )
)

In [5]:
# Preview the first ten rows of the training split.
train_set.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,3769,city_21,0.624,Male,Has relevent experience,no_enrollment,Masters,STEM,4,50-99,Pvt Ltd,1,5,0.0
1,27927,city_27,0.848,Male,No relevent experience,no_enrollment,Masters,STEM,12,,,2,17,1.0
2,22830,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,100-500,Pvt Ltd,4,37,0.0
3,8987,city_138,0.836,Male,Has relevent experience,Full time course,High School,,3,50-99,Pvt Ltd,1,10,0.0
4,5958,city_71,0.884,,Has relevent experience,no_enrollment,Masters,Humanities,2,5000-9999,Pvt Ltd,1,26,0.0
5,29916,city_103,0.92,Male,No relevent experience,no_enrollment,Graduate,STEM,5,,,>4,13,1.0
6,4124,city_67,0.855,Male,No relevent experience,,High School,,7,,,never,18,0.0
7,25594,city_75,0.939,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,,Pvt Ltd,never,80,0.0
8,364,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,1000-4999,Public Sector,>4,23,0.0
9,26775,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,100-500,Pvt Ltd,>4,114,0.0


In [6]:
# Separate the label from the model inputs: `target` becomes the label series,
# and both `target` and the `enrollee_id` identifier are removed from the
# feature frame.
job_change_labels = train_set["target"].copy()
job_change = train_set.drop(columns=["enrollee_id", "target"])

In [7]:
# Check the feature frame after removing the identifier and the label.
job_change.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,city_21,0.624,Male,Has relevent experience,no_enrollment,Masters,STEM,4,50-99,Pvt Ltd,1,5
1,city_27,0.848,Male,No relevent experience,no_enrollment,Masters,STEM,12,,,2,17
2,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,100-500,Pvt Ltd,4,37
3,city_138,0.836,Male,Has relevent experience,Full time course,High School,,3,50-99,Pvt Ltd,1,10
4,city_71,0.884,,Has relevent experience,no_enrollment,Masters,Humanities,2,5000-9999,Pvt Ltd,1,26


In [8]:
# Column groups, one per preprocessing branch of the pipeline.

# Numeric columns.
num_attribs = [
    'city_development_index',
    'training_hours',
]

# Categorical columns whose values have a natural order.
ord_cat_attribs = [
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]

# Categorical columns with no order among their values.
nom_cat_attribs = [
    'city',
    'gender',
    'major_discipline',
    'company_type',
]

In [9]:
# List the distinct raw values (including NaN) of every ordinal column; these
# are the values the ordinal-to-numeric encoding has to handle.
for attribs in ord_cat_attribs:
    print(job_change[attribs].unique())

['Has relevent experience' 'No relevent experience']
['no_enrollment' 'Full time course' nan 'Part time course']
['Masters' 'Graduate' 'High School' 'Phd' nan 'Primary School']
['4' '12' '9' '3' '2' '5' '7' '>20' '19' '15' '16' '11' '10' '<1' '6' '13'
 '8' '1' '18' '14' '20' '17' nan]
['50-99' nan '100-500' '5000-9999' '1000-4999' '<10' '10000+' '500-999'
 '10/49']
['1' '2' '4' '>4' 'never' '3' nan]


In [10]:
# Numeric branch: replace missing values with the column median, then
# standardize each column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

In [11]:
class OrdToNumeric(TransformerMixin, BaseEstimator):
    """Encode the ordinal categorical columns as numeric ranks.

    The columns ``relevent_experience``, ``enrolled_university``,
    ``education_level``, ``experience``, ``company_size`` and
    ``last_new_job`` are each replaced by a number that preserves the order
    of their categories. Missing values are emitted as NaN so a downstream
    imputer can fill them.

    The transformer is stateless: ``fit`` learns nothing from the data.
    """

    # Category -> rank tables for the columns with a closed set of labels.
    # A label that is not listed is encoded as missing.
    _ENROLLED_UNIVERSITY_RANKS = {
        'no_enrollment': 0,
        'Part time course': 1,
        'Full time course': 2,
    }
    _EDUCATION_LEVEL_RANKS = {
        "Primary School": 1,
        "High School": 2,
        "Graduate": 3,
        "Masters": 4,
        "Phd": 5,
    }
    _COMPANY_SIZE_RANKS = {
        '<10': 0,
        '10/49': 1,
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    }

    # Open-ended labels of the otherwise integer-valued columns.
    _EXPERIENCE_OPEN_ENDED = {'>20': 21, '<1': 0}
    _LAST_NEW_JOB_OPEN_ENDED = {'>4': 5, 'never': 0}

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the encoding is fixed."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ordinal columns encoded as floats.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the six ordinal columns named in the class
            docstring. Any other column is carried over unchanged.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` itself is not modified. The encoded columns
            have a float dtype with NaN marking missing values.

        Raises
        ------
        ValueError
            If ``experience`` or ``last_new_job`` holds a label that is
            neither a known open-ended label nor parseable by ``int``.
        """
        encoders = {
            'relevent_experience': self.relevent_experience_to_numeric,
            'enrolled_university': self.enrolled_university_to_numeric,
            'education_level': self.education_level_to_numeric,
            'experience': self.experience_to_numeric,
            'company_size': self.company_size_to_numeric,
            'last_new_job': self.last_new_job_to_numeric,
        }
        # Work on a copy: assigning into the caller's frame would overwrite
        # its raw string columns and triggers pandas' SettingWithCopyWarning
        # when X is a slice of another frame.
        encoded = X.copy()
        for column, encoder in encoders.items():
            # The explicit float cast turns the helpers' ``None`` into NaN even
            # when every value in the column is missing (pandas would
            # otherwise keep an object column of ``None``, which a NaN-based
            # imputer does not recognise as missing).
            encoded[column] = encoded[column].apply(encoder).astype(float)
        return encoded

    def relevent_experience_to_numeric(self, experience):
        """Map ``"Has relevent experience"`` to 1 and any other label to 0.

        Missing input (NaN, ``None``, ``pd.NA``) returns ``None``.
        """
        if pd.isna(experience):
            return None
        return 1 if experience == "Has relevent experience" else 0

    def enrolled_university_to_numeric(self, x):
        """Rank the enrollment label: none (0) < part time (1) < full time (2).

        Missing or unrecognized input returns ``None``.
        """
        if pd.isna(x):
            return None
        return self._ENROLLED_UNIVERSITY_RANKS.get(x)

    def education_level_to_numeric(self, x):
        """Rank the education label from Primary School (1) to Phd (5).

        Missing or unrecognized input returns ``None``.
        """
        if pd.isna(x):
            return None
        return self._EDUCATION_LEVEL_RANKS.get(x)

    def experience_to_numeric(self, x):
        """Convert the experience label to an integer.

        ``'>20'`` becomes 21 and ``'<1'`` becomes 0; every other label is
        parsed with ``int``. Missing input returns ``None``.
        """
        if pd.isna(x):
            return None
        if x in self._EXPERIENCE_OPEN_ENDED:
            return self._EXPERIENCE_OPEN_ENDED[x]
        return int(x)

    def company_size_to_numeric(self, x):
        """Rank the company-size bucket from ``'<10'`` (0) to ``'10000+'`` (7).

        Missing or unrecognized input returns ``None``.
        """
        if pd.isna(x):
            return None
        return self._COMPANY_SIZE_RANKS.get(x)

    def last_new_job_to_numeric(self, x):
        """Convert the last-new-job label to an integer.

        ``'>4'`` becomes 5 and ``'never'`` becomes 0; every other label is
        parsed with ``int``. Missing input returns ``None``.
        """
        if pd.isna(x):
            return None
        if x in self._LAST_NEW_JOB_OPEN_ENDED:
            return self._LAST_NEW_JOB_OPEN_ENDED[x]
        return int(x)

In [12]:
# Ordinal branch: encode the ordered categories as numbers, fill missing
# values with each column's most frequent value, then standardize.
ord_pipeline = Pipeline(
    steps=[
        ('ord_to_numeric', OrdToNumeric()),
        ('ord_imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('ord_std_scaler', StandardScaler()),
    ]
)

In [13]:
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Convert a sparse matrix into a dense NumPy array.

    Intended as a pipeline step between a transformer that emits a sparse
    matrix and one that needs a dense array. The transformer is stateless.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array.

        Sparse containers (anything exposing ``toarray``) are densified.
        Input that is already dense is passed through ``np.asarray`` instead
        of raising ``AttributeError``, so the step still works if the
        preceding transformer is configured to produce dense output.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [14]:
# Nominal branch: fill missing entries with a constant placeholder (no
# `fill_value` is given, so the imputer's default is used), one-hot encode
# while ignoring categories unseen during fit, densify the encoder output,
# then standardize the indicator columns.
nom_pipeline = Pipeline(
    steps=[
        ('nom_imputer', SimpleImputer(strategy="constant")),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
        ('to_array', DenseTransformer()),
        ('nom_std_scaler', StandardScaler()),
    ]
)

In [15]:
# Route each column group through its own preprocessing branch and concatenate
# the results in the order num, ord, nom. Columns not named in any group are
# left out of the output (ColumnTransformer's default `remainder` setting).
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("ord", ord_pipeline, ord_cat_attribs),
        ("nom", nom_pipeline, nom_cat_attribs),
    ]
)

In [16]:
# Fit every preprocessing branch on the training features and transform them
# in the same call. This is the only place the pipeline is fitted.
job_change_prepared = full_pipeline.fit_transform(job_change)

In [17]:
# Inspect the prepared training matrix.
job_change_prepared

array([[-1.64652041, -1.01234952,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       [ 0.16373428, -0.81131152, -1.59884705, ..., -0.22905834,
        -1.00944077,  1.4364654 ],
       [ 0.66478691, -0.47624819,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       ...,
       [ 0.47891255, -1.02910269, -1.59884705, ..., -0.22905834,
         0.99064753, -0.69615321],
       [-1.17779375, -0.57676719,  0.6254507 , ..., -0.22905834,
        -1.00944077,  1.4364654 ],
       [ 0.74560186,  1.46711915,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321]])

In [18]:
# (rows, features) of the prepared training matrix.
job_change_prepared.shape

(11494, 149)

In [19]:
# Look at the first prepared row to see the scaled feature values.
job_change_prepared[0]

array([-1.64652041, -1.01234952,  0.6254507 , -0.56910299,  1.27340643,
       -0.89205111, -0.44972363, -0.58496153, -0.03733592, -0.06675982,
       -0.11919018, -0.06407714, -0.12648831, -0.54013976, -0.12684285,
       -0.05909509, -0.02086142, -0.01865821, -0.01615778, -0.11691722,
       -0.01615778, -0.27461187, -0.05115552, -0.08528589, -0.02950893,
       -0.03364975, -0.02799341, -0.01615778, -0.0131922 , -0.06543199,
       -0.03733592, -0.02285354, -0.06338903, -0.01615778, -0.04852404,
       -0.02468571, -0.02285354, -0.04668821, -0.18032988, -0.0754141 ,
       -0.01615778, -0.03492151, -0.00932789, -0.04175011, -0.05446872,
       -0.0437917 , -0.03848666, -0.05759374, -0.02285354, -0.07364355,
       -0.06127912, -0.05115552, -0.03095057, -0.03733592, -0.05115552,
       -0.06543199, -0.29217041, -0.20966649, -0.08580188, -0.05983183,
       -0.00932789, -0.02639129, -0.00932789, -0.08528589, -0.02639129,
       -0.03960418, -0.01865821, -0.00932789, -0.01865821, -0.08

In [20]:
# Persist the prepared training matrix and its labels as comma-separated text.
np.savetxt(
    fname="Data/X_train.csv",
    X=job_change_prepared,
    delimiter=',',
)
np.savetxt(
    fname="Data/y_train.csv",
    X=job_change_labels.to_numpy(),
    delimiter=',',
)

### Undersampling

In [21]:
from imblearn.under_sampling import RandomUnderSampler

In [22]:
# Random undersampler: `sampling_strategy="majority"` asks imbalanced-learn to
# resample only the majority class; the fixed `random_state` makes the random
# draw repeatable across runs.
undersample = RandomUnderSampler(sampling_strategy="majority", random_state=42)

In [23]:
# Fit the sampler on the prepared training data and resample it in one call.
# Only the training split is resampled here.
X_under, y_under = undersample.fit_resample(job_change_prepared, job_change_labels.to_numpy())

In [24]:
# Training matrix size before undersampling, for comparison with X_under.
job_change_prepared.shape

(11494, 149)

In [25]:
# Training matrix size after undersampling.
X_under.shape

(5830, 149)

In [26]:
# Persist the undersampled training matrix and labels as comma-separated text.
np.savetxt(
    fname="Data/X_train_under.csv",
    X=X_under,
    delimiter=',',
)
np.savetxt(
    fname="Data/y_train_under.csv",
    X=y_under,
    delimiter=',',
)

# Process test and cross-validation sets

In [27]:
# Preview the first rows of the test split.
test_set.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,32080,city_11,0.55,,Has relevent experience,no_enrollment,Graduate,STEM,2,100-500,Pvt Ltd,2,46,0.0
1,2804,city_16,0.91,,No relevent experience,no_enrollment,High School,,3,,,never,9,0.0
2,2660,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,100-500,Funded Startup,>4,6,0.0
3,11999,city_36,0.893,,No relevent experience,Full time course,Graduate,STEM,3,1000-4999,Pvt Ltd,,16,1.0
4,8397,city_103,0.92,,No relevent experience,,High School,,2,,,never,46,0.0


In [28]:
# Same feature/label separation as for the training split: keep `target` as
# the label series and remove it, along with `enrollee_id`, from the features.
job_change_test_labels = test_set["target"].copy()
job_change_test = test_set.drop(columns=["enrollee_id", "target"])

In [29]:
# Apply the pipeline that was fitted on the training features. `transform`
# (not `fit_transform`) is used so nothing is re-learned from the test split.
job_change_test_prepared = full_pipeline.transform(job_change_test)

In [30]:
# (rows, features) of the prepared test matrix; the feature count should match
# the prepared training matrix.
job_change_test_prepared.shape

(3832, 149)

In [31]:
# Inspect the prepared test matrix.
job_change_test_prepared

array([[-2.24455098, -0.32546969,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       [ 0.66478691, -0.94533686, -1.59884705, ..., -0.22905834,
        -1.00944077,  1.4364654 ],
       [ 0.74560186, -0.99559636,  0.6254507 , ..., -0.22905834,
        -1.00944077, -0.69615321],
       ...,
       [ 0.22030474, -0.27521019,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       [ 0.66478691,  1.24932798,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       [-2.01018765,  0.14361898,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321]])

In [32]:
# Persist the prepared test matrix and its labels as comma-separated text.
np.savetxt(
    fname="Data/X_test.csv",
    X=job_change_test_prepared,
    delimiter=',',
)
np.savetxt(
    fname="Data/y_test.csv",
    X=job_change_test_labels.to_numpy(),
    delimiter=',',
)

In [33]:
# Same feature/label separation for the cross-validation split: keep `target`
# as the label series and remove it, along with `enrollee_id`, from the
# features.
job_change_cv_labels = cv_set["target"].copy()
job_change_cv = cv_set.drop(columns=["enrollee_id", "target"])

In [34]:
# Apply the pipeline that was fitted on the training features. `transform`
# (not `fit_transform`) is used so nothing is re-learned from the
# cross-validation split.
job_change_cv_prepared = full_pipeline.transform(job_change_cv)

In [35]:
# Persist the prepared cross-validation matrix and its labels as
# comma-separated text.
np.savetxt(
    fname="Data/X_cv.csv",
    X=job_change_cv_prepared,
    delimiter=',',
)
np.savetxt(
    fname="Data/y_cv.csv",
    X=job_change_cv_labels.to_numpy(),
    delimiter=',',
)