In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Import data

In [2]:
# Load the processed training set; the first CSV column becomes the row index.
train_df = pd.read_csv('../data/processed/training_data.csv', index_col=0)
# Split the label off the features: pop() returns the column and removes it from train_df.
target = train_df.pop('salary')

In [3]:
# Preview the feature frame; the `salary` column was split off into `target` when the data was loaded.
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86
778824,JANITOR,NONE,NONE,WEB,4,86
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97
...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23


## preprocessing / feature engineering

- drop jobId for training

**ordinal encoding**
- jobType
- degree
- industry

**one hot encoding**
-  major


In [4]:
# Category orderings for the ordinal encoder, listed from lowest to highest encoded value.
jobtype_levels = [
    'JANITOR', 'JUNIOR', 'SENIOR', 'MANAGER',
    'VICE_PRESIDENT', 'CFO', 'CTO', 'CEO',
]
degree_levels = ['NONE', 'HIGH_SCHOOL', 'BACHELORS', 'MASTERS', 'DOCTORAL']
industry_levels = [
    'EDUCATION', 'SERVICE', 'AUTO', 'HEALTH', 'WEB', 'FINANCE', 'OIL',
]

# One transformer per encoding strategy. The order of `categories` must line up
# with the ordinal column list handed to the ColumnTransformer below.
ordinal_encoder = OrdinalEncoder(
    categories=[jobtype_levels, degree_levels, industry_levels]
)
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()

# Route each group of columns to its transformer; any column not listed is passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[
        ('ordinal_encoding', ordinal_encoder, ['jobType', 'degree', 'industry']),
        ('one_hot_encoding', one_hot_encoder, ['major']),
        ('std_scaler', scaler, ['yearsExperience', 'milesFromMetropolis']),
    ],
    remainder='passthrough',
)
# Fit on the training features; fit() returns the fitted transformer, which the cell displays.
preprocessing.fit(train_df)

ColumnTransformer(remainder='passthrough',
                  transformers=[('ordinal_encoding',
                                 OrdinalEncoder(categories=[['JANITOR',
                                                             'JUNIOR', 'SENIOR',
                                                             'MANAGER',
                                                             'VICE_PRESIDENT',
                                                             'CFO', 'CTO',
                                                             'CEO'],
                                                            ['NONE',
                                                             'HIGH_SCHOOL',
                                                             'BACHELORS',
                                                             'MASTERS',
                                                             'DOCTORAL'],
                                                            ['EDUCATION',
                 

In [5]:
# Apply the fitted preprocessing to the training features and wrap the result
# in a DataFrame (positional integer column labels) so it renders as a table.
encoded_features = preprocessing.transform(train_df)
pd.DataFrame(encoded_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.278354,1.193676
1,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.664857,1.262933
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.108150,1.262933
3,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.662752,-0.260733
4,4.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555654,1.643850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999990,4.0,4.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.414898,1.505335
999991,5.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.417004,1.332191
999992,2.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.832955,-0.260733
999993,2.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.969500,-0.918680


In [87]:
# Column labels for the transformed matrix, in the order the ColumnTransformer emits
# them: the ordinal columns, then the one-hot `major` columns, then the scaled numerics.
# Fix: read the names from `preprocessing`, the transformer fitted in this notebook;
# the previous reference to `test_ct` is not defined anywhere and raised NameError on a fresh kernel.
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn releases in favour of
# get_feature_names_out(); it is kept here to match the version this notebook was run with.
major_encoder = preprocessing.named_transformers_['one_hot_encoding']
feature_names = (
    ['jobType', 'degree', 'industry']
    + list(major_encoder.get_feature_names())
    + ['yearsExperience', 'milesFromMetropolis']
)

In [85]:
# Inspect the column names produced by the fitted one-hot encoder for `major`.
fitted_major_encoder = preprocessing.named_transformers_['one_hot_encoding']
list(fitted_major_encoder.get_feature_names())

['x0_BIOLOGY',
 'x0_BUSINESS',
 'x0_CHEMISTRY',
 'x0_COMPSCI',
 'x0_ENGINEERING',
 'x0_LITERATURE',
 'x0_MATH',
 'x0_NONE',
 'x0_PHYSICS']

In [39]:
# NOTE(review): the saved output of this cell still shows a `salary` column even though the
# load cell removes it from train_df, and the execution count is out of sequence; this is
# presumably stale output from an earlier kernel state. Re-run top to bottom to confirm.
train_df

Unnamed: 0,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
58414,JUNIOR,HIGH_SCHOOL,NONE,WEB,14,84,80
902618,SENIOR,MASTERS,COMPSCI,SERVICE,24,86,124
778824,JANITOR,NONE,NONE,WEB,4,86,54
187593,SENIOR,HIGH_SCHOOL,NONE,WEB,0,42,80
574438,VICE_PRESIDENT,MASTERS,BUSINESS,AUTO,16,97,89
...,...,...,...,...,...,...,...
259179,VICE_PRESIDENT,DOCTORAL,BUSINESS,OIL,9,93,190
365839,CFO,DOCTORAL,BUSINESS,HEALTH,15,88,126
131933,SENIOR,MASTERS,COMPSCI,HEALTH,18,42,139
671158,SENIOR,DOCTORAL,PHYSICS,SERVICE,5,23,104


# Linear Regression

In [26]:
# Chain the fitted-per-fold preprocessing with an ordinary least-squares regressor
# so both stages are trained together inside cross-validation.
regression_steps = [
    ('preprocess', preprocessing),
    ('regression', LinearRegression()),
]
model_pipe = Pipeline(steps=regression_steps)

In [27]:
# 5-fold cross-validation of the whole pipeline, scored with negative MSE;
# training-fold scores are returned too so they can be compared with the validation folds.
cv_results = cross_validate(
    model_pipe,
    train_df,
    target,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
cv_results

{'fit_time': array([1.30599189, 1.30100727, 1.40293264, 1.32897401, 1.40539503]),
 'score_time': array([0.18628788, 0.20476127, 0.21375203, 0.24072003, 0.20975614]),
 'test_score': array([-395.79117121, -398.84621756, -397.00536153, -397.38275361,
        -394.88793209]),
 'train_score': array([-397.01939713, -396.256854  , -396.71619479, -396.62184885,
        -397.24623143])}