In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import joblib
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, roc_auc_score

In [2]:
# Load the raw census-income table and preview its first row.
RAW_DATA_PATH = 'data/raw/census_income.csv'
df = pd.read_csv(RAW_DATA_PATH)
df.head(1)

Unnamed: 0,id,age,sex,race,marital_status,relationship_label,relationship,functional_weight,education,education_num,workclass,occupation,capital_gain,capital_loss,hours_per_week,native_country,country_name,target
0,1623,17,Male,White,Married-civ-spouse,0,Husband,221129,9th,5,Private,Other-service,0,0,40,39,United-States,<=50K


In [3]:
# Columns removed before modelling.
# NOTE(review): 'relationship', 'education_num' and 'country_name' appear to duplicate
# information carried by 'relationship_label', 'education' and 'native_country' — confirm.
columns_to_drop = ['id', 'relationship', 'education_num', 'country_name']
# Rebind the result instead of mutating with inplace=True, so the drop is an
# explicit assignment and the axis is spelled out by the `columns=` keyword.
df = df.drop(columns=columns_to_drop)
df.head(1)

Unnamed: 0,age,sex,race,marital_status,relationship_label,functional_weight,education,workclass,occupation,capital_gain,capital_loss,hours_per_week,native_country,target
0,17,Male,White,Married-civ-spouse,0,221129,9th,Private,Other-service,0,0,40,39,<=50K


In [4]:
# Replace the string labels in 'target' with integer codes
# (the previewed row shows '<=50K' becoming 0).
le = LabelEncoder()
le.fit(df['target'])
df['target'] = le.transform(df['target'])
df.head(1)

Unnamed: 0,age,sex,race,marital_status,relationship_label,functional_weight,education,workclass,occupation,capital_gain,capital_loss,hours_per_week,native_country,target
0,17,Male,White,Married-civ-spouse,0,221129,9th,Private,Other-service,0,0,40,39,0


### Split the raw data into two sets: one for training the model pipeline and one for validating it

In [5]:
# 'data' is used to train the pipeline; 'validation_data' (1% of the rows)
# is held out for validating the saved model.
data, validation_data = train_test_split(
    df,
    test_size=0.01,
    random_state=42,
)

In [6]:
# Write both splits to CSV files in the working directory, without the index column.
for split_frame, split_path in (
    (data, 'raw_data_for_pipeline.csv'),
    (validation_data, 'raw_data_for_validation_pipeline.csv'),
):
    split_frame.to_csv(split_path, index=False)

###  Data for Pipeline

In [7]:
# Separate the feature columns from the encoded 'target' label.
y_df = data['target']
X_df = data.drop(columns=['target'])

In [8]:
# Hold out 20% of the pipeline data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.20, random_state=42)

In [9]:
# # Separate categorical and numerical columns
# CATEGORICAL_FEATURES = X_train.dtypes[X_train.dtypes == 'object']
# NUMERICAL_FEATURES = X_train.dtypes[X_train.dtypes != 'object']

## Pipeline: preprocessing + classifier

In [10]:

# Column groups fed to the preprocessing step. Any column not listed in either
# group is discarded by the ColumnTransformer (remainder='drop').
CATEGORICAL_FEATURES = ['sex', 'race', 'marital_status', 'education', 'workclass', 'occupation']
# NOTE(review): 'relationship_label' and 'native_country' look like integer-coded
# categories rather than true quantities — confirm that scaling them as numbers is intended.
NUMERICAL_FEATURES = ['age', 'relationship_label', 'functional_weight', 'capital_gain',
                      'capital_loss', 'hours_per_week', 'native_country']

preprocessor = ColumnTransformer(
    [
        ('num_features', RobustScaler(), NUMERICAL_FEATURES),
        # handle_unknown='ignore': a category that never appeared in the training
        # split is encoded as all zeros at predict time. With the default
        # ('error') such a row makes transform/predict raise ValueError.
        ('categ_features', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
    ],
    remainder='drop'
)

# Preprocessing and classifier are chained so they are fitted and saved as one object.
steps = [
    ('data_scaler', preprocessor),
    ('clf', XGBClassifier()),
]

pipe = Pipeline(steps)
print(pipe)

# # now we can save the whole model to pkl for future usage in production web-service

# import joblib
# MODEL_FILEPATH = "models_repo/model.pkl"

# # save model
# joblib.dump(model, MODEL_FILEPATH)

# # load saved model
# model = joblib.load(MODEL_FILEPATH)



Pipeline(memory=None,
         steps=[('data_scaler',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_features',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  ['age', 'relationship_label',
                                                   'functional_weight',
                                                   'capital_gain',
                                                   'capital_loss'

### Fit the pipeline and save it as a 'pkl' file

In [11]:
# Fit the preprocessing steps and the classifier together on the training split.
pipe.fit(X_train, y_train)

MODEL_FILEPATH = "[2020-04-15]-census_income_clf-[with OneHot].v1.pkl"

# Persist the fitted pipeline (joblib is imported in the notebook's import cell).
joblib.dump(pipe, MODEL_FILEPATH)

# Reload from disk so later cells use the saved artifact rather than the in-memory object.
# NOTE: joblib.load unpickles arbitrary objects — only load files from a trusted source.
model = joblib.load(MODEL_FILEPATH)

In [12]:
# Score the reloaded pipeline on the held-out test split.
y_test_pipe_pred = model.predict(X_test)
y_test_pipe_prob_pred = model.predict_proba(X_test)

# ROC AUC is computed from the second predict_proba column (the scores for class 1);
# accuracy and F1 are computed from the hard class predictions.
report_lines = [
    " PipeLine XGBoost classifier:",
    '------------------------------',
    f" - ROC AUC _score: {roc_auc_score(y_test, y_test_pipe_prob_pred[:, 1]): .3f}",
    '--------------------------',
    f" - accuracy_score: {accuracy_score(y_test, y_test_pipe_pred): .3f}",
    f" - f1_score: {f1_score(y_test, y_test_pipe_pred): .3f}",
]
print("\n".join(report_lines))

 PipeLine XGBoost classifier:
------------------------------
 - ROC AUC _score:  0.913
--------------------------
 - accuracy_score:  0.854
 - f1_score:  0.656
