# Pipeline for Regression

Build a scalable ElasticNet regression pipeline on the Gapminder dataset. The pipeline's ElasticNet `l1_ratio` hyperparameter is tuned using GridSearchCV.

In [37]:
#Loading the requisite packages
#Scikit-Learn packages
from sklearn.preprocessing import scale, StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import ElasticNet
#Computation and Visualization Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [38]:
# Read the Gapminder CSV into a DataFrame, then print its column summary
df = pd.read_csv(filepath_or_buffer='gapminder_dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 9 columns):
population         139 non-null float64
fertility          139 non-null float64
HIV                139 non-null float64
CO2                139 non-null float64
BMI_male           139 non-null float64
GDP                139 non-null float64
BMI_female         139 non-null float64
life               139 non-null float64
child_mortality    139 non-null float64
dtypes: float64(9)
memory usage: 9.8 KB


In [39]:
# Preview the first five rows of the dataset
df.head(5)

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality
0,34811060.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5
1,19842250.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0
2,40381860.0,2.24,0.5,4.78517,27.5017,14646.0,118.8915,75.5,15.4
3,2975029.0,1.4,0.1,1.804106,25.35542,7383.0,132.8108,72.5,20.0
4,21370350.0,1.96,0.1,18.016313,27.56373,41312.0,117.3755,81.5,5.2


In [40]:
# Preview the last five rows of the dataset
df.tail(5)

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality
134,3350832.0,2.11,0.5,2.489764,26.39123,15317.0,124.2604,76.0,13.0
135,26952720.0,2.46,0.1,4.476669,25.32054,3733.0,124.3462,68.7,49.2
136,86589340.0,1.86,0.4,1.479347,20.9163,4085.0,121.9367,75.4,26.2
137,13114580.0,5.88,13.6,0.148982,20.68321,3039.0,132.4493,52.0,94.9
138,13495460.0,3.85,15.1,0.654323,22.0266,1286.0,131.9745,49.0,98.3


In [41]:
# Target is the 'life' column; every remaining column is used as a feature
y = df['life'].values
X = df.drop('life', axis='columns').values

In [42]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [43]:
# Pipeline stages, applied in order:
#   1. 'imputation' - replace NaN entries with the column mean
#   2. 'scaler'     - standardize each feature
#   3. 'elasticnet' - ElasticNet regressor with its default settings
# NOTE(review): Imputer was removed from scikit-learn in 0.22 (superseded by
# sklearn.impute.SimpleImputer), so this cell needs an older release -
# confirm the target environment before upgrading.
steps = [
    ('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('scaler', StandardScaler()),
    ('elasticnet', ElasticNet()),
]
pipeline = Pipeline(steps)

# Search grid: 30 evenly spaced l1_ratio values covering [0, 1]. The
# '<step name>__<parameter>' key routes each value to the 'elasticnet' step.
parameters = dict(elasticnet__l1_ratio=np.linspace(0, 1, num=30))

In [44]:
# Grid-search the pipeline over the l1_ratio candidates using 5-fold
# cross-validation, fitting on the training split only
gm_cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5)
gm_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputation', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('elasticnet', ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'elasticnet__l1_ratio': array([ 0.     ,  0.03448,  0.06897,  0.10345,  0.13793,  0.17241,
        0.2069 ,  0.24138,  0.27586,  0.31034,  0.34483,  0.37931,
        0.41379,  0.44828,  0.48276,  0.51724,  0.55172,  0.58621,
        0.62069,  0.65517,  0.68966,  0.72414,  0.75862,  0.7931 ,
        0.82759,  0.86207,  0.89655,  0.93103,  0.96552,  1.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring

In [45]:
# Score the tuned search object on the held-out test split; the result is
# reported below as R squared.
r2 = gm_cv.score(X_test, y_test)
# The grid only varies 'elasticnet__l1_ratio' (alpha is never tuned), so the
# label names l1_ratio rather than alpha.
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))

Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 1.0}
Tuned ElasticNet R squared: 0.886201657089
