# Pipelines

In [28]:
import pandas as pd 
import numpy as np

# Toy categorical column used by the dummy-variable example below.
label = ['Country']
values = ['USA', 'China', 'Canada']

# Pair the single column name with its whole list of values, giving the
# {column name: values} mapping that the DataFrame constructor expects.
zipped = {column: entries for column, entries in zip(label, [values])}
data_in_frame = pd.DataFrame(data=zipped)
print(data_in_frame)

  Country
0     USA
1   China
2  Canada


# Dummy Variables

In [25]:
#Two ways to get dummies, via pandas .get_dummies() or Scikit-Learn OneHotEncoder()
# dtype=int keeps the indicator columns as 0/1 integers; from pandas 2.0 the
# default dtype is bool, which would display True/False instead.
data_dummies = pd.get_dummies(data_in_frame, dtype=int)
data_dummies

Unnamed: 0,Country_Canada,Country_China,Country_USA
0,0,0,1
1,0,1,0
2,1,0,0


# Imputer

In [32]:
# Single numeric column with two missing entries (np.nan) for the imputation demo.
label = ['GDP']
values = [70, 72, 78, np.nan, 68, np.nan]

# Map the column name onto its full list of values and build the frame from it.
zipped = dict(zip(label, [values]))
GDP = pd.DataFrame(data=zipped)
print(GDP)

    GDP
0  70.0
1  72.0
2  78.0
3   NaN
4  68.0
5   NaN


In [39]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22. SimpleImputer is its replacement; it has no `axis` argument and
# always imputes column by column.
from sklearn.impute import SimpleImputer

# Fill every missing value (np.nan) with the mean of the observed values in its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column mean and fills the gaps in one step.
# The result is an array, so GDP is rebound from a DataFrame to an array here.
GDP = imp.fit_transform(GDP)
GDP

array([[ 70.],
       [ 72.],
       [ 78.],
       [ 72.],
       [ 68.],
       [ 72.]])

# Imputing using a Pipeline

In [50]:
from sklearn.pipeline import Pipeline
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in scikit-learn 0.22.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Breast-cancer dataset bundled with scikit-learn: feature matrix X and target vector y.
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Mean-impute missing values column-wise (SimpleImputer has no `axis` argument).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
scale = StandardScaler()
logreg = LogisticRegression()

steps = [('imputation', imp), #steps must be in order of transformers and last is an estimator
         ('scaler', scale),
         ('logistic_regression', logreg)]

pipeline = Pipeline(steps)

# Hold out 40% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fitting the pipeline fits each transformer on the training data only, then
# the classifier on the transformed training data.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Last expression of the cell: score on the held-out set.
pipeline.score(X_test, y_test)

0.98245614035087714

In [None]:
#Ways to Normalize Data
#Standardization: Subtract the mean and divide by the standard deviation
#    -All features center around 0 and have variance 1
    
#Subtract the minimum and divide by the range 
#    -Minimum 0, maximum 1

#Other approaches include scaling between -1 and 1

In [None]:
#Pipelines and Hyperparameter Tuning

In [None]:
# Names used below that the cells above never import; without these the cell
# raises NameError on a fresh kernel.
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Setup the pipeline
# (Pipeline, StandardScaler, train_test_split, X and y come from the earlier pipeline cell.)
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
# Keys follow the '<step name>__<parameter>' convention so the grid search
# can route each value to the 'SVM' step of the pipeline.
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
#Another example

In [None]:
# Names used below that the cells above never import. SimpleImputer replaces
# sklearn.preprocessing.Imputer, which was removed in scikit-learn 0.22.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Setup the pipeline steps: steps
# (Pipeline, StandardScaler, train_test_split, X and y come from the earlier pipeline cell.)
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
# 30 evenly spaced L1/L2 mixing ratios between 0 and 1 for the 'elasticnet' step.
parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}

# Create train and test sets
# NOTE(review): X and y are still the breast-cancer data loaded earlier, so this
# regression is fit on class labels; swap in a regression dataset if a
# meaningful R squared is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
gm_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = gm_cv.score(X_test, y_test)
# The search tunes l1_ratio, not alpha, so the message is labelled accordingly.
print("Tuned ElasticNet l1_ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
