In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's legacy global RNG so that code drawing from it repeats across runs.
np.random.seed(0)

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset and pull out the pieces used below:
# the feature matrix, the regression target, and the feature column names.
data = fetch_california_housing()
X = data.data
y = data.target
feature_names = data.feature_names

In [3]:
# Place the features and the target side by side in a single frame for
# inspection; the target is stored under the 'MedianValue' column.
column_labels = [*feature_names, 'MedianValue']
mat = np.column_stack((X, y))
df = pd.DataFrame(mat, columns=column_labels)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianValue
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Two-stage model: standardize the features, then fit a ridge regressor.
# The step names are the prefixes used later with `named_steps` / `set_params`.
pipeline = Pipeline(
    steps=[
        ('std_scaler', StandardScaler()),
        ('reg', Ridge()),
    ]
)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The seed is passed explicitly so
# that re-running this cell on its own reproduces the same split, instead of
# depending on how much of NumPy's global RNG state has been consumed since
# np.random.seed(0) was called.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [6]:
# Fit every pipeline step (scaler, then regressor) on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('std_scaler', StandardScaler()), ('reg', Ridge())])

In [7]:
# Score the fitted pipeline on the same data it was trained on.
train_score = pipeline.score(X_train, y_train)
train_score_rounded = np.round(train_score, 5)
print('R2 score on the training set:', train_score_rounded)

R2 score on the training set: 0.6089


In [8]:
# Score the fitted pipeline on the held-out split.
test_score = pipeline.score(X_test, y_test)
test_score_rounded = np.round(test_score, 5)
print('R2 score on the test set:', test_score_rounded)

R2 score on the test set: 0.59431


In [19]:
# Predict for a single held-out sample. Slicing with 10:11 keeps the row as a
# 2-D (1, n_features) array, which is the shape passed to predict().
single_sample = X_test[10:11]
y_pred = pipeline.predict(single_sample)
print(y_pred[0])

2.34947873736702


In [20]:
# True target value for the same test sample (index 10) that was predicted above.
print(y_test[10])

2.379


Accessing the pipeline steps

In [17]:
# `steps` is the list of (name, estimator) tuples; index 1 is the second step.
print(pipeline.steps[1])

('reg', Ridge())


In [18]:
# `named_steps` looks a step up by its name and yields the estimator itself.
print(pipeline.named_steps['reg'])

Ridge()


In [46]:
# Parameters of a step are addressed as `<step name>__<parameter>`; this sets
# `alpha` on the 'reg' step. No fit is performed here.
pipeline.set_params(reg__alpha=0.1)

Pipeline(steps=[('std_scaler', StandardScaler()), ('reg', Ridge(alpha=0.1))])