In [1]:
import pandas as pd
import numpy as np
housing = pd.read_csv("housing.csv")

# Later cells use `housing_target`, `housing_num` and `housing_cat`; they are
# derived here so the notebook runs top to bottom on a fresh kernel.
TARGET_COLUMN = "median_house_value"

# Kept as a one-column DataFrame (double brackets) rather than a Series.
housing_target = housing[[TARGET_COLUMN]]

# Everything except the target is a candidate feature.
housing_features = housing.drop(columns=TARGET_COLUMN)

# NOTE(review): assumes every non-numeric column is categorical — confirm
# against the actual columns in housing.csv.
housing_num = housing_features.select_dtypes(include="number")
housing_cat = housing_features.select_dtypes(exclude="number")

# Transformation Pipeline
- The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps. 
- All but the last estimator must be transformers (i.e., they must have a fit_transform() method). 
- The names can be anything you like (as long as they are unique and don’t contain double underscores “__”)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numerical preprocessing: fill missing values with each column's median,
# then rescale every column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

# Fit the pipeline on the numerical housing columns and transform them in one pass.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Wrap the transformed array back into a DataFrame. Reusing housing_num's index
# (not only its columns) keeps the row labels aligned with the original frame
# even when that frame does not have a default 0..n-1 index.
housing_num_tr_df = pd.DataFrame(
    housing_num_tr,
    columns=housing_num.columns,
    index=housing_num.index,
)

## Transform categorical data and numerical data in single transformer

The `ColumnTransformer` constructor requires a list of tuples, where each tuple contains:
- a name,
- a transformer,
- and a list of names (or indices) of the columns that the transformer should be applied to.

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names handled by each branch of the combined transformer.
num_attribs = list(housing_num)
cat_attribs = list(housing_cat)

# handle_unknown="ignore" controls what happens when transform() meets a
# category that was not seen during fit (for example, a category that appears
# only in the test set): it is encoded as all zeros instead of raising an error.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ]
)

# Fit both branches on the housing frame; the result is returned as a matrix.
housing_prepared = full_pipeline.fit_transform(housing)

Note that the `OneHotEncoder` returns a sparse matrix, while the `num_pipeline` returns a dense matrix. When there is such a mix of sparse and dense matrices, the `ColumnTransformer` estimates the density of the final matrix (i.e., the ratio of non-zero cells), and it returns a sparse matrix if the density is lower than a given threshold (by default, `sparse_threshold=0.3`).

# Training model and Evaluating on training set

### Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression

# Train a linear regression on the prepared features.
# fit() returns the estimator itself, so construction and training are chained.
lin_reg = LinearRegression().fit(housing_prepared, housing_target)
lin_reg

In [10]:
# Take the first five rows and their targets for a quick sanity check.
some_data = housing.head(5)
some_target = housing_target.head(5)
some_data_prepared = full_pipeline.transform(some_data)

# Show the model's predictions followed by the actual target values.
print(lin_reg.predict(some_data_prepared))
print(some_target)

[[408504.]
 [424036.]
 [378476.]
 [321124.]
 [255856.]]
   median_house_value
0            452600.0
1            358500.0
2            352100.0
3            341300.0
4            342200.0


### Root mean squared error

In [12]:
from sklearn.metrics import mean_squared_error as mse
# Predict on the same data the model was trained on, then take the square
# root of the mean squared error so the error is in the target's own units.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mse(y_true=housing_target, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68709.30331593033

### Evaluation using train test split

- split the data into a training set and a testing set
- train the model using the training set and test the model using the testing set

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): housing_prepared was produced by fitting the preprocessing
# pipeline on the full dataset, so the imputation/scaling statistics already
# include the held-out rows — consider splitting before fitting the pipeline.
train_set_features, test_set_features, train_set_target, test_set_target = train_test_split(
    housing_prepared,
    housing_target,
    test_size=0.2,
    random_state=42,
)

# Seed the tree as well: without random_state the fitted tree (and the RMSE
# reported below) can change from one run to the next.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_set_features, train_set_target)

def rmse(model, test_features, test_target):
    """Return the root mean squared error of ``model`` on the given data.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        test_features: Feature matrix passed to ``model.predict``.
        test_target: True target values the predictions are compared against.

    Returns:
        The square root of the mean squared error between ``test_target``
        and the model's predictions.
    """
    return np.sqrt(mse(test_target, model.predict(test_features)))

# Evaluate the decision tree on the held-out test set.
rmse(tree_reg, test_set_features, test_set_target)

69055.08189486346

# Remaining topics

- find correlations
- fine tuning
   - grid search
   - randomized search
