# Housing Prices Advanced Regression Challenge
## (End-to-End Machine Learning Project)

## Importing the Data

In [None]:
import pandas as pd
housing = pd.read_csv('../input/housing/housing.csv')

## Taking a Quick Look at the Data Structure

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize= (20,15))
plt.show()

## Creating a Test Set

In [None]:
import numpy as np 

In [None]:
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Rows are picked by position (``iloc``), so any index works.
    test_ratio : float
        Fraction of the rows, between 0 and 1, to put in the test set. The test
        set holds ``int(len(data) * test_ratio)`` rows, i.e. the size is rounded down.
    random_state : int or None, optional
        Seed for the shuffle. With ``None`` (the default) the global NumPy random
        state is used, exactly as before, so successive calls give different
        splits unless ``np.random.seed`` was called first. Pass an integer to get
        the same split on every call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        # A negative ratio would silently slice from the wrong end of the shuffle.
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A private generator keeps the global random state untouched.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
len(train_set), len(test_set)

In [None]:
from zlib import crc32

In [None]:
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

In [None]:
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test sets from a hash of each row's identifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; must contain ``id_column``.
    test_ratio : float
        Ratio passed on to ``test_set_check`` for every identifier.
    id_column : str
        Name of the column holding the identifiers.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``: the rows whose identifier is rejected,
        respectively accepted, by ``test_set_check``.
    """
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    # `~` is the element-wise logical NOT for a boolean mask; unary minus is an
    # arithmetic operator and is not the supported way to invert booleans.
    return data.loc[~in_test_set], data.loc[in_test_set]

In [None]:
# Use the row index as the identifier that gets hashed by split_train_test_by_id.
housing_with_id = housing.reset_index() #adds an 'index' column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
# Alternative identifier computed from each row's longitude and latitude,
# then the same hash-based split is repeated on that column.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

In [None]:
# Bucket median_income into five labelled categories (1-5); the last bin is
# open-ended (up to infinity). This column is what the stratified split below
# stratifies on.
housing["income_cat"] = pd.cut(housing["median_income"],
                              bins = [0., 1.5,3.0, 4.5, 6., np.inf],
                              labels = [1, 2, 3, 4, 5])

In [None]:
housing["income_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Stratified 80/20 split on income_cat, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
# n_splits = 1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() returns *positional* row indices, so select with .iloc; .loc would only
# give the same rows while `housing` still carries its default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

In [None]:
# income_cat is dropped from both sets once the stratified split has been made.
# Re-binding the names (instead of drop(..., inplace = True)) and passing
# errors = "ignore" keeps this cell safe to run more than once: a second run
# no longer raises KeyError because the column is already gone.
strat_train_set = strat_train_set.drop("income_cat", axis = 1, errors = "ignore")
strat_test_set = strat_test_set.drop("income_cat", axis = 1, errors = "ignore")

## Discovering and Visualizing the Data to Gain Insights

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x ="longitude",y="latitude", alpha = 0.1)

In [None]:
housing.plot(kind="scatter", x ="longitude",y="latitude", alpha = 0.4, s = housing["population"]/100,
            label = "population", figsize = (10,7), c = "median_house_value", cmap = plt.get_cmap("jet"),
            colorbar = True)
plt.legend()

### Looking for Correlations

We can easily compute the standard correlation coefficient (also called Pearson's r) between every pair of attributes using the corr() method:

In [None]:
# Pearson's r is only defined for numeric columns. Select them explicitly:
# since pandas 2.0, corr() no longer drops non-numeric columns silently and
# raises on the text column "ocean_proximity".
corr_matrix = housing.select_dtypes(include = [np.number]).corr()

In [None]:
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize = (12,8));

In [None]:
housing.plot(kind = "scatter", x ="median_income", y = "median_house_value", alpha = 0.1)

### Experimenting with Attribute Combinations

In [None]:
# Derive per-household and per-room ratios from the raw totals; their
# correlation with median_house_value is inspected in the next cell.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
# Recompute the correlations now that the ratio attributes exist. Only numeric
# columns are passed to corr(): since pandas 2.0 it raises on text columns such
# as "ocean_proximity" instead of dropping them silently.
corr_matrix = housing.select_dtypes(include = [np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

## Preparing the Data for Machine Learning Algorithms

In [None]:
# Start again from the stratified training set and separate the predictors from
# the target. drop() returns a new frame, so strat_train_set keeps its
# median_house_value column.
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

### Data Cleaning

Treating missing values of attribute "total_bedrooms"
Three options:
1. Get rid of the corresponding rows (the districts with a missing value).
2. Get rid of the whole attribute.
3. Set the values to some value (zero, the mean, the median, etc.)

In [None]:
# Options 1 and 2 return new DataFrames. Their results are not assigned, so they
# are shown for illustration only and leave `housing` unchanged.
housing.dropna(subset=["total_bedrooms"])   #option 1
housing.drop("total_bedrooms",axis = 1)     #option 2
# Option 3: fill the gaps with the median. The filled column is assigned back
# rather than calling fillna(inplace = True) on the selected column: the in-place
# form operates on an intermediate object and, under pandas copy-on-write, never
# updates `housing`.
median = housing["total_bedrooms"].median() #option 3
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [None]:
# a handy class to take care of missing values
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy = "median")

In [None]:
housing_num = housing.drop("ocean_proximity", axis = 1)

In [None]:
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
#Transforming the training set by replacing missing values by the learned medians
X = imputer.transform(housing_num)

In [None]:
# The imputer returns a plain NumPy array; wrap it back into a DataFrame.
# Passing index = housing_num.index keeps the original row labels. Without it the
# frame gets a fresh 0..n-1 index that no longer lines up with `housing` or
# `housing_labels`.
housing_tr = pd.DataFrame(X, columns = housing_num.columns, index = housing_num.index)

### Handling Text and Categorical Attributes

In [None]:
housing_cat = housing[["ocean_proximity"]]

In [None]:
housing_cat.head(10)

In [None]:
#Converting from texts to numbers

from sklearn.preprocessing import OrdinalEncoder

In [None]:
ordinal_encoder = OrdinalEncoder()

In [None]:
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [None]:
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
# OneHotEncoder class to convert categorical values into one-hot vectors

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

The output is a SciPy sparse matrix instead of a NumPy array.
A sparse matrix stores only the locations of the non-zero elements, instead of using up tons of memory mostly to store zeros.

In [None]:
housing_cat_1hot.toarray() # If you really want to convert it to a dense NumPy array

In [None]:
cat_encoder.categories_

### Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

In [None]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The source columns are located through the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    The columns are appended on the right in this order: rooms per household,
    population per household and, only when ``add_bedrooms_per_room`` is true,
    bedrooms per room.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column as well.
    """

    def __init__(self, add_bedrooms_per_room = True): # explicit keyword only, no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Nothing is learned from ``X``; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        extra_columns = [
            X[:, rooms_ix] / households,        # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])  # bedrooms per room
        return np.c_[tuple([X] + extra_columns)]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room =False)
housing_extra_attribs = attr_adder.transform(housing.values)

### Feature Scaling

There are two common ways to get all attributes to have the same scale: min-max scaling(normalization) and standardization.

- Min-max scaling: values are shifted and rescaled so that they end up ranging from 0 to 1. We do this by subtracting the min value and dividing by the max minus the min. Scikit-Learn provides MinMaxScaler for this.

- Standardization: first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance.

### Transformation Pipelines

There are many transformation steps that need to be executed in the right order.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric preprocessing, applied in this order: fill missing values with the
# column median, append the ratio attributes, then standardize every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = "median")),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [None]:
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer # A single transformer able to handle all columns

In [None]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

In [None]:
# Route the numeric columns through num_pipeline and one-hot encode the
# categorical column; each entry is (name, transformer, columns).
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(),cat_attribs),
])

In [None]:
housing_prepared = full_pipeline.fit_transform(housing)

## Selecting and Training a model
### Training and Evaluating on the Training Set

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeRegressor

In [None]:
# Seed the estimator so the fitted tree, and every score computed from it,
# is the same on each run of the notebook.
tree_reg = DecisionTreeRegressor(random_state = 42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

### Better Evaluation using Cross-Validation

Scikit-Learn's K-fold cross-validation feature: This splits the training set into 10 distinct subsets called folds, then it trains and evaluates the Decision Tree model 10 times, picking a different fold for evaluation every time and training on the other 9 folds. The result is an array containing the 10 evaluation scores.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scoring is the *negative*
# MSE, so the scores are negated before taking the square root to get RMSEs.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object with ``mean()`` and ``std()`` methods
        Scores to summarize, e.g. a NumPy array of per-fold RMSE values.
    """
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed right before it is printed, line by line.
    for label, compute in report:
        print(label, compute(scores))

In [None]:
display_scores(tree_rmse_scores)

In [None]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv=10)

In [None]:
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
# Random_Forest

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so training and the scores below are reproducible between runs.
forest_reg = RandomForestRegressor(random_state = 42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv=10)

In [None]:
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

## Fine-Tuning the Model
### Grid Search
Scikit-Learn's GridSearchCV takes the hyperparameters to experiment with and the values to try, as supplied by the user, and evaluates all the possible combinations of hyperparameter values using cross-validation.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two grids are searched one after the other:
#   1. 3 x 4 = 12 combinations of n_estimators and max_features;
#   2. 2 x 3 = 6 combinations with bootstrap set to False.
param_grid = [
    {'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]} 
]

In [None]:
# Seeded base estimator for the grid search, so the comparison between
# hyperparameter combinations is the same on every run.
forest_reg = RandomForestRegressor(random_state = 42)

In [None]:
grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring='neg_mean_squared_error', return_train_score=True)

In [None]:
grid_search.fit(housing_prepared, housing_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_

In [None]:
# Print the cross-validated RMSE of every hyperparameter combination. The grid
# search scores with negative MSE, hence the sign flip before the square root.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

### Randomized Search
When the hyperparameter search space is large, it is often preferable to use RandomizedSearchCV instead.
It evaluates a given number of random combinations by selecting a random value for each hyperparameter at every iteration.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

### Ensemble Methods
Another way to fine-tune the system is to try to combine the models that perform best. The group (or "ensemble") will often perform better than the best individual model.

### Analyzing the Best Models and Their Errors

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Names for the columns appended by CombinedAttributesAdder, in the order it adds them.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# Fetch the one-hot encoder fitted inside the ColumnTransformer to read its category names.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# Same order as the prepared columns: numeric attributes, added ratios, one-hot categories.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Pair every importance with its feature name, most important first.
sorted(zip(feature_importances, attributes), reverse = True)

### Evaluating the System on the Test Set

In [None]:
final_model = grid_search.best_estimator_

In [None]:
# Separate predictors and labels of the held-out test set.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only (no fit): the test set goes through the pipeline that was
# already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

In [None]:
from scipy import stats

In [None]:
# 95% confidence interval for the test-set RMSE: build a t-interval around the
# mean of the squared errors (scale = their standard error), then take the
# square root of both bounds.
confidence = 0.95
squared_errors = (final_predictions - y_test)**2
np.sqrt(stats.t.interval(confidence, len(squared_errors)-1,
                        loc = squared_errors.mean(),
                        scale = stats.sem(squared_errors)))

### A Full Pipeline with both preparation and prediction

In [None]:
# Chain the preprocessing and a linear regression into a single estimator, so
# raw (unprepared) rows can be passed directly to fit() and predict().
full_pipeline_with_predictor = Pipeline([
    ("preparation", full_pipeline),
    ("linear", LinearRegression())
])

full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)

If you liked this notebook please upvote and share your feedback.
Thanks!