In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the housing CSV into a DataFrame and show its first rows.
Data = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')
Data.head()

In [None]:
# Summary of the columns: dtypes and non-null counts (reveals missing values).
Data.info()

In [None]:
# Descriptive statistics of the numeric columns.
Data.describe()

In [None]:
# Number of rows per `ocean_proximity` value.
Data['ocean_proximity'].value_counts()

In [None]:
# One 50-bin histogram per numeric column, drawn on a single large figure.
Data.hist(bins = 50, figsize = (20,15))
plt.show()

Here, we split the data using a hash of each row's unique identifier. If we generated the train and test sets randomly, every restart would reshuffle the data, and the test set could end up containing rows that were previously used for training. This is just practice code for a train/test split without using sklearn's train_test_split function.

In [None]:
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]

# Use the row number as the identifier and send roughly 20% of the rows to the test set.
Data_with_id = Data.reset_index() # adds an `index` column
train_set, test_set = split_train_test_by_id(Data_with_id, 0.2, "index")

In [None]:
# Confirm that the new `index` column is present.
Data_with_id.info()

Splitting the data into test set and train Set using sklearn train_test_split.

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split; the fixed random_state makes the split repeatable.
# Note: this overwrites the train_set/test_set produced by the hash-based split.
train_set, test_set = train_test_split(Data, test_size=0.2, random_state=42)

We observe that the income is an important attribute. To avoid sampling bias, we divide this attribute into homogeneous groups called strata, and then the right number of samples is drawn from each stratum for the training and the testing data. The strata are stored in the Income Category column.

In [None]:
# Bucket median_income into five labelled intervals and plot how many rows fall in each.
Data['Income Category'] = pd.cut(
    Data["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)
Data['Income Category'].hist()
plt.show()

# StratifiedShuffleSplit
is a sampling method in sklearn used for sampling based on equally distributed strata of one or more important attributes. Here we request only a single split, since we want the strata to be equally distributed in the test set and the train set.
***After the split we use the returned row indices to select the rows of the train set and the test set.***

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split keyed on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so the rows are selected with .iloc;
# label-based .loc is only equivalent while Data keeps the default 0..n-1 index.
for train_index, test_index in split.split(Data, Data["Income Category"]):
    strat_train_set = Data.iloc[train_index]
    strat_test_set = Data.iloc[test_index]

This Shows how the Income Category is properly distributed according to their proportions.

In [None]:
# Fraction of training rows in each income category.
strat_train_set['Income Category'].value_counts()/len(strat_train_set)

Now, there is no need of Income Category and hence we drop the same.

In [None]:
# Remove the helper column from both stratified sets. Reassigning (instead of
# dropping in place) and ignoring a missing column keeps this cell safe to re-run.
strat_train_set = strat_train_set.drop(columns="Income Category", errors="ignore")
strat_test_set = strat_test_set.drop(columns="Income Category", errors="ignore")

# Data Visualization

Visualizing the locations of the Districts. 

In [None]:
# From here on `Data` is a copy of the stratified training set, so exploration
# cannot modify strat_train_set itself.
Data = strat_train_set.copy()
Data.plot(kind="scatter", x="longitude", y="latitude")

Changing alpha from its default to 0.1 reveals the dense areas.

There are some ***high-density areas***, namely the Bay Area and around Los Angeles and San Diego, plus a long line of fairly high density in the Central Valley, in particular around Sacramento and Fresno.

In [None]:
# Same scatter plot with mostly transparent markers, so overlapping points show up darker.
Data.plot(kind="scatter", x="longitude", y="latitude",alpha = 0.1)

The radius of each circle represents the district’s population (option s ), and the color represents the price (option c ). jet is a predefined color map (option cmap ), which ranges from blue (low values) to red (high prices).

In [None]:
# Marker size follows population, marker colour follows median_house_value.
Data.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=Data["population"]/100,
    label="population",
    figsize=(10,7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

# Feature Correlations
Since the dataset is not too large, we can compute the correlation between the features using the standard correlation coefficient (**Pearson's r**).

In [None]:
# Pearson correlations over the numeric columns only: `ocean_proximity` is
# categorical text, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = Data.select_dtypes(include="number").corr()

A coefficient close to +1 indicates a strong positive (directly proportional) linear relationship between the attribute and the target attribute. A coefficient close to -1 indicates a strong negative (inversely proportional) linear relationship, while a coefficient close to zero indicates no linear dependency between the two features.
***The correlation coefficient only captures linear dependencies.***

In [None]:
# Correlation of every attribute with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending = False)

The pandas.plotting.scatter_matrix gives the visualization of Data dependencies.
Plots of some important feature dependencies - 

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots restricted to four selected attributes.
attributes = ["median_house_value", "median_income", "total_rooms","housing_median_age"]
scatter_matrix(Data[attributes], figsize=(12, 8))
plt.show()

In the below plot we can clearly see the higher dependency of median_income to median_house values.
*The Horizontal line clearly shows the capping of values.*

In [None]:
# Close-up of median_income against median_house_value.
Data.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

# Feature Extraction

In [None]:
# Ratio features built from the raw per-district totals.
Data['rooms_per_house'] = Data['total_rooms']/Data['households']
Data['bedrooms_per_room'] = Data['total_bedrooms']/Data['total_rooms']
Data['population_per_household'] = Data['population']/Data['households']

# Recompute the correlations including the new ratio features. Only numeric
# columns are used: `ocean_proximity` is categorical text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = Data.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

Reverting to a clean training set (by copying strat_train_set once again).
Separating the predictors and the labels(drop() creates a copy of the data and does not affect strat_train_set )

In [None]:
# Predictors: the training set without the target column.
Data = strat_train_set.drop("median_house_value", axis=1)
# Labels: an independent copy of the target column.
Data_labels = strat_train_set["median_house_value"].copy()

* '''*Used to drop the rows with null values*''' - 
housing.dropna(subset=["total_bedrooms"])
* '''*Used to drop the entire column, also used earlier to drop the labels from the training data*''' - 
housing.drop("total_bedrooms", axis=1)
* '''*Used to compute the median of all the values which are not null*''' - 
median = housing["total_bedrooms"].median()
* '''*Further the missing values are filled with the median value of the attribute*''' - 
housing["total_bedrooms"].fillna(median, inplace=True)
* If we are using the median to replace the na values then, we will save the median value to replace the na values in the test set.

**Handling Missing Values**

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
# The median strategy needs numeric input, so the categorical column is left out.
data_num = Data.drop(columns='ocean_proximity')
imputer.fit(data_num)
# SimpleImputer keeps the learned medians in its statistics_ attribute
# (imputer.statistics_); they are shown here computed directly with pandas.
data_num.median().values

Passing the numerical data to the imputer so that it fills the missing values with the learned median that it has computed.

In [None]:
# transform() returns the data with missing values replaced by the learned
# medians; fit() only returns the imputer object itself, which cannot be
# turned into the imputed DataFrame.
X = imputer.transform(data_num)
# Wrap the resulting array back into a DataFrame with the original labels.
Data_tr = pd.DataFrame(X, columns = data_num.columns, index = data_num.index)

In sklearn, any object that can estimate some parameters based on a dataset is called an estimator (e.g. the imputer). The estimation is performed by the fit method. Any parameter other than the training and test dataset (such as the imputer strategy) needed to guide the estimation process is called a hyperparameter, and it must be set as an instance parameter.

In [None]:
# Double brackets keep the result a one-column DataFrame rather than a Series.
data_cat = Data[['ocean_proximity']]
data_cat.head(10)

**Converting Categorical Data to Numeric Data**

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Learn the categories and encode the column in a single step.
encoder = OneHotEncoder()
encoded_categories = encoder.fit_transform(data_cat)
encoded_categories

In [None]:
# Categories learned by the encoder during fitting.
encoder.categories_

Here there is only a single categorical attribute, but when dealing with multiple categorical values we will get a large number of training features which might slow down the training. If this happens we can replace the categories with useful numerical data. Example, we can replace the ocean_proximity feature with the distance to the ocean. Alternatively, we can replace each category with a learnable, low-dimensional vector called an embedding. This is an example of representation learning.

In [None]:
# Convert the encoded result to a regular dense array for display.
encoded_categories.toarray()

**custom transformer to form the combined attributes(cell 21).**
* TransformerMixin - provides the fit_transform method, provided we have to create the fit method and the transform method. 
* BaseEstimator - used as a base class (avoids \*args and \*\*kargs in the constructor), we will also get two extra methods ( get_params() and set_params() ) that will be useful for automatic hyperparameter tuning.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of the raw attributes inside the array passed to transform().
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features (per-household and per-room) to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        """Store the flag; it is exposed as a parameter so its usefulness can be evaluated later."""
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

# Try the transformer on the raw values without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(Data.values)

# creating a pipeline for all data transformation steps

In [None]:
from sklearn.pipeline import Pipeline
# StandardScaler - feature scaling
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: impute medians, add the ratio features, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

When the pipeline’s fit() method is called, it calls fit_transform() sequentially on all transformers, passing the output of each call as the parameter to the next call until it reaches the final estimator, for which it calls the fit() method. The pipeline exposes the same methods as the final estimator.

In [None]:
# Fit the numeric pipeline on the numeric columns and transform them in one call.
housing_num_tr = num_pipeline.fit_transform(data_num)

The column transformer transforms both the numerical and categorical column at the same time.

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(data_num)  # names of the numeric columns
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline, the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(Data)

Now, we are finished with the exploration and preprocessing of the data. Finally, it's time to select a model and train it.
# **Linear Regressor Model**

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the prepared training features.
lr_model = LinearRegression()
lr_model.fit(housing_prepared,Data_labels)

Now we predict some random data from the Dataset.

In [None]:
# Take the first five training rows (not a random sample) with their labels.
some_data = Data.iloc[:5]
some_labels = Data_labels.iloc[:5]
# transform(), not fit_transform(): reuse the preprocessing already fitted above.
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lr_model.predict(some_data_prepared))
print("Labels:", list(some_labels))

In the below cell we are calculating the error of our model.

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the same data it was trained on.
housing_predictions = lr_model.predict(housing_prepared)
lr_mse = mean_squared_error(Data_labels, housing_predictions)
lr_rmse = np.sqrt(lr_mse)
lr_rmse

We will not touch the test set until we are ready to launch a confident model, so we need to use part of the training set for training and part of it for model validation.<br/>
We use sklearn's k-fold cross validation to divide the training data into 10 folds, training on 9 folds while evaluating on the 10th. We compute the square root of the negated scores because sklearn's cross-validation scoring expects a utility function (greater is better), while MSE is a cost function (lower is better), so the scorer returns the negative MSE.

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the linear model; the scorer returns negative MSE.
lin_scores = cross_val_score(lr_model, housing_prepared, Data_labels,scoring="neg_mean_squared_error", cv=10)
# Flip the sign and take the square root to obtain one RMSE per fold.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Since the Linear Regression model is underfitting, as we can see, we can look for more powerful models such as the Decision Tree Regressor. An error of 69052.46 is not good at all when most of the data labels lie roughly between 120000 and 265000.
# **Decision Tree Regressor**

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed random_state keeps the fitted tree, and the scores derived from it,
# identical on every run, consistent with the seeded splits used earlier.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(housing_prepared, Data_labels)

# RMSE on the same data the tree was trained on.
housing_predictions = dt_model.predict(housing_prepared)
tree_mse = mean_squared_error(Data_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

Since in the above step, we can see that the model has 0 loss. Here, the model is badly overfitting the Data. For the assurance that the model is overfitting we can simply evaluate it using cross validation like we did Earlier.

In [None]:
# 10-fold cross-validation of the decision tree; negative MSE converted to RMSE per fold.
dt_scores = cross_val_score(dt_model, housing_prepared, Data_labels,scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-dt_scores)
display_scores(tree_rmse_scores)

Now, the score is even worse than the Linear Regression Score. Let's try random forest regressor.
# **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forests are randomised; a fixed random_state makes the reported
# errors repeatable across runs, consistent with the seeded splits used earlier.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(housing_prepared, Data_labels)

# RMSE on the same data the forest was trained on.
housing_predictions = rf_model.predict(housing_prepared)
forest_mse = mean_squared_error(Data_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validation of the random forest; negative MSE converted to RMSE per fold.
rf_scores = cross_val_score(rf_model, housing_prepared, Data_labels,scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-rf_scores)
display_scores(forest_rmse_scores)

The Random Forest regressor gives the lowest MSE among the three models that we have used.<br/>
*We can further use many other models. But for time being we will use these 3 models.*

# Fine-Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids: 3 x 4 = 12 combinations with the default bootstrap setting, plus
# 2 x 3 = 6 combinations with bootstrap disabled (18 candidates, 5 folds each).
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# A fixed random_state makes the search deterministic, so differences between
# candidates come from the hyperparameters rather than from random variation.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, Data_labels)

When we have no idea what value a hyperparameter should have,a simple approach is to try out consecutive powers of 10 (or a smaller number if you want a more fine-grained search, as shown above with the n_estimators hyperparameter).

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

We can also print all the evaluation scores by using the cv_results_ instance variable.

In [None]:
# Print the RMSE (square root of the negated mean test score) for every candidate.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

Now we have fine tuned our best MSE giving model, and set the max_features hyperparameter to 6 and the n_estimators hyperparameter to 30. The results are however only Slightly better than the previous result.<br/>
*Further we will analyze our best model with the  best hyperparameters.*
# Analyzing the best Model

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
# Names for the three ratio columns appended by CombinedAttributesAdder, in the
# order transform() appends them.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
# Order follows the pipeline definition: numeric columns, added ratios, then one-hot categories.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Pair each importance with its name and list the most important first.
sorted(zip(feature_importances, attributes), reverse=True)

Now it's time to evaluate our Model.
# Model evaluation

In [None]:
final_model = grid_search.best_estimator_
# Separate predictors and labels of the held-out stratified test set.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform(), not fit_transform(): the preprocessing fitted on the training data is reused.
X_test_prepared = full_pipeline.transform(X_test)

# RMSE of the tuned model on the test set.
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# Test-set RMSE of the final model.
final_rmse