In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**This is a reference notebook documenting how to build a regression model with scikit-learn.**

In [None]:
#import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from pandas.plotting import scatter_matrix

In [None]:
# Load the housing CSV from the Kaggle input directory into a DataFrame.
housing = pd.read_csv("/kaggle/input/california-housing-prices/housing.csv")

# **Visualize the dataset**

In [None]:
# Get the column dtypes / non-null counts and the summary statistics.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
housing.info()
print(housing.describe())

In [None]:
# Count how many rows fall into each distinct ocean_proximity value.
print(housing["ocean_proximity"].value_counts())

In [None]:
# Show a 50-bin histogram of every numeric column.
housing.hist(bins=50,figsize=(12,8))
plt.show()

In [None]:
# Look for correlations. select_dtypes(include='number') keeps only the numeric
# columns before corr() is computed.
corr_matrix = housing.select_dtypes(include='number').corr()
# Correlation of every numeric column with the target, strongest positive first.
print(corr_matrix["median_house_value"].sort_values(ascending=False))

In [None]:
# Create a scatter matrix (pairwise scatter plots) for the target and three
# selected attributes.
attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))
plt.show()

In [None]:
# Summarise median_house_value per ocean_proximity category: mean, median and
# row count for each group.
df1 = housing.groupby("ocean_proximity")[["median_house_value"]].agg(["mean","median","count"])
print(df1)

In [None]:
# Geographic scatter plot: each point is placed by longitude/latitude, its
# marker size is the row's population / 100, and its colour encodes
# median_house_value on the "jet" colormap (with a colorbar).
# The legend label was misspelled ("ppoplulation"); it now reads "population".
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap="jet", colorbar=True,
             legend=True, sharex=False, figsize=(10,7))
plt.show()

# Create stratified train and test sets based on median_income
**Then drop the income_cat column**

In [None]:
# Create a new categorical column by binning median_income with pd.cut.
# The bin edges 0, 1.5, 3.0, 4.5, 6 and +inf produce five income categories
# labelled 1 to 5; this column is used below to stratify the train/test split.
housing["income_cat"]=pd.cut(housing["median_income"], bins=[0.,1.5, 3.0, 4.5, 6., np.inf], labels=[1,2,3,4,5])

In [None]:
# Stratified train/test split: 20% of the rows go to the test set, with the
# income_cat proportions preserved in both parts (fixed seed for repeatability).
train_set, test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# income_cat was only needed to drive the stratification, so remove it again.
# The frames are reassigned instead of dropped with inplace=True: in-place
# mutation of frames produced by the split can raise SettingWithCopyWarning on
# some pandas versions, and reassignment keeps each step explicit.
train_set = train_set.drop(columns="income_cat")
test_set = test_set.drop(columns="income_cat")
housing = housing.drop(columns="income_cat")

In [None]:
# Report the (rows, columns) shape of the raw frame and of both splits.
for template, frame in (
    ("Shape of raw dataset is {}", housing),
    ("Shape of train_set dataset is {}", train_set),
    ("Shape of test_set dataset is {}", test_set),
):
    print(template.format(frame.shape))

# **Prepare data for model**

In [None]:
# Separate each split into model inputs (every column except the target) and
# labels (an independent copy of the target column, median_house_value).
train_labels = train_set["median_house_value"].copy()
train_inputs = train_set.drop(columns="median_house_value")

test_labels = test_set["median_house_value"].copy()
test_inputs = test_set.drop(columns="median_house_value")

# Below is the simple pipeline, which preprocesses both numeric and categorical columns
# For custom, more complex preprocessing, see the next section

In [None]:
#Column transformer to preprocess both numeric and categorical data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Columns handled by each branch of the simple preprocessor.
num_attribs = ["longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income"]
cat_attribs = ["ocean_proximity"]

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; handle_unknown="ignore" stops unseen categories from raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Route each group of columns through its own pipeline.
preprocessing_simple = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])


#alternatively preprocessing can automatically be created by just referring to numeric and categorical columns
# (Kept as real comments: a bare triple-quoted string at the end of a cell is an
# expression, so the notebook would echo it back as the cell's output.)
#
# from sklearn.compose import make_column_selector, make_column_transformer
#
# preprocessing = make_column_transformer(
#     (num_pipeline, make_column_selector(dtype_include=np.number)),
#     (cat_pipeline, make_column_selector(dtype_include=object)),
# )


# A more complex pre-processing with ratio of columns and cluster similarity

In [None]:
#Write custom class to detect Cluster Similarity
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities with KMeans centres.

    ``fit`` runs KMeans on the input; ``transform`` returns, for every sample,
    its ``rbf_kernel`` similarity to each fitted cluster centre (one output
    column per cluster).

    Parameters
    ----------
    n_clusters : number of clusters passed to KMeans.
    n_init : ``n_init`` value passed to KMeans.
    gamma : ``gamma`` value passed to ``rbf_kernel``.
    random_state : ``random_state`` value passed to KMeans.
    """

    def __init__(self, n_clusters=10, n_init=10,  gamma=1.0, random_state=None):
        # Hyper-parameters are stored unchanged under their own names.
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` (optionally weighted) and return ``self``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to every centre."""
        centres = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centres, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster, in cluster order."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names
    
    


In [None]:
#custom functions for ratio pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    Both columns are selected with a list index, so the result keeps two
    dimensions (one output column).
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are ignored; the result is always the single name "ratio".
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a fresh pipeline: median imputation, column ratio, standard scaling.

    A new pipeline object is created on every call.
    """
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Log pipeline: median imputation, natural log, then standard scaling.
# feature_names_out="one-to-one" keeps the input column names on the output.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Cluster-similarity transformer (seeded so the clustering is repeatable).
cluster_simil = ClusterSimilarity(
    n_clusters=10,
    n_init=10,
    gamma=1.,
    random_state=42,
)

# Default numeric treatment, used below as the ColumnTransformer remainder.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical treatment: most-frequent imputation followed by one-hot encoding.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Full preprocessor. Each entry is (name, transformer, columns); any column not
# listed in an entry is passed through default_num_pipeline via `remainder`.
complex_transformers = [
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
]
preprocessing_complex = ColumnTransformer(
    complex_transformers,
    remainder=default_num_pipeline,
)


# Either of the two preprocessing sections above gives you a pipeline to pre-process the data
# Let's transform the data now

In [None]:
# Prepare the training data: fit the complex preprocessor on the training
# inputs and transform them in one step.
train_inputs_prepared = preprocessing_complex.fit_transform(train_inputs)

# Show the shape of the transformed data, then the generated feature names
# (the bare last expression is displayed as the cell output).
print(train_inputs_prepared.shape)
preprocessing_complex.get_feature_names_out()

# Now it's time to select and train a Random Forest model

In [None]:
#train random forest regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# Chain the complex preprocessor with a seeded random forest so the raw
# training inputs can be passed straight to fit().
forest_reg = make_pipeline(preprocessing_complex, RandomForestRegressor(random_state=42))
forest_reg.fit(train_inputs, train_labels)

In [None]:
#use cross validation for evaluation
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the forest pipeline. The scorer returns negated
# RMSE values, so the sign is flipped to get positive RMSEs.
# The scores get their own name: assigning them back to `forest_reg` would
# replace the fitted pipeline with a plain array, losing the estimator and
# making this cell fail when it is run a second time.
forest_rmses = -cross_val_score(forest_reg, train_inputs, train_labels,
                                scoring="neg_root_mean_squared_error", cv=3)
pd.Series(forest_rmses).describe()

# Using Grid Search CV to find optimal parameters

In [None]:
#use grid search cv
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Named steps, so hyper-parameters can be addressed as "<step>__<param>".
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing_complex),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

# Two separate grids; every combination inside each grid is evaluated.
coarse_grid = {
    "preprocessing__geo__n_clusters": [5, 8, 10],
    "random_forest__max_features": [4, 6, 8],
}
wide_grid = {
    "preprocessing__geo__n_clusters": [10, 15],
    "random_forest__max_features": [6, 8, 10],
}
param_grid = [coarse_grid, wide_grid]

# 3-fold search scored by negated RMSE (higher is better).
grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=3,
    scoring="neg_root_mean_squared_error",
)
grid_search.fit(train_inputs, train_labels)


In [None]:
# Get the best parameter combination found by the grid search.
# (The printed heading previously misspelled "Parameters".)
print("Printing out Best Parameters")
print(grid_search.best_params_)


# Get the top results: one row per evaluated combination, best first.
# mean_test_score is a negated RMSE, so sorting it in descending order puts the
# lowest-error combinations at the top. The sort is chained instead of done
# with inplace=True, so the frame is built in a single expression.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False)
print("Printing Top Results from Cross Validation")
print(cv_res.head())


# Use Random Search to find Best Parameters

In [None]:
#using random search 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline

# Same two-step pipeline as for the grid search, rebuilt for this search.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing_complex),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

# Integer distributions to sample from (randint's upper bound is exclusive).
param_distribs = {
    "preprocessing__geo__n_clusters": randint(low=3, high=50),
    "random_forest__max_features": randint(low=2, high=20),
}

# 10 sampled combinations, 3-fold CV each, scored by negated RMSE; seeded so
# the same combinations are drawn on every run.
rnd_search = RandomizedSearchCV(
    full_pipeline,
    param_distributions=param_distribs,
    n_iter=10,
    cv=3,
    scoring="neg_root_mean_squared_error",
    random_state=42,
)

rnd_search.fit(train_inputs, train_labels)

In [None]:
# Take the best pipeline found by the random search.
final_model = rnd_search.best_estimator_

# Pair every feature importance from the forest step with the matching feature
# name from the preprocessing step and print them, most important first.
feature_importance = final_model["random_forest"].feature_importances_
feature_names = final_model["preprocessing"].get_feature_names_out()
ranked_features = sorted(zip(feature_importance, feature_names), reverse=True)
print(ranked_features)

# Predict on Test Set from the best model from random search 

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out test inputs with the best model.
final_predictions = final_model.predict(test_inputs)

# RMSE on the test set. mean_squared_error's `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the square root of the
# MSE is taken explicitly; this gives the same value on every version.
final_rmse = np.sqrt(mean_squared_error(test_labels, final_predictions))
print("Final RMSE is %.0f" %(final_rmse))

print("")
print("Final Predictions are")
print(final_predictions)

# Generate 95% Confidence Interval

In [None]:
#get 95% confidence interval
from scipy import stats
confidence = 0.95

# Per-sample squared errors on the test set.
squared_errors = (final_predictions - test_labels) ** 2

# t-interval around the mean squared error, using the standard error of the
# mean as the scale; the square root converts both bounds to RMSE units.
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
confidence_range = np.sqrt(mse_interval)

print("95% Confidence Range is {}".format(confidence_range))