# Making Necessary Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Disable pandas' chained-assignment check so SettingWithCopyWarning is not shown.
# NOTE(review): this also hides genuine accidental writes to a copy of a frame.
pd.options.mode.chained_assignment = None  # default='warn'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Data

In [None]:
# Load the house sales CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

This dataset contains the following features:

- id: house id
- date: date that the house was bought.
- price: house price
- bedrooms: number of bedroom
- bathrooms: number of bathroom
- sqft_living: Square foot Living
- sqft_lot: Square foot Lot
- floors: Number of floor
- waterfront: waterfront
- view: Number of view
- condition: condition
- grade: grade
- sqft_above: Square foot above
- sqft_basement: Square foot basement
- yr_built: year that house was built
- yr_renovated: year that house was renovated
- zipcode: zipcode
- lat: latitude
- long: longitude
- sqft_living15: Square foot Living in 2015
- sqft_lot15: Square foot Lot in 2015


In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

# Preparing the Test set

In [None]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns (such as the string-valued `date`) are excluded explicitly:
# pandas >= 2.0 raises on them instead of silently dropping them.
df.select_dtypes(include='number').corr()['price'].sort_values(ascending=False)

In [None]:
# Histogram of sqft_living with 100 bins to inspect its distribution.
df['sqft_living'].hist(bins=100)

In [None]:
# Kernel density estimate of the same column.
sns.kdeplot(df['sqft_living'])

In [None]:
# We do have a long tail and outliers;
# let's also confirm by looking at a boxplot.
# The series is passed by keyword: a bare positional argument is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onwards.
sns.boxplot(x=df['sqft_living'])

One way to tackle the problem is to eliminate the outliers before training the model; alternatively, we can use stratified sampling.
Let's first check the most common range of the most highly correlated feature, sqft_living.

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().transpose()

It can be seen that most of the sqft_living values lie roughly between 1,500 and 2,500 (i.e. 1.5–2.5 thousand square feet).

In [None]:
# Bucket sqft_living into six labelled ranges; the resulting column is the
# stratification key for the train/test split.
df['sqft_living_cat'] = pd.cut(
    df['sqft_living'],
    bins=[0., 1.5e3, 2.5e3, 3.5e3, 4.5e3, 5.5e3, np.inf],
    labels=[1, 2, 3, 4, 5, 6],
)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit 

In [None]:
# One stratified shuffle split holding out 20% of the rows; random_state fixes the shuffle.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=101)

In [None]:
# StratifiedShuffleSplit yields *positional* row indices, so select rows with
# .iloc (label-based .loc only coincides while df keeps its default RangeIndex).
# .copy() makes each split an independent frame, so later in-place edits cannot
# touch df. The splitter is configured for a single split, hence the one pass.
for train_index, test_index in split.split(df, df['sqft_living_cat']):
    strat_train_set = df.iloc[train_index].copy()
    strat_test_set = df.iloc[test_index].copy()

In [None]:
# Share of each sqft_living category in the full dataset (reference proportions
# to compare the stratified split against).
df['sqft_living_cat'].value_counts()/len(df)

In [None]:
# Share of each category inside the training split. Normalise by the size of the
# training split itself: dividing by len(df) would understate every proportion.
strat_train_set['sqft_living_cat'].value_counts()/len(strat_train_set)

In [None]:
# The helper column was only needed for stratification; remove it from both splits.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
for set_ in strat_test_set,strat_train_set:
    set_.drop('sqft_living_cat',axis=1,inplace=True,errors='ignore')

# Exploring the data

In [None]:
# Exploration works on its own copy of the training split; the model inputs are
# the same split with the target removed, and the target is kept separately.
housing = strat_train_set.copy()
housing_label = strat_train_set['price'].copy()
housing_tr = strat_train_set.drop(columns='price')

In [None]:
# Held-out features and target, set aside for the final evaluation.
y_test = strat_test_set['price'].copy()
X_test = strat_test_set.drop(columns='price')

In [None]:
# Preview the first rows of the exploration copy.
housing.head()

In [None]:
# Scatter longitude against latitude, coloured by price, to see how prices are
# distributed geographically.
ax = housing.plot.scatter(
    x='long',
    y='lat',
    c="price",
    cmap=plt.get_cmap("plasma"),
    figsize=(12, 6),
)

Houses near the water tend to be more expensive.

In [None]:
# Let's explore the bedrooms and bathrooms features.
# A vertical box needs the variable on the y axis: with only `x` given, seaborn
# ignores orient='v' (with a warning) and draws the box horizontally.
sns.boxplot(y='bedrooms',data=housing)

In [None]:
# Vertical boxplot of bathrooms. The variable goes on the y axis because
# orient='v' is ignored by seaborn when only `x` is supplied.
sns.boxplot(y='bathrooms',data=housing)

There indeed are many outliers in the data.
For now we are keeping them but if needed, we can eliminate them in future.

# Categorical Features

Let's explore the columns of waterfront, view, yr_renovated

In [None]:
# First check waterfront: how often each distinct value occurs.
housing['waterfront'].value_counts()
# The author observed only 2 distinct values, so the column can simply be
# converted into a yes/no answer.

In [None]:
# Next, check how often each value of view occurs.
housing['view'].value_counts()

In [None]:
housing['view'].plot(kind="hist")
# The view values are very thinly distributed,
# so let's treat this column as yes/no as well.

In [None]:
# Explore yr_renovated by collapsing it to a binary flag.
def renov_check(x):
    """Return 0 when ``x`` equals 0, otherwise 1.

    The notebook treats a ``yr_renovated`` of 0 as "not renovated".
    """
    return 0 if x == 0 else 1
    
# Apply the flag to every row: 0 where yr_renovated is 0, 1 otherwise.
renovation_check=housing.yr_renovated.apply(renov_check)

In [None]:
# Histogram of the 0/1 renovation flag (trailing ';' hides the return value).
plt.hist(renovation_check);
# As guessed, too few houses have been renovated.

In [None]:
#let's create a custom tranfrormer that handles these columns
from sklearn.base import BaseEstimator,TransformerMixin
class yes_no (BaseEstimator,TransformerMixin):
    """Recode ``waterfront``, ``view`` and ``yr_renovated`` as "Yes"/"No" strings.

    A value equal to 0 becomes "No"; any other value becomes "Yes".
    The transformer is stateless, so ``fit`` learns nothing.
    """
    def __init__ (self):
        pass
    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    def binary_creator(self,x):
        """Map a single value to "No" (equal to 0) or "Yes" (anything else)."""
        if x == 0:
            return "No"
        else:
            return "Yes"
    def transform(self,X,y=None):
        """Return the recoded data as a NumPy array without modifying ``X``.

        ``assign`` builds a new DataFrame, so the caller's frame keeps its
        original numeric columns. Replacing whole columns also avoids the
        deprecated in-place write of strings into integer columns that
        ``X.loc[:, col] = ...`` performs.
        """
        recoded = {
            column: X[column].apply(self.binary_creator)
            for column in ("waterfront", "view", "yr_renovated")
        }
        return X.assign(**recoded).values

In [None]:
yesno=yes_no()
# Try the transformer on a copy so the exploratory `housing` frame keeps its
# original numeric columns for the cells below, and preview only the first rows.
pd.DataFrame(yesno.fit_transform(housing.copy())).head()

In [None]:
#let's handle a date column
from sklearn.base import BaseEstimator, TransformerMixin

class date_extractor(BaseEstimator, TransformerMixin):
    """Replace the ``date`` column with ``Year``, ``Month`` and ``Day`` columns.

    The parts are sliced from ``str(value)`` at fixed positions, so this assumes
    the date text starts with ``YYYYMMDD`` (TODO confirm against the raw data).
    The extracted parts are strings.
    """

    def __init__(self):
        pass
    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    #Helper function to extract year from column 'dates'
    def get_year( self, obj ):
        return str(obj)[:4]

    #Helper function to extract month from column 'dates'
    def get_month( self, obj ):
        return str(obj)[4:6]

    #Helper function to extract day from column 'dates'
    def get_day(self, obj):
        return str(obj)[6:8]
    def transform(self,X,y=None):
        """Return the data as a NumPy array without modifying ``X``.

        ``drop`` and ``assign`` each return a new DataFrame, so the caller's
        frame is not given extra columns as a side effect. The new columns are
        appended after the remaining original ones.
        """
        dates = X['date']
        expanded = X.drop('date',axis=1).assign(
            Year=dates.apply(self.get_year),
            Month=dates.apply(self.get_month),
            Day=dates.apply(self.get_day),
        )
        return expanded.values

In [None]:
date=date_extractor()
# Try the transformer on a copy so `housing` is not altered by the experiment,
# and preview only the first rows.
pd.DataFrame(date.fit_transform(housing.copy())).head()

So far we have handled the 4 categorical columns (date, waterfront, view, yr_renovated).
Let's create a custom transformer that selects the columns we want to keep.

In [None]:
class FeatureSelector( BaseEstimator, TransformerMixin ):
    """Select a fixed list of columns from a DataFrame.

    Parameters
    ----------
    feature_names : list
        Column labels to keep, in the order they should appear.
    """
    #Class Constructor
    def __init__( self, feature_names ):
        # Stored under the same name as the constructor argument: scikit-learn's
        # get_params()/clone()/repr look the value up by that exact name.
        self.feature_names = feature_names

    #Return self nothing else to do here
    def fit( self, X, y = None ):
        return self

    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        """Return ``X`` restricted to the configured columns."""
        return X[ self.feature_names ]

So far we have successfully handled the categorical data; let's wrap all of these tasks into a single clean transformer.

In [None]:
class categorical_transformer(BaseEstimator,TransformerMixin):
    """Prepare the categorical columns in a single step.

    * ``date`` is replaced by string ``Year``/``Month``/``Day`` columns sliced
      from ``str(value)`` at fixed positions (assumes the text starts with
      ``YYYYMMDD`` — TODO confirm against the raw data).
    * ``waterfront``, ``view`` and ``yr_renovated`` become "No" when equal to 0
      and "Yes" otherwise.
    """
    def __init__(self):
        # Previously spelled `init`, which Python never calls as a constructor.
        pass
    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    #Helper function to extract year from column 'dates'
    def get_year( self, obj ):
        return str(obj)[:4]

    #Helper function to extract month from column 'dates'
    def get_month( self, obj ):
        return str(obj)[4:6]

    #Helper function to extract day from column 'dates'
    def get_day(self, obj):
        return str(obj)[6:8]
    def binary_creator(self,x):
        """Map a single value to "No" (equal to 0) or "Yes" (anything else)."""
        if x == 0:
            return "No"
        else:
            return "Yes"
    def transform(self,X,y=None):
        """Return the prepared data as a NumPy array without modifying ``X``.

        Column order of the result: the original columns minus ``date``,
        followed by ``Year``, ``Month``, ``Day``.
        """
        dates = X['date']
        # drop/assign each return a new frame, so the caller's data is untouched.
        prepared = X.drop('date',axis=1).assign(
            Year=dates.apply(self.get_year),
            Month=dates.apply(self.get_month),
            Day=dates.apply(self.get_day),
        )
        # Whole-column replacement keeps each column's position and avoids
        # writing strings in place into integer columns.
        for column in ("waterfront", "view", "yr_renovated"):
            prepared[column] = prepared[column].apply(self.binary_creator)
        return prepared.values

Now wrap all of the important steps into a categorical pipeline.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Columns treated as categorical, and the pipeline that selects them, recodes
# them, and one-hot encodes the result into a dense array.
cat_features=['date','waterfront','view','yr_renovated']
cat_pipeline=Pipeline(steps=[('cat_feature_selector',FeatureSelector(cat_features)),
                             ('cat_transformer',categorical_transformer()),
                             # handle_unknown='ignore': a category that never occurred in the
                             # training data is encoded as all zeros instead of making
                             # transform() raise on new data.
                             # NOTE(review): `sparse` was renamed `sparse_output` in
                             # scikit-learn 1.2 and removed in 1.4 - switch the keyword when
                             # running on a newer release.
                             ('one_hot_encoder',OneHotEncoder(sparse=False,handle_unknown='ignore'))
])

In [None]:
# Fit the categorical pipeline on the exploration frame and show the encoded array.
cat_pipeline.fit_transform(housing)

# Numerical Features

So far we have handled the categorical data; now let's handle the numerical data.

In [None]:
# Re-check the columns and dtypes of the exploration frame.
housing.info()

In [None]:
# Columns that the numerical pipeline selects and processes.
num_features=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                'condition', 'grade','sqft_above','sqft_basement', 'yr_built','lat','long']

In [None]:
# Try a combined attribute; whether the pipeline uses it is gated by a
# hyperparameter of the numerical transformer.
# The column is named bath-per-bed, so divide bathrooms by bedrooms (the same
# ratio the numerical transformer computes). A bedroom count of 0 would yield
# +/-inf, which is mapped to NaN so it cannot distort the correlations.
housing['bath_per_bed']=(housing['bathrooms']/housing['bedrooms']).replace([np.inf,-np.inf],np.nan)

In [None]:
# Correlation with price over the numeric columns only: non-numeric columns make
# DataFrame.corr() raise on pandas >= 2.0.
housing.select_dtypes(include='number').corr()['price'].sort_values(ascending=False)
# The author's reading: the new attribute does not look strongly correlated,
# but it is kept for now.

In [None]:
# Let's also check whether more recently built houses correlate with the price.
# Age is the reference year minus the build year (the operands were reversed,
# which made every "age" negative). 2020 is the notebook's reference year.
housing['years_old']=2020-housing['yr_built']

In [None]:
# Correlation with price over the numeric columns only: non-numeric columns make
# DataFrame.corr() raise on pandas >= 2.0.
housing.select_dtypes(include='number').corr()['price'].sort_values(ascending=False)
# The author's reading: not really correlated, so we keep it simple and drop
# this combined attribute.

In [None]:
class numerical_transformer(BaseEstimator,TransformerMixin):
    """Optionally engineer a bathrooms-per-bedroom ratio and clean infinities.

    Parameters
    ----------
    bath_per_bed : bool, default=True
        When true, add ``bath_per_bed = bathrooms / bedrooms`` and drop the
        original ``bathrooms`` column.
    """
    def __init__(self,bath_per_bed=True):
        # Stored under the constructor argument's own name so that scikit-learn's
        # get_params()/clone() (and therefore hyperparameter search) can find it.
        self.bath_per_bed=bath_per_bed
    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    def transform(self,X,y=None):
        """Return the transformed data as a NumPy array without modifying ``X``."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        if self.bath_per_bed:
            X['bath_per_bed']=X['bathrooms']/X['bedrooms']
            # drop() returns a new frame; the result must be kept, otherwise
            # `bathrooms` silently stays in the output.
            X = X.drop('bathrooms',axis=1)
        #Converting any infinity values in the dataset to Nan
        X = X.replace( [ np.inf, -np.inf ], np.nan )
        return X.values

In [None]:
data1=numerical_transformer()
# Try the transformer on the numeric feature columns only. Selecting with a list
# of labels yields a new frame, so `housing` is not modified, and non-numeric
# columns are kept out of the numeric output.
data1.fit_transform(housing[num_features])

Now let's wrap our numerical features into numerical pipeline.

In [None]:
#as a good practice, let's also include imputer to handle with any missing value
from sklearn.impute import SimpleImputer
#also import standard scalar
from sklearn.preprocessing import StandardScaler

In [None]:
# Numerical pipeline: select the numeric columns, engineer/clean them, fill
# missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('num_feature_selector', FeatureSelector(num_features)),
        ('num_transformer', numerical_transformer()),
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scalar', StandardScaler()),
    ]
)

In [None]:
# Fit the numerical pipeline on the exploration frame and show the scaled array.
num_pipeline.fit_transform(housing)

# Final Pipeline

In [None]:
#let's combine both numeric and categorical pipeline into one clean pipeline
from sklearn.pipeline import FeatureUnion

In [None]:
# Run both pipelines on the same input and concatenate their outputs column-wise
# (categorical block first, numerical block second).
full_pipeline = FeatureUnion(
    transformer_list=[
        ('categorical_pipeline', cat_pipeline),
        ('numerical_pipeline', num_pipeline),
    ]
)

In [None]:
# Trial run of the combined pipeline on the exploration frame.
full_pipeline.fit_transform(housing)

# Training the model

Let's try simple plain linear regression first and check the results.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline model: ordinary linear regression with default settings.
lin_reg=LinearRegression()

In [None]:
# Fit the full preprocessing pipeline on the training features (price excluded)
# and keep the transformed matrix used by every model below.
housing_prepared=full_pipeline.fit_transform(housing_tr)

In [None]:
# Train the baseline on the prepared training data.
lin_reg.fit(housing_prepared,housing_label)

In [None]:
#let's check how it performed
from sklearn.metrics import mean_squared_error

In [None]:
# Predict on the same data the model was trained on (training-set predictions).
y_pred=lin_reg.predict(housing_prepared)

In [None]:
# Root mean squared error of the training-set predictions.
np.sqrt(mean_squared_error(housing_label,y_pred))

The error is too large. Let's also evaluate the model using cross-validation.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation; the scorer returns negated MSE (higher is better).
scores=cross_val_score(lin_reg,housing_prepared,housing_label,scoring='neg_mean_squared_error',cv=10)

In [None]:
def display_score(scores):
    """Print per-fold RMSE, its mean and its standard deviation.

    ``scores`` holds negated mean squared errors (as produced by the
    'neg_mean_squared_error' scorer); they are negated back and square-rooted
    before being reported.
    """
    rmse = np.sqrt(-scores)
    blank = '\n'
    print("Scores:", rmse)
    print(blank)
    print("The average score is", np.mean(rmse))
    print(blank)
    print("Standard deviation:", rmse.std())

In [None]:
# Report the cross-validated RMSE of the linear regression.
display_score(scores)

In [None]:
#let's try a slightly advance algorithm like descision tree
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Seeded so the tree, and the comparison drawn from its scores, is reproducible
# across runs (same seed as the train/test split).
tree_reg=DecisionTreeRegressor(random_state=101)

In [None]:
# Train the decision tree on the prepared training data.
tree_reg.fit(housing_prepared,housing_label)

In [None]:
# Training-set predictions of the decision tree (overwrites the earlier y_pred).
y_pred=tree_reg.predict(housing_prepared)

In [None]:
# Root mean squared error of the tree's training-set predictions.
np.sqrt(mean_squared_error(housing_label,y_pred))

This shows the decision tree has badly overfit. Let's also check cross_val_score

In [None]:
# 10-fold cross-validation of the decision tree (negated MSE per fold).
scores=cross_val_score(tree_reg,housing_prepared,housing_label,cv=10,scoring='neg_mean_squared_error')

In [None]:
# Report the cross-validated RMSE of the decision tree.
display_score(scores)

It performed slightly better than the linear regression.

Let's check using random forest.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seeded so the forest's cross-validation scores are reproducible across runs
# (same seed as the train/test split).
forest_reg=RandomForestRegressor(n_estimators=100,random_state=101)

In [None]:
# 10-fold cross-validation of the random forest (negated MSE per fold).
scores=cross_val_score(forest_reg,housing_prepared,housing_label,cv=10,scoring='neg_mean_squared_error')

In [None]:
# Report the cross-validated RMSE of the random forest.
display_score(scores)

It performed significantly better. Let's fine-tune the model further to try to achieve an even better score.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two sub-grids: the first varies forest size and features per split with the
# default bootstrap setting; the second tries a few small forests without bootstrapping.
param_grid = [
{'n_estimators': [3, 10, 30,50,100], 'max_features': [2, 4, 6, 8,12]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the estimator so that differences between candidates come from the
# hyperparameters rather than from random variation between fits.
forest_reg=RandomForestRegressor(random_state=101)
# 5-fold search scored by negated MSE; training scores are recorded as well.
grid_search=GridSearchCV(forest_reg,param_grid=param_grid,scoring='neg_mean_squared_error',return_train_score=True,cv=5)

In [None]:
# Run the search over every candidate in param_grid on the prepared training data.
grid_search.fit(housing_prepared,housing_label)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
import joblib 
# Persist the fitted search object to the working directory.
joblib.dump(grid_search,'grid_search.pkl')

In [None]:
# Show the selected hyperparameters again (same value as displayed above).
grid_search.best_params_

In [None]:
# Print the cross-validated RMSE of every hyperparameter combination tried.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)
    print(rmse, candidate)

In [None]:
# Train a plain 100-tree forest as well, so it can be compared on the test set
# with the model selected by the grid search. Seeded so that the comparison is
# reproducible across runs.
forest_reg=RandomForestRegressor(n_estimators=100,random_state=101)
forest_reg.fit(housing_prepared,housing_label)

# Evaluation on Test set

In [None]:
# Apply the already-fitted pipeline to the test features (transform only, no refit).
test_prepared=full_pipeline.transform(X_test)

In [None]:
# Test-set predictions of the plain 100-tree forest.
y_pred_forest=forest_reg.predict(test_prepared)

In [None]:
# Test-set RMSE of the plain forest.
np.sqrt(mean_squared_error(y_test,y_pred_forest))

In [None]:
# Test-set predictions made through the fitted grid-search object.
y_pred_gscv=grid_search.predict(test_prepared)

In [None]:
# Test-set RMSE of the grid-search model.
np.sqrt(mean_squared_error(y_test,y_pred_gscv))

We are going to keep the normal model as it performed better on the test data.