In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
## plotting libraries used for the visualizations in this notebook
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# modelling libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
## load the California housing CSV from the Kaggle input directory
cf_data = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')
## display the first rows as a quick sanity check of the load
cf_data.head()

In [None]:
## dimensions of the raw dataset as a (rows, columns) tuple

cf_data.shape

In [None]:
## per-column dtype and non-null count summary of the raw dataset

cf_data.info()

In [None]:
## summary statistics for every column; include="all" also covers the non-numeric column
cf_data.describe(include="all")

In [None]:
## percentage of missing values in each column, rounded to three decimals

cf_data.isnull().mean().mul(100).round(3)

* Only `total_bedrooms` has missing values.

In [None]:
## relative frequency of each ocean_proximity category
## (normalize=True returns proportions instead of raw counts)

cf_data['ocean_proximity'].value_counts(normalize=True)



In [None]:
## bar chart of how many rows fall into each ocean_proximity category

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=cf_data, x='ocean_proximity', ax=ax)
plt.show()

In [None]:
## keep an untouched copy of the raw frame (still containing ocean_proximity)
## for the visualizations further down; cf_data itself is modified in later cells

cf_data_copy = cf_data.copy()

In [None]:
## dummy-encode ocean_proximity into indicator columns;
## drop_first=True omits the first category, which then acts as the baseline level

stat = pd.get_dummies(cf_data['ocean_proximity'],drop_first=True)

In [None]:
## append the dummy columns and drop the original categorical column
## Built from the untouched copy (cf_data_copy) so the cell can be re-run safely:
## it neither stacks duplicate dummy columns nor fails on an already-dropped column.
## axis=1 is spelled out; the previous axis=True only worked because True == 1.

cf_data = pd.concat([cf_data_copy.drop(columns='ocean_proximity'), stat], axis=1)

* We will use the iterative imputer to fill in the missing values, with a linear regression model that predicts each missing value from the other columns.

In [None]:
import sklearn 
# Importing enable_iterative_imputer is required before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

## impute missing values by regressing each incomplete column on the other columns
## NOTE(review): the imputer is fitted on the full frame, target and future test rows
## included — confirm this leakage is acceptable for the analysis.
iterimp = IterativeImputer(estimator=LinearRegression(),random_state=100)

## fit_transform returns a bare ndarray, so re-attach the column labels and index here
## instead of relying on a later cell to restore the names positionally
cf_data_clean = pd.DataFrame(
    iterimp.fit_transform(cf_data),
    columns=cf_data.columns,
    index=cf_data.index,
)


In [None]:
## give the imputed frame the same column labels, in the same order, as cf_data
cf_data_clean.columns = cf_data.columns.tolist() ## rename our new data frame 

In [None]:
## count the missing values left anywhere in the imputed frame
total = cf_data_clean.isnull().sum().sum()
print("Total Number Of Null Values : ",total)

* Let's now check our column whose null value is imputed. First we will check mean of that column before imputation and after imputation statistically.

In [None]:
## summary statistics of total_bedrooms before imputation (missing values are excluded)

cf_data['total_bedrooms'].describe()

In [None]:
## summary statistics of total_bedrooms after imputation, for comparison with the cell above

cf_data_clean['total_bedrooms'].describe()

* There is only a very minor difference between the mean of `total_bedrooms` before and after imputation.

* To check this more rigorously, we will run a hypothesis test:

  * Null hypothesis: mean before imputation = mean after imputation
  * Alternative hypothesis: mean before imputation != mean after imputation

We will use SciPy to run the test.

In [None]:
import scipy 
from scipy.stats import ttest_ind

## two-sample t-test on total_bedrooms: original values (NaNs omitted) vs imputed values
## NOTE(review): the two samples share almost all of their observations, so the
## independence assumption of ttest_ind is questionable — treat the p-value as indicative only.
bedrooms_before = cf_data['total_bedrooms']
bedrooms_after = cf_data_clean['total_bedrooms']
ttest_ind(bedrooms_before, bedrooms_after, nan_policy='omit')

* From the p value we fail to reject the null hypothesis.

# EDA

In [None]:
## numeric columns whose distributions are plotted
plots_var = ['housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']

## 2x4 grid of histograms with a KDE overlay, one panel per variable.
## sns.distplot is deprecated; histplot(kde=True, stat='density') is its documented replacement.
fig, axes = plt.subplots(2, 4, figsize=(30, 15))

for ax, column in zip(axes.flat, plots_var):
    sns.histplot(cf_data_clean[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=15)

## seven variables on an eight-slot grid: hide the panel that is not used
for unused_ax in axes.flat[len(plots_var):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
## distribution of median house value within each ocean proximity category

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=cf_data_copy, x='ocean_proximity', y='median_house_value', ax=ax)
plt.show()

* Those houses which are in island areas are all costly compared to other houses
* Some houses are very costly which are inland areas also.

In [None]:
## visualize median house value with respect to ocean proximity
## NOTE(review): this plots median_house_value, not households, so it repeats the
## previous figure — the households plot is in the next cell; consider removing this one.

plt.figure(figsize=(12,6))
sns.boxplot(x='ocean_proximity',y='median_house_value',data=cf_data_copy)
plt.show()

In [None]:
## visualize households with respect to ocean proximity

plt.figure(figsize=(12,6))
sns.boxplot(x='ocean_proximity',y='households',data=cf_data_copy)
plt.show()

* The largest household counts are found in the <1H OCEAN areas, followed by INLAND, then NEAR OCEAN, then NEAR BAY, and finally ISLAND.

In [None]:
## distribution of population within each ocean proximity category

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=cf_data_copy, x='ocean_proximity', y='population', ax=ax)
plt.show()

* The largest populations are found in the <1H OCEAN areas, followed by INLAND, then NEAR OCEAN, then NEAR BAY, and finally ISLAND.

In [None]:
## distribution of housing median age within each ocean proximity category

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=cf_data_copy, x='ocean_proximity', y='housing_median_age', ax=ax)
plt.show()

* Near bay and island houses are older than other area houses.

* create one boundary box to define our range of geo spatial data

In [None]:
## bounding box of the data as (min longitude, max longitude, min latitude, max latitude)
BBox = (
    cf_data_clean['longitude'].min(),
    cf_data_clean['longitude'].max(),
    cf_data_clean['latitude'].min(),
    cf_data_clean['latitude'].max(),
)
        
        

* Save the image as mymap.

In [None]:
## read the exported map image into an array; it is used as the scatter-plot background
mymap = plt.imread('../input/california-map/map.png') 

* The map was obtained from the openstreetmap.org website: enter the bounding box values first, then export the desired map as an image.


In [None]:
## scatter the districts by longitude/latitude, coloured by ocean proximity,
## on top of the map image stretched over the bounding box
fig, ax = plt.subplots(figsize=(30, 15))
sns.scatterplot(
    data=cf_data_copy,
    x='longitude',
    y='latitude',
    hue='ocean_proximity',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Plotting Spatial Data on California Map')

## clamp both axes to the bounding box so the points line up with the image extent
lon_min, lon_max, lat_min, lat_max = BBox
ax.set_xlim(lon_min, lon_max)
ax.set_ylim(lat_min, lat_max)

## zorder=0 keeps the map underneath the scatter points
ax.imshow(mymap, zorder=0, extent=BBox, aspect='equal')

plt.show()

* Clusters are clearly visible on the above data.

In [None]:
## pairwise scatter plots (with per-variable distributions on the diagonal)
## for the columns of the raw copy

sns.pairplot(cf_data_copy)
plt.show()

* Total bedrooms and households, as well as total rooms and total bedrooms, have a linear relationship.

# Data Preprocessing

In [None]:
## split the imputed data 70% train / 30% test before fitting any transformer,
## so that the transformers below are fitted on the training rows only
## (random_state is fixed so the split is reproducible)

import sklearn
from sklearn.model_selection import train_test_split

train,test = train_test_split(cf_data_clean,train_size=0.7,random_state=42)

In [None]:
## use power transform to reduce skew and the influence of outliers

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

## Fit on the training rows only, then apply the fitted transform to the test rows.
## The transformer returns bare ndarrays, so the column labels are re-attached here
## rather than being restored positionally in a later cell.
## NOTE(review): every column is transformed, including the target (median_house_value)
## and the dummy columns — confirm this is intended.
train = pd.DataFrame(pt.fit_transform(train), columns=train.columns)
test = pd.DataFrame(pt.transform(test), columns=test.columns)

 

In [None]:
## use scaler transform for linear regression
## NOTE(review): PowerTransformer standardizes its output by default (standardize=True),
## so this step is probably redundant after the previous cell — confirm before removing.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

## Fit on train only and reuse the fitted scaler on test. Whatever column labels the
## frames carry are kept instead of being reset to integer positions.
train = pd.DataFrame(sc.fit_transform(train), columns=train.columns)
test = pd.DataFrame(sc.transform(test), columns=test.columns)

In [None]:
## give train and test the column labels of cf_data_clean (positional match)

train.columns = test.columns = cf_data_clean.columns.tolist()

In [None]:
## separate the target from the features for the training set
## Non-destructive on purpose: train.pop() removed the column from `train`, so running
## the cell a second time raised KeyError. Selecting and dropping leaves `train` intact.
y_train = train['median_house_value']
X_train = train.drop(columns='median_house_value')

In [None]:
## separate the target from the features for the test set
## Non-destructive on purpose: test.pop() removed the column from `test`, so running
## the cell a second time raised KeyError. Selecting and dropping leaves `test` intact.
y_test = test['median_house_value']
X_test = test.drop(columns='median_house_value')

# Model Build (Linear Regression)

In [None]:
## baseline model: ordinary least squares on all features

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)


In [None]:
## R^2 of the baseline model on the held-out test set

from sklearn.metrics import r2_score

score1 = r2_score(y_true=y_test, y_pred=lr.predict(X_test))


In [None]:
## test-set R^2 of the first (scikit-learn) linear model

score1

# Model Build (Linear Regression using statsmodels)

In [None]:
## refit the same regression with statsmodels to get a full summary;
## add_constant supplies the intercept column, which OLS does not add itself

import statsmodels.api as sm

lr2 = sm.OLS(endog=y_train, exog=sm.add_constant(X_train)).fit()

In [None]:
## regression summary (coefficients, p-values, fit statistics) for the full model
print(lr2.summary())

In [None]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

## variance inflation factor of every feature in X_train, largest first
vif = (
    pd.DataFrame({
        'features': X_train.columns,
        'vif': [
            variance_inflation_factor(X_train.values, position)
            for position in range(X_train.shape[1])
        ],
    })
    .round({'vif': 2})
    .sort_values(by='vif', ascending=False)
)
vif

In [None]:
## start the reduced feature set: X_train without total_bedrooms

X_train_sm = X_train.drop(columns='total_bedrooms')

In [None]:
## refit OLS (with intercept) on the reduced feature set and show the summary
lr3 = sm.OLS(endog=y_train, exog=sm.add_constant(X_train_sm)).fit()
print(lr3.summary())

In [None]:
## variance inflation factor of every remaining feature in X_train_sm, largest first
vif = (
    pd.DataFrame({
        'features': X_train_sm.columns,
        'vif': [
            variance_inflation_factor(X_train_sm.values, position)
            for position in range(X_train_sm.shape[1])
        ],
    })
    .round({'vif': 2})
    .sort_values(by='vif', ascending=False)
)
vif

In [None]:
## hence remove latitude
## Rebuilt from X_train with the full list of columns removed so far, instead of an
## in-place drop: the cell can now be re-run without raising KeyError.

X_train_sm = X_train.drop(columns=['total_bedrooms', 'latitude'])

In [None]:
## refit OLS (with intercept) on the reduced feature set and show the summary
lr4 = sm.OLS(endog=y_train, exog=sm.add_constant(X_train_sm)).fit()
print(lr4.summary())

In [None]:
## variance inflation factor of every remaining feature in X_train_sm, largest first
vif = (
    pd.DataFrame({
        'features': X_train_sm.columns,
        'vif': [
            variance_inflation_factor(X_train_sm.values, position)
            for position in range(X_train_sm.shape[1])
        ],
    })
    .round({'vif': 2})
    .sort_values(by='vif', ascending=False)
)
vif

In [None]:
## remove NEAR OCEAN
## Rebuilt from X_train with the full list of columns removed so far, instead of an
## in-place drop: the cell can now be re-run without raising KeyError.

X_train_sm = X_train.drop(columns=['total_bedrooms', 'latitude', 'NEAR OCEAN'])

In [None]:
## refit OLS (with intercept) on the reduced feature set and show the summary
lr5 = sm.OLS(endog=y_train, exog=sm.add_constant(X_train_sm)).fit()
print(lr5.summary())

In [None]:
## variance inflation factor of every remaining feature in X_train_sm, largest first
vif = (
    pd.DataFrame({
        'features': X_train_sm.columns,
        'vif': [
            variance_inflation_factor(X_train_sm.values, position)
            for position in range(X_train_sm.shape[1])
        ],
    })
    .round({'vif': 2})
    .sort_values(by='vif', ascending=False)
)
vif

In [None]:
## remove households
## Rebuilt from X_train with the full list of columns removed so far, instead of an
## in-place drop: the cell can now be re-run without raising KeyError.
X_train_sm = X_train.drop(columns=['total_bedrooms', 'latitude', 'NEAR OCEAN', 'households'])

In [None]:
## refit OLS (with intercept) on the final reduced feature set and show the summary
lr6 = sm.OLS(endog=y_train, exog=sm.add_constant(X_train_sm)).fit()
print(lr6.summary())

In [None]:
## variance inflation factor of every remaining feature in X_train_sm, largest first
vif = (
    pd.DataFrame({
        'features': X_train_sm.columns,
        'vif': [
            variance_inflation_factor(X_train_sm.values, position)
            for position in range(X_train_sm.shape[1])
        ],
    })
    .round({'vif': 2})
    .sort_values(by='vif', ascending=False)
)
vif

In [None]:
## r2 score of the reduced statsmodels model on the test set
## has_constant='add' forces the intercept column to be added. With the default
## ('skip'), add_constant silently adds nothing when any test column happens to be
## constant (e.g. a rare dummy with no positive rows in the test split), which would
## leave the design matrix one column short of what lr6 was fitted on.
X_test_sm = sm.add_constant(X_test[X_train_sm.columns.tolist()], has_constant='add')
score2 = r2_score(y_test, lr6.predict(X_test_sm))
score2

* This linear model is slightly better than before .

* I will add more models one by one to improve prediction. To be continued...

# Ridge Regression

In [None]:
## implement ridge regression: grid-search the regularization strength alpha

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

ridge = Ridge()

## Candidate alphas 0.00, 0.01, ..., 99.99 as a flat list of scalars.
## The previous reshape(10000, 1).tolist() produced one-element lists ([0.0], [0.01], ...);
## recent scikit-learn versions reject a list as Ridge's alpha during parameter validation.
## NOTE(review): 10,000 candidates times the CV folds is a large number of fits —
## consider a coarser or logarithmic grid if this cell is too slow.
params = {'alpha': np.arange(0, 100, 0.01).tolist()}

## scored by negative MAE; train scores are kept for the train-vs-validation plot below
ridge_grd = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
).fit(X_train, y_train)

In [None]:
## collect the grid-search cross-validation results into a DataFrame (one row per candidate)

cv_results_ridge = pd.DataFrame(ridge_grd.cv_results_)

In [None]:
## preview the results for the first ten candidates

cv_results_ridge.head(10)

In [None]:
# plotting mean test and train scores against alpha

# Recover one float alpha per candidate. np.ravel copes with both a scalar entry and a
# one-element list, so this works whichever way the grid was specified.
alphas = np.array([np.ravel(a)[0] for a in cv_results_ridge['param_alpha']], dtype=float)

# plotting — the x values are the actual alphas, not the row index of the results table
plt.figure(figsize=(20,10))
plt.plot(alphas, cv_results_ridge['mean_train_score'])
plt.plot(alphas, cv_results_ridge['mean_test_score'])
plt.grid()
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
## best mean cross-validated score found by the grid search
## (negative mean absolute error, per the scoring passed to GridSearchCV)

ridge_grd.best_score_

In [None]:
## the Ridge estimator configured with the best alpha found by the search
ridge_grd.best_estimator_

In [None]:
## take the best estimator from the search and fit it on the full training set
## NOTE(review): with GridSearchCV's default refit=True the best estimator is presumably
## already fitted on X_train, which would make this fit redundant — confirm.

final_ridge = ridge_grd.best_estimator_.fit(X_train, y_train)

In [None]:
## R^2 of the tuned ridge model on the held-out test set

score_ridge = r2_score(y_true=y_test, y_pred=final_ridge.predict(X_test))

In [None]:
## test-set R^2 of the tuned ridge model
score_ridge