In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### About Dataset

longitude: A measure of how far west a house is; values are negative (degrees west of the prime meridian), so a more negative value is farther west

latitude: A measure of how far north a house is; a higher value is farther north

housing_median_age: Median age of a house within a block; a lower number is a newer building

total_rooms: Total number of rooms within a block

total_bedrooms: Total number of bedrooms within a block

population: Total number of people residing within a block

households: Total number of households, a group of people residing within a home unit, for a block

median_income: Median income for households within a block of houses (measured in tens of thousands of US Dollars)

ocean_proximity: Location of the house w.r.t ocean/sea

median_house_value: Median house value for households within a block (measured in US Dollars)

## ***Loading Dataset***

In [None]:
# Read the housing CSV from the Kaggle input directory into `df`
# and preview its first rows.
HOUSING_CSV = '/kaggle/input/california-housing-prices/housing.csv'
df = pd.read_csv(HOUSING_CSV)
df.head()

## ***Importing Modules***

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## ***Pre-Processing***

In [None]:
# Print each column's dtype and non-null count to spot missing data.
df.info()
# Only total_bedrooms has null values.

In [None]:
# Impute missing total_bedrooms values with the column's most frequent value
# (the mode). The mode is used instead of the mean so that the imputed value
# is one that actually occurs in the data rather than a fractional count.
mode = df['total_bedrooms'].mode()[0]

# Assign the filled column back to the frame. Calling
# fillna(..., inplace=True) on the selected column is chained in-place
# assignment: pandas 2.x raises a FutureWarning for it, and with Copy-on-Write
# enabled it leaves `df` unchanged.
df['total_bedrooms'] = df['total_bedrooms'].fillna(mode)

# False means there are no missing values left in total_bedrooms.
bool(df['total_bedrooms'].isnull().any())

In [None]:
# Count how many rows fall into each ocean_proximity category.
df['ocean_proximity'].value_counts()

In [None]:
# Replace the ocean_proximity category labels with integer codes.
# LabelEncoder numbers the classes in sorted order, so the codes are arbitrary
# identifiers rather than a meaningful ranking of the categories.
label = LabelEncoder()
encoded_proximity = label.fit_transform(df['ocean_proximity'])
df['ocean_proximity'] = encoded_proximity
df.head()

## ***Understanding The Dataset***

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Distribution of housing_median_age: histogram on the left, box plot on the right.
figure, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

median_age = df['housing_median_age']
median_age.plot.hist(ax=ax1)
median_age.plot.box(ax=ax2)

figure.tight_layout()
plt.show()

The box plot shows no outliers.

It also shows that the middle 50% of blocks (the box in the box plot) have a median house age of roughly 20–40 years.

It also shows that comparatively few blocks have a low median house age, i.e. fewer houses have been built in these localities recently.

In [None]:
# Scatter every block by its coordinates (longitude on x, latitude on y).
df.plot.scatter(x='longitude', y='latitude')
plt.show()

The scatter of longitude against latitude traces the outline of California. The blue points mark blocks with housing (residential areas); the white gaps are areas with no housing blocks in the dataset (non-residential areas).

In [None]:
# Side-by-side scatter plots of median_house_value against each room-count column.
figure, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

for axis, room_column in zip((ax1, ax2), ('total_rooms', 'total_bedrooms')):
    df.plot.scatter(x='median_house_value', y=room_column, ax=axis)

figure.tight_layout()
plt.show()

The above graphs show that neither total_rooms nor total_bedrooms alone explains the median house value, so we cannot rely on just one of them to predict prices.

## ***Correlation***

In [None]:
# Pairwise correlations between all columns of df.
corr_matrix = df.corr()

# Configure the tick style BEFORE the figure is created: style settings only
# affect figures made afterwards. Both keys go in a single dict because
# set_style's second positional argument is `rc`, and seaborn only keeps rc
# keys that are already present in the style dict.
sns.set_style({'xtick.bottom': True, 'ytick.left': True})

f, ax = plt.subplots(figsize=(11, 15))

# Boolean mask hiding the upper triangle (diagonal included), so each pair is
# shown once. Built from ones rather than from the correlation values so that
# a correlation of exactly 0 is still masked.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

heatmap = sns.heatmap(corr_matrix,
                      mask = upper_triangle,
                      square = True,
                      linewidths = .5,
                      cmap ='coolwarm',
                      cbar_kws = {'shrink': .4,'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1,
                      vmax = 1,
                      annot = True,
                      annot_kws = {"size": 12},
                      ax = ax)  # draw on the axes created above, not the implicit current axes

# add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns)

plt.show()

It shows how strongly each pair of features is linearly related (correlated) with one another.

# ***Model***

#### Model 1:Linear Regression

In [None]:
# Predictors and target for Model 1, followed by a 90/10 train/test split
# with a fixed seed.
feature_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
input_values = df[feature_columns]
output_values = df[['median_house_value']]

split = train_test_split(input_values, output_values, test_size=0.1, random_state=32)
train_input, test_input, train_output, test_output = split

In [None]:
# Fit ordinary least squares on the training split (fit returns the estimator
# itself), then report R^2 on the held-out split.
model = LinearRegression().fit(train_input, train_output)
model.score(test_input, test_output)

The model's score (R², the coefficient of determination returned by `score`) is only 63.89%, so we will try the next model to improve it.

#### Model 2:Support Vector Regressor

In [None]:
# Predictors and target for Model 2 (same columns as Model 1).
feature_columns = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity']
input_values = df[feature_columns]
output_values = df[['median_house_value']]

# Split BEFORE scaling. Fitting the scalers on the full dataset would let the
# test rows influence the mean/std used to transform the training data
# (data leakage) and make the held-out score optimistic.
train_input,test_input,train_output,test_output=train_test_split(input_values,output_values,test_size=0.1,random_state=32)

# Features and target get separate scalers: refitting one StandardScaler on
# the target would overwrite the statistics it learned from the features.
# Each scaler is fitted on the training split only and then applied to both splits.
sc_input=StandardScaler()
train_input=sc_input.fit_transform(train_input)
test_input=sc_input.transform(test_input)

# `sc` keeps the role it ended up with originally: the scaler fitted on the
# target, usable to map predictions back to US dollars via inverse_transform.
sc=StandardScaler()
train_output=sc.fit_transform(train_output)
test_output=sc.transform(test_output)

In [None]:
# RBF-kernel support vector regressor. `degree` is omitted because it is only
# used by the polynomial kernel and is ignored for 'rbf'.
# verbose=True keeps libsvm's training log enabled, as before.
model_svr=SVR(kernel='rbf',C=10,verbose=True)

# SVR expects a 1-D target; flattening the single-column target avoids the
# DataConversionWarning raised for column vectors.
model_svr.fit(train_input,np.ravel(train_output))

# R^2 on the held-out split.
model_svr.score(test_input,test_output)

The R² score for this model is 78.52%, which is better than the previous model.

#### Model 3:Gradient Boosting Regressor

In [None]:
# Predictors and target for Model 3 (same columns as the previous models).
feature_columns = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity']
input_values = df[feature_columns]
output_values = df[['median_house_value']]

# Split BEFORE scaling. Fitting the scalers on the full dataset would let the
# test rows influence the mean/std used to transform the training data
# (data leakage) and make the held-out score optimistic.
train_input,test_input,train_output,test_output=train_test_split(input_values,output_values,test_size=0.1,random_state=32)

# Features and target get separate scalers: refitting one StandardScaler on
# the target would overwrite the statistics it learned from the features.
# Each scaler is fitted on the training split only and then applied to both splits.
sc_input=StandardScaler()
train_input=sc_input.fit_transform(train_input)
test_input=sc_input.transform(test_input)

# `sc` keeps the role it ended up with originally: the scaler fitted on the
# target, usable to map predictions back to US dollars via inverse_transform.
sc=StandardScaler()
train_output=sc.fit_transform(train_output)
test_output=sc.transform(test_output)

In [None]:
# Gradient-boosted trees with a fixed seed so the fit is reproducible.
model_grad=GradientBoostingRegressor(max_depth= 8, max_features=6, min_samples_split=200, n_estimators=100,random_state=42)

# The estimator expects a 1-D target; flattening the single-column target
# avoids the DataConversionWarning raised for column vectors.
model_grad.fit(train_input,np.ravel(train_output))

# R^2 on the held-out split.
model_grad.score(test_input,test_output)

The R² score of this model with the given hyperparameters is 83.45%, which is good.