In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('darkgrid')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
%matplotlib inline 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing Dataset

In [None]:
# Read the California housing CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/california-housing-prices/housing.csv")

### Glance of the DataSet

In [None]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

### Inference:
We can observe that the dataset consists of 20,640 rows (one per housing block) across 10 different attributes

### Cleaning and Filtering the data

In [None]:
# Remove exact duplicate rows, then confirm none remain (the check should display False).
df = df.drop_duplicates()
df.duplicated().to_numpy().any()

In [None]:
# Forward-fill missing values: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated in recent pandas.
df = df.ffill()
# Per-column check for remaining nulls (a NaN in the very first row would survive a forward fill).
df.isnull().any()

### Feature Engineering

In [None]:
# Derive per-household ratios from the block-level totals and append them as new columns.
df = df.assign(
    avgRooms=df['total_rooms'] / df['households'],
    avgBedrooms=df['total_bedrooms'] / df['households'],
    pop_per_household=df['population'] / df['households'],
)

I have added three more features, which can help describe the distribution of the data and let the model predict better values:
<li>Average Rooms per House</li>
<li>Average Bedrooms per House</li>
<li>Number of people per household</li>

In [None]:
# Preview the first rows, now including the three engineered ratio columns.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# (rows, columns) of the frame after cleaning and feature engineering.
df.shape

The Dataset is spread across 20640 rows and 13 columns which signify the various attributes

In [None]:
# Column labels, original attributes followed by the engineered ones.
df.columns

The different features of our dataset are:
<ol>
    <li>Longitude</li>
    <li>Latitude</li>
    <li>Median Age of the House</li>
    <li>Total Rooms in the block</li>
    <li>Total Bedrooms in the block</li>
    <li>Population of the block</li>
    <li>Number of Households</li>
    <li>Median Income</li>
    <li>Median House Value</li>
    <li>Ocean Proximity</li>
    <li>Number of Rooms per House</li>
    <li>Number of Bedrooms per House</li>
    <li>Number of people per Household</li>
</ol>

## Data Visualization

In [None]:

# Map-style scatter of every block, coloured by its population.
# Longitude is the x-axis and latitude the y-axis, so the points are laid out like a
# conventional map (north at the top) instead of a transposed one.
plt.figure(figsize=(18,10))
plt.scatter(df['longitude'], df['latitude'], c=df['population'], cmap='cool', alpha = 0.8)
plt.colorbar().set_label("Population")
plt.title('Population Magnitude')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

### Inference:
We find that the average population is about 1426 people per block.
<br>
The population is distributed fairly evenly across California.
<br>
However, the population is denser in the northern part of California.

## Average Distribution of Median Price of Housing in a Block

In [None]:
# Histogram of the median house value per block, with the mean marked by a dashed line.
# The style is selected before the figure is created; calling it after plt.show() would
# only affect later figures and leave this one in a different style.
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
plt.hist(df.median_house_value, bins = 45, ec = 'black', color = '#f88f01')
plt.xlabel('Median Price of Houses in a block in $', fontsize=16)
# Each row of the dataset is one block, so the bars count blocks.
plt.ylabel('Number of Blocks', fontsize=16)
plt.title('Average Distribution of Median Price of Housing in a Block', fontsize=16)
plt.axvline(df['median_house_value'].mean(), color='#21209c', linestyle='dashed', linewidth=3, label='Average Price of House in a block')
plt.legend()  # without this the axvline label is never drawn
plt.show()

In [None]:
# Density histogram with a KDE overlay for the median house value.
# sns.distplot is deprecated; histplot(stat='density', kde=True) draws the same kind of plot.
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
sns.histplot(df.median_house_value, bins = 45, color = '#f88f01', stat = 'density', kde = True)
plt.xlabel('Median Price of Houses in a block in $', fontsize=16)
# The y-axis is a normalised density, not a count.
plt.ylabel('Density', fontsize=16)
plt.title('Average Distribution of Median Price of Housing in a Block', fontsize=16)
plt.show()

### Inference:
We find that the mean house price is around \$206,855.81.
<br>
However, the median value is \$179,700.
<br>
We also observe a spike around \$500,000, since the number of blocks in that range is quite high. This suggests that California has a large number of luxury, expensive homes (it may also indicate that prices in the data are capped at this value).

In [None]:
# Histogram of households per block, with the mean marked by a dashed line.
# The style is selected before the figure is created so that it applies to this plot.
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
plt.hist(df.households , bins=100, ec = 'black', color = '#ff577f')
plt.xlabel('Total Number of Households in a block', fontsize=16)
# Each row of the dataset is one block, so the bars count blocks.
plt.ylabel('Number of Blocks', fontsize=16)
plt.title('Average Distribution of Total Number of Households in a block', fontsize=16)
plt.axvline(df['households'].mean(), color='#21209c', linestyle='dashed', linewidth=3, label='Average Number of Households in a block')
plt.xlim(0,2500)  # zoom in; values above 2500 fall outside the view
plt.legend()  # without this the axvline label is never drawn
plt.show()

In [None]:
# Kernel density estimate of households per block.
# sns.distplot(hist=False) is deprecated; kdeplot draws the same curve (bins were unused).
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
sns.kdeplot(df.households, color = '#ff577f')
plt.xlabel('Total Number of Households in a block', fontsize=16)
# The y-axis is a normalised density, not a count.
plt.ylabel('Density', fontsize=16)
plt.title('Average Distribution of Total Number of Households in a block', fontsize=16)
plt.xlim(0,2500)

plt.show()

In [None]:
# Print the mean, then the median, of households per block.
households_col = df['households']
print(households_col.mean())
print(households_col.median())

### Inference:
 We find that the average number of households per block is around 500, whereas the median lies at 409 households per block.

In [None]:
# Histogram of average rooms per household, with the mean marked by a dashed line.
# The style is selected before the figure is created so that it applies to this plot.
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
plt.hist(df.avgRooms, bins=100, ec = 'black', color = '#ffcda3')
plt.xlabel('Average Number of Rooms in a House', fontsize=16)
# Each row of the dataset is one block, so the bars count blocks.
plt.ylabel('Number of Blocks', fontsize=16)
# Title fixed: it previously read "Number of Rooms in a room".
plt.title('Average Distribution of Number of Rooms in a House', fontsize=16)
plt.axvline(df['avgRooms'].mean(), color='#21209c', linestyle='dashed', linewidth=3, label='Average Number of Rooms in a House')
plt.xlim(0, 10)  # zoom in; values above 10 fall outside the view
plt.legend()  # without this the axvline label is never drawn
plt.show()

In [None]:
# Kernel density estimate of average rooms per household.
# sns.distplot(hist=False) is deprecated; kdeplot draws the same curve (bins were unused).
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
sns.kdeplot(df.avgRooms, color = '#ffcda3')
plt.xlabel('Average Number of Rooms in a House', fontsize=16)
# The y-axis is a normalised density, not a count.
plt.ylabel('Density', fontsize=16)
# Title fixed: it previously read "Number of Rooms in a room".
plt.title('Average Distribution of Number of Rooms in a House', fontsize=16)
plt.xlim(0, 10)
plt.show()

In [None]:
# Print the mean, then the median, of average rooms per household.
avg_rooms_col = df['avgRooms']
print(avg_rooms_col.mean())
print(avg_rooms_col.median())

### Inference:
 The Mean number of rooms per house is between 4 and 5.
 <br>
 The median is also at that range. However there are many houses with 6, 7 and 8 rooms as well.

In [None]:
# Histogram of total rooms per block, with the mean marked by a dashed line.
# The style is selected before the figure is created so that it applies to this plot.
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
plt.hist(df.total_rooms, bins=100, ec = 'black', color = '#00af91')
plt.xlabel('Total Number of Rooms in a block', fontsize=16)
# Each row of the dataset is one block, so the bars count blocks.
plt.ylabel('Number of Blocks', fontsize=16)
plt.axvline(df['total_rooms'].mean(), color='#21209c', linestyle='dashed', linewidth=3, label='Average Number of Rooms in a block')

plt.title('Average Distribution of Total Number of Rooms in a block', fontsize=16)
plt.xlim(0, 12500)  # zoom in; values above 12500 fall outside the view
plt.legend()  # without this the axvline label is never drawn
plt.show()

In [None]:
# Kernel density estimate of total rooms per block.
# sns.distplot(hist=False) is deprecated; kdeplot draws the same curve (bins were unused).
plt.style.use('dark_background')
plt.figure(figsize=(10, 6))
sns.kdeplot(df.total_rooms, color = '#00af91')
plt.xlabel('Total Number of Rooms in a block', fontsize=16)
# The y-axis is a normalised density, not a count.
plt.ylabel('Density', fontsize=16)
plt.title('Average Distribution of Total Number of Rooms in a block', fontsize=16)
plt.xlim(0, 12500)

plt.show()

In [None]:
# Print the mean, then the median, of total rooms per block.
total_rooms_col = df['total_rooms']
print(total_rooms_col.mean())
print(total_rooms_col.median())

### Inference:
 The Mean number of rooms per block is 2636
 <br>
 The median is at 2127. 

## Ocean Proximity

In [None]:
# Count how many rows fall into each ocean-proximity category and print the counts.
freq = df['ocean_proximity'].value_counts()
print(freq)

In [None]:
# Bar chart of the category counts, recomputed here so the cell runs on its own.
freq = df['ocean_proximity'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(freq.index, freq, edgecolor='#21209c', color='#008891')
ax.set_xlabel('Ocean Proximity', fontsize=16)
ax.set_ylabel('No. of Households', fontsize=16)
plt.show()

Hence, we find that the largest group of blocks (9136) is located less than one hour from the ocean.
However, there is also a large number of homes inland, away from the ocean. Later, we also find that the prices of homes near the ocean are comparatively higher.

### Feature Engineering: Dummy Variable
I have created dummy variables in order to encode the categorical ocean proximity attribute of the dataset.

In [None]:
# One-hot encode ocean_proximity: one indicator column per category.
dum = pd.get_dummies(df['ocean_proximity'])

In [None]:
# Append the indicator columns, then drop the original text column and the ISLAND
# indicator (presumably kept out as the baseline category; confirm with the author).
merged_df = pd.concat([df, dum], axis=1).drop(columns=['ocean_proximity', 'ISLAND'])
merged_df.head()

### Correlation
### $$ \rho _{XY} = corr(X,Y)$$
### $$ -1.0 \leq \rho _{XY} \leq +1.0$$

In [None]:
# Target is the median house value; every other column of merged_df is a predictor.
target_col = 'median_house_value'
y = merged_df[target_col]
X = merged_df.drop(columns=target_col)

In [None]:
# Scatter of median house value (x) against average rooms per household (y).
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(df['median_house_value'], df['avgRooms'], alpha=0.2)
ax.set_ylim(0, 10)  # clip the view to 0-10 rooms

ax.set_xlabel('Median Price of the House', fontsize=14)
ax.set_ylabel('Average Number of Rooms per household', fontsize=14)
ax.set_title('Median Price vs Average Number of Rooms', fontsize=14)
plt.style.use('dark_background')
plt.show()


In [None]:
# Joint plot (scatter plus marginal histograms) of median house value vs average rooms.
# jointplot is figure-level and builds its own figure, so no plt.figure() call is made:
# it would only leave an empty extra figure in the output.
# `height` replaces the deprecated `size` keyword.
grid = sns.jointplot(x=df['median_house_value'], y=df['avgRooms'], height = 7, color = '#a7c5eb', joint_kws={'alpha': 0.2})
sns.set_style('darkgrid')
grid.ax_joint.set_ylim(0, 10)  # limit the joint axes explicitly rather than via pyplot state
plt.style.use('dark_background')

plt.show()

In [None]:
# Correlation between median house value and average rooms per household.
df['median_house_value'].corr(df['avgRooms'])

In [None]:
# Scatter of median house value (x) against average bedrooms per household (y).
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(df['median_house_value'], df['avgBedrooms'], alpha=0.2)
ax.set_ylim(0, 5)  # clip the view to 0-5 bedrooms

plt.style.use('dark_background')
ax.set_xlabel('Median Price of the House', fontsize=14)
ax.set_ylabel('Average Number of Bedrooms per household', fontsize=14)
ax.set_title('Median Price vs Average Number of Bedrooms', fontsize=14)
plt.show()

In [None]:
# Joint plot (scatter plus marginal histograms) of median house value vs average bedrooms.
# jointplot is figure-level and builds its own figure, so no plt.figure() call is made:
# it would only leave an empty extra figure in the output.
# `height` replaces the deprecated `size` keyword.
grid = sns.jointplot(x=df['median_house_value'], y=df['avgBedrooms'], height = 7, color = '#a7c5eb', joint_kws={'alpha': 0.2})
sns.set_style('darkgrid')
grid.ax_joint.set_ylim(0, 5)  # limit the joint axes explicitly rather than via pyplot state

plt.style.use('dark_background')
grid.set_axis_labels('Median Price of the House', 'Average Number of Bedrooms per household', fontsize = 14)
# Put the title on the figure so it sits above the top marginal plot instead of overlapping it.
grid.ax_joint.figure.suptitle('Median Price vs Average Number of Bedrooms', fontsize = 14, y = 1.02)
plt.show()

In [None]:
# Correlation between median house value and average bedrooms per household.
df['median_house_value'].corr(df['avgBedrooms'])

In [None]:
# Scatter of median house value (x) against population per household (y).
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(df['median_house_value'], df['pop_per_household'], alpha=0.2)
ax.set_ylim(0, 8)  # clip the view to 0-8 people per household
plt.style.use('dark_background')

ax.set_xlabel('Median Price of the House', fontsize=14)
ax.set_ylabel('Population per Household', fontsize=14)
ax.set_title('Median Price vs Population per Household in 10,000s', fontsize=14)
plt.show()

In [None]:
# Joint plot (scatter plus marginal histograms) of median house value vs population per household.
# jointplot is figure-level and builds its own figure, so no plt.figure() call is made:
# it would only leave an empty extra figure in the output.
# `height` replaces the deprecated `size` keyword.
grid = sns.jointplot(x= df['median_house_value'], y=df['pop_per_household'], height = 7, color = '#a7c5eb', joint_kws={'alpha': 0.2})
grid.ax_joint.set_ylim(0, 8)  # limit the joint axes explicitly rather than via pyplot state
plt.style.use('dark_background')


plt.show()

In [None]:
# Correlation between median house value and population per household.
df['median_house_value'].corr(df['pop_per_household'])

In [None]:
# Scatter of median house value (x) against median income (y).
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(df['median_house_value'], df['median_income'], alpha=0.2)
ax.set_ylim(0, 10)  # clip the view to incomes between 0 and 10
plt.style.use('dark_background')
ax.set_xlabel('Median Price of the House', fontsize=14)
ax.set_ylabel('Median Income per household', fontsize=14)
ax.set_title('Median Price vs Median Income per household in 1000s', fontsize=14)
plt.show()

In [None]:
# Joint plot (scatter plus marginal histograms) of median house value vs median income.
# jointplot is figure-level and builds its own figure, so no plt.figure() call is made:
# it would only leave an empty extra figure in the output.
# `height` replaces the deprecated `size` keyword.
grid = sns.jointplot(x= df['median_house_value'], y=df['median_income'], height = 7, color = '#a7c5eb', joint_kws={'alpha': 0.2})
grid.ax_joint.set_ylim(0, 10)  # limit the joint axes explicitly rather than via pyplot state
plt.style.use('dark_background')

plt.show()

In [None]:
# Correlation between median house value and median income.
df['median_house_value'].corr(df['median_income'])

In [None]:
# Scatter of median house value (x) against median housing age (y).
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(df['median_house_value'], df['housing_median_age'], alpha=0.2)
plt.style.use('dark_background')

ax.set_xlabel('Median Price of the House', fontsize=14)
ax.set_ylabel('Median Age of household', fontsize=14)
ax.set_title('Median Price vs Median Age of household', fontsize=14)
plt.show()

In [None]:
# Joint plot (scatter plus marginal histograms) of median house value vs median housing age.
# jointplot is figure-level and builds its own figure, so no plt.figure() call is made:
# it would only leave an empty extra figure in the output.
# `height` replaces the deprecated `size` keyword.
grid = sns.jointplot(x= df['median_house_value'], y=df['housing_median_age'], height = 7, color = '#a7c5eb', joint_kws={'alpha': 0.2})

plt.style.use('dark_background')

plt.show()

In [None]:
# Correlation between median house value and median housing age.
df['median_house_value'].corr(df['housing_median_age'])

In [None]:
# Pairwise correlation matrix of the numeric columns.
# df still holds the text column ocean_proximity, which makes a plain df.corr() raise
# on pandas >= 2.0, so the numeric columns are selected explicitly first.
df.select_dtypes(include='number').corr()

In [None]:
# Build a mask that hides the upper triangle (diagonal included) of the correlation matrix.
# Only numeric columns are correlated: the text column ocean_proximity makes a plain
# df.corr() raise on pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
mask = np.zeros_like(numeric_corr)
triangle_indicies = np.triu_indices_from(mask)
mask[triangle_indicies] = True  # stored as 1.0; heatmap hides cells where the mask is truthy
mask

In [None]:
# Annotated heatmap of the correlation matrix with the upper triangle masked out.
# Only numeric columns are correlated: the text column ocean_proximity makes a plain
# df.corr() raise on pandas >= 2.0.
plt.figure(figsize=(16, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), mask=mask, annot=True, annot_kws={"size": 14})
plt.style.use('dark_background')
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()

## Training and Test Dataset Split

In [None]:
# Target: median house value. Features: every other column of merged_df.
prices = merged_df['median_house_value']
features = merged_df.drop('median_house_value', axis = 1)

# 80/20 split. A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

#len(X_train)/len(features)

In [None]:
# Fit an ordinary least-squares model on the training split (fit returns the estimator).
regr = LinearRegression().fit(X_train, y_train)

# Print the intercept and display one coefficient per feature.
print('Intercept', regr.intercept_)
pd.DataFrame({'Coef': regr.coef_}, index=X_train.columns)

In [None]:
# R^2 (coefficient of determination) of the fitted model on the training split.
regr.score(X=X_train, y=y_train)

In [None]:
# R^2 (coefficient of determination) of the fitted model on the held-out test split.
regr.score(X=X_test, y=y_test)

## We find that our model has an R² score of around 0.6472
### In other words, the model explains about 65% of the variance in house prices from the given attributes (R² is a goodness-of-fit measure, not a classification accuracy)

The fit can be improved further by transforming the target (median house value) to a logarithmic scale

## Data Transformation

In [None]:
# Skewness of the untransformed target.
df.median_house_value.skew()

In [None]:
# Natural log of the target; show the last few transformed values.
y_log = np.log(df.median_house_value)
y_log.tail(5)

In [None]:
# Density histogram with a KDE overlay for the log-transformed target; the title reports its skew.
# sns.distplot is deprecated; histplot(stat='density', kde=True) draws the same kind of plot.
plt.figure(figsize=(16, 10))
sns.histplot(y_log, stat='density', kde=True)
plt.title(f'Log Price with skew {y_log.skew()}')
plt.show()


Notice the change in the skew value after transforming the median house value to the log median house value

In [None]:
# Build the plotting frame as a NEW DataFrame. A plain `transformed_df = features` would
# alias the feature matrix, so adding the log target would also add it to `features`.
transformed_df = features.assign(log_median_value=y_log)

# Regression plot of log median house value (y) against median income (x).
# lmplot is figure-level and builds its own figure, so no plt.figure() call is made.
# `height` replaces the deprecated `size` keyword.
sns.lmplot(x = 'median_income', y ='log_median_value', data=transformed_df, height = 7, scatter_kws={'alpha':0.3}, line_kws={'color':'darkred'})
plt.ylim(10.5, 13)
plt.style.use('dark_background')
# Axis labels now match the plotted variables (they were swapped).
plt.xlabel('Median Income per household', fontsize = 14)
plt.ylabel('Log Median Price of the House', fontsize = 14)
plt.title('Median Price vs Median Income per household in 1000s', fontsize = 14)
plt.show()



In [None]:

# Regression plot of median house value (y) against median income (x).
# lmplot is figure-level and builds its own figure, so no plt.figure() call is made.
# `height` replaces the deprecated `size` keyword.
sns.lmplot(x = 'median_income', y ='median_house_value', data=df, height = 7, scatter_kws={'alpha':0.3}, line_kws={'color':'pink'})
plt.ylim(0, 500000)
plt.style.use('dark_background')
# Axis labels now match the plotted variables (they were swapped).
plt.xlabel('Median Income per household', fontsize = 14)
plt.ylabel('Median Price of the House', fontsize = 14)
plt.title('Median Price vs Median Income per household in 1000s', fontsize = 14)
plt.show()

## Regression using Log Prices

In [None]:
# Target: natural log of the median house value. Features: rebuilt from merged_df,
# i.e. every column except the raw target.
prices = np.log(df['median_house_value'])
features = merged_df.drop('median_house_value', axis = 1)

# 80/20 split. A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# Fit ordinary least squares on the log target.
regr = LinearRegression()
regr.fit(X_train, y_train)

# Print the intercept and display one coefficient per feature (both on the log-price scale).
print('Intercept', regr.intercept_)
pd.DataFrame(data = regr.coef_, index=X_train.columns, columns = ['Coef'])



In [None]:
# R^2 of the log-price model on both splits. The training score alone overstates how well
# the model generalises, so the held-out test score is shown next to it.
pd.Series(
    {'train': regr.score(X_train, y_train), 'test': regr.score(X_test, y_test)},
    name='R^2',
)

## We find that our model has an R² score of around 0.6672 on the training data
### In other words, with the log price scale the model explains about 67% of the variance in (log) house prices from the given attributes