In [None]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the California housing prices CSV from the Kaggle input directory

housing_csv_path = "/kaggle/input/california-housing-prices/housing.csv"
housing_data = pd.read_csv(housing_csv_path)

In [None]:
# Preview the first five rows of the raw data

housing_data.head(5)

In [None]:
# Look at six randomly chosen rows to see the different values the categorical feature takes

housing_data.sample(n=6)

In [None]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly)

housing_data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Number of rows and columns in the data

n_rows, n_cols = housing_data.shape
(n_rows, n_cols)

In [None]:
# Remove every row that has at least one missing value, then re-check the shape

housing_data = housing_data.dropna(axis=0, how='any')
housing_data.shape

Approximately 200 records containing missing values were dropped.

In [None]:
# Recompute the summary statistics now that rows with missing values are gone

summary_after_dropna = housing_data.describe()
summary_after_dropna

It looks like dropping the records with missing values does not noticeably change the `describe()` output for the dataset.

In [None]:
# List the distinct values of the only non-numerical column

pd.unique(housing_data['ocean_proximity'])

In [None]:
# Total rooms vs. median house value, drawn through the Axes object

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(housing_data['total_rooms'], housing_data['median_house_value'])
ax.set_xlabel('Total Rooms')
ax.set_ylabel('Median House Value')

In [None]:
# Median income vs. median house value, drawn through the Axes object

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(housing_data['median_income'], housing_data['median_house_value'])
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')

Median income and median house value tend to rise together: as income increases, the median price of the house also increases.
The two are positively correlated.
It is also important to note that there is a cap on the median house value at 500000.

In [None]:
# Check the correlation between every pair of numeric features

# `ocean_proximity` still holds strings at this point, and pandas >= 2.0 raises a
# ValueError when DataFrame.corr() meets a non-numeric column. Restricting to the
# numeric columns first works on both old and new pandas versions.
housing_data_corr = housing_data.select_dtypes(include='number').corr()
housing_data_corr

If two variables move in the same direction, they are said to be positively correlated; if they move in opposite directions,
they are said to be negatively correlated.

In [None]:
# Show the correlation matrix as an annotated heatmap on an explicit Axes

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(housing_data_corr, annot=True, ax=ax)

In [None]:
# The scatter plot showed median house values piling up at a 500000 ceiling.
# Count the non-null entries per column among the rows at (or above) that ceiling.

at_cap = housing_data['median_house_value'] >= 500000
housing_data.loc[at_cap].count()

In [None]:
# Remove the records whose median house value sits at (or above) the 500000 cap

capped_rows = housing_data['median_house_value'] >= 500000
housing_data = housing_data.drop(index=housing_data[capped_rows].index)

In [None]:
# Rows and columns remaining after removing the capped records

remaining_shape = housing_data.shape
remaining_shape

In [None]:
# Preview the first five rows of the filtered data
housing_data.head(5)

In [None]:
# One-hot encode the categorical column so every feature is numeric

categorical_cols = ['ocean_proximity']
housing_data = pd.get_dummies(housing_data, columns=categorical_cols)

In [None]:
# Shape of the current frame (rows, columns)
current_shape = housing_data.shape
current_shape

In [None]:
# Five randomly chosen rows of the current frame
housing_data.sample(n=5)

In [None]:
# Split the frame into the feature matrix X and the target vector Y
target_col = 'median_house_value'
X = housing_data.drop(columns=[target_col])
Y = housing_data[target_col]
X.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split, and
# therefore every score computed from it, is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current releases. Rescaling the features does
# not change the predictions of an unregularised least-squares fit, so it is dropped.
linear_model = LinearRegression().fit(x_train, y_train)



In [None]:
# R^2 of the fitted model on the training data
train_r2 = linear_model.score(x_train, y_train)
print("Training Score : ", train_r2)

In [None]:
# Column labels of the training features; displayed as the cell output
predictors = x_train.columns
predictors

In [None]:
# Pair each fitted coefficient with its feature name and list them from smallest to largest
coef = pd.Series(data=linear_model.coef_, index=predictors)
coef = coef.sort_values(ascending=True)
print(coef)

A negative coefficient means the feature pushes the predicted house price down as it increases, while a positive coefficient means
the feature pushes the predicted house price up.

In [None]:
# With the linear model fitted, predict the target for the held-out features

y_pred = linear_model.predict(X=x_test)

In [None]:
# Put predictions next to the true values (sharing y_test's index) to eyeball how the model did

predicted_series = pd.Series(y_pred, index=y_test.index)
df_pred_actual = pd.DataFrame({'predicted': predicted_series, 'actual': y_test})
df_pred_actual.head(10)

In [None]:
# R^2 of the predictions against the true test targets
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("Testing score : ", test_r2)

In [None]:
# Scatter plot between actual and predicted values

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y_test, y_pred)

# Reference line y = x: a point on this line is a perfect prediction, so the
# vertical distance from it is that point's prediction error.
low = min(y_test.min(), y_pred.min())
high = max(y_test.max(), y_pred.max())
ax.plot([low, high], [low, high], color='red', linestyle='--', label='Perfect prediction')

# Label the figure so it can be read on its own
ax.set_xlabel('Actual Median House Value')
ax.set_ylabel('Predicted Median House Value')
ax.set_title('Actual vs Predicted Median House Value')
ax.legend()
plt.show()

In [None]:
# Take a reproducible random sample of up to 100 prediction/actual pairs.
# Capping at the frame length avoids the ValueError that sample() raises when asked
# for more rows than exist; the fixed seed makes every run pick the same rows.
sample_size = min(100, len(df_pred_actual))
df_pred_actual_sample = df_pred_actual.sample(sample_size, random_state=42)
# reset_index() keeps the original row labels in an 'index' column and renumbers the rows 0..n-1
df_pred_actual_sample = df_pred_actual_sample.reset_index()
df_pred_actual_sample.head()

In [None]:
# Overlay the sampled predicted and actual values as two lines on the same Axes
fig, ax = plt.subplots(figsize=(12, 8))
for column, series_label in (('predicted', 'Predicted'), ('actual', 'Actual')):
    ax.plot(df_pred_actual_sample[column], label=series_label)
ax.set_ylabel('Median House Value')
ax.legend()
plt.show()
