### Import useful libraries and load given data

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import math 
import seaborn as sns

# Load Dataset using pandas
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


### Handle missing values

In [28]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [29]:
# Lets calculate mean and median for missing values, after which we will decide what average to take in dealing with missing values
mean= df['total_bedrooms'].mean()
median=df['total_bedrooms'].median()
print(mean)
print(median)

537.8705525375618
435.0


In [30]:
# Handling missing values of the feature 'total_bedrooms' with average as median
df['total_bedrooms']=df['total_bedrooms'].fillna(df['total_bedrooms'].median())


In [31]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

### Encode categorical data

In [32]:
# Here we have only one feature having categorical data
df=pd.get_dummies(df,prefix=['ocean_proximity'])

In [33]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,0,1,0


### Split the dataset

In [34]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

In [35]:
#First divide dependent & independent variable fro dataframe we have 'df'
X= df.drop(['median_house_value'],axis=1)
y=df['median_house_value']


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Standardize data

In [37]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)


### Perform Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### Predict output for test dataset using the fitted model

In [13]:
y_pred= model.predict(X_test)
print(y_pred)

[213812.04682494 287716.04682494 180964.04682494 ...  91492.04682494
 255940.04682494 213604.04682494]


In [14]:
from sklearn.metrics import mean_squared_error 
print(mean_squared_error(y_test,y_pred))

4719931109.150307


### Perform Decision Tree Regression

In [15]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()

In [16]:
tree.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
y_pred_tree= tree.predict(X_test)

In [18]:
print(mean_squared_error(y_test,y_pred_tree))

4631066223.782461


### Perform Random Forest Reggression


In [38]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
forest.fit(X_train,y_train)
y_pred_forest= forest.predict(X_test)

In [20]:
print(mean_squared_error(y_test,y_pred_forest))

2609830569.089552


### Check accuracy of Models

In [39]:
#For Linear Reggression
sk.metrics.r2_score(y_test,y_pred)

0.6380295744299517

In [40]:
# For Decision Tree Reggression
sk.metrics.r2_score(y_test,y_pred_tree)

0.6448446019443306

In [26]:
# For Random Forest Reggression
sk.metrics.r2_score(y_test,y_pred_forest)

0.7998526970176202