# ***Predicting house prices in Melbourne***

In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

***Loading of data***

In [124]:
# Show every column when a DataFrame is rendered; the raw file is wide.
pd.set_option("display.max_columns", None)

# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing this line. Consider a path relative to a data dir.
csv_path = r"F:\Self Learning\Machine Learning\Codebasics\ML\16_regularization\Melbourne_housing_FULL.csv"
df = pd.read_csv(csv_path)

# Peek at two random rows to sanity-check the load.
df.sample(2)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
16284,Glen Waverley,4 Kalonga Ct,5,h,,PI,Biggin,24/06/2017,16.7,3150.0,5.0,5.0,2.0,705.0,127.0,1970.0,Monash City Council,-37.8932,145.16526,Eastern Metropolitan,15321.0
10985,Yarraville,7 Hughes St,3,h,840000.0,S,Village,28/08/2016,7.0,3013.0,4.0,1.0,1.0,470.0,127.0,1970.0,Maribyrnong City Council,-37.822,144.8904,Western Metropolitan,6543.0


***Columns to be used in our dataframe***

In [125]:
# Features kept for modelling, followed by the target ("Price") as the last
# column. The order here is the column order of the resulting frame.
columns_to_be_used = [
    'Suburb',
    'Rooms',
    'Type',
    'Method',
    'SellerG',
    'Regionname',
    'Propertycount',
    'Distance',
    'CouncilArea',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Landsize',
    'BuildingArea',
    'Price',
]
# Keep all rows, only the listed columns.
df = df.loc[:, columns_to_be_used]

In [126]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [127]:
df.shape

(34857, 15)

In [128]:
df.isnull().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

***Here, we replace the null values in specific numeric columns with 0***

In [129]:
# Numeric columns whose missing values are replaced with 0.
fill_null_values = ["Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea"]

# Fill per column through a {column: value} mapping and rebind `df`.
# The previous `df[col].fillna(0, inplace=True)` operated on a temporary Series:
# it emits a chained-assignment FutureWarning on recent pandas and does not
# modify `df` at all once Copy-on-Write is enabled, which would leave the nulls
# in place and let the later dropna() discard most of the rows.
df = df.fillna(dict.fromkeys(fill_null_values, 0))

***The output below shows that the Price column still contains null values. Price is the target, so we should not impute it. We are therefore going to drop every row that still contains a null value (this also removes the few rows missing Regionname, Propertycount, Distance or CouncilArea)***

In [130]:
df.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          3
Propertycount       3
Distance            1
CouncilArea         3
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

In [131]:
# Discard every row that still has at least one missing value
# (row-wise, "any" null - the dropna defaults, written out explicitly).
df = df.dropna(axis="index", how="any")

In [132]:
df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [133]:
df.shape

(27244, 15)

In [134]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,0.0,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,94.0,0.0,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,1.0,2.0,120.0,142.0,1600000.0


***In the code below, we convert the categorical columns into numerical values using a label encoder.***

In [135]:
from sklearn.preprocessing import LabelEncoder

# Boolean mask indexed by column name: True where the column dtype is object.
s = df.dtypes == "object"
object_columns = s[s].index.tolist()

# One encoder instance is reused; fit_transform re-fits it for each column.
le = LabelEncoder()

# Encode on a copy so the cleaned frame `df` keeps its original string values.
# NOTE(review): label encoding maps each category to an arbitrary integer, and
# the linear models below will treat those integers as ordered magnitudes.
# One-hot encoding may be a better fit for these nominal columns - confirm.
data_copy = df.copy()
for name in object_columns:
    as_text = data_copy[name].astype(str)
    data_copy[name] = le.fit_transform(as_text)

In [136]:
list(s[s].index)

['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea']

In [137]:
data_copy.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
1,0,2,0,1,32,2,4019.0,2.5,31,2.0,1.0,1.0,202.0,0.0,1480000.0
2,0,2,0,1,32,2,4019.0,2.5,31,2.0,1.0,0.0,156.0,79.0,1035000.0
4,0,3,0,3,32,2,4019.0,2.5,31,3.0,2.0,0.0,134.0,150.0,1465000.0
5,0,3,0,0,32,2,4019.0,2.5,31,3.0,2.0,1.0,94.0,0.0,850000.0
6,0,4,0,4,206,2,4019.0,2.5,31,3.0,1.0,2.0,120.0,142.0,1600000.0


***Here, we split our data set into two parts: the independent variables (features) and the dependent variable (the target we want to predict)***

In [138]:
# Features: every encoded column except the target.
X = data_copy.drop(columns="Price")
# Target: the sale price.
y = data_copy["Price"]

***Now, we are going to split our data into the training data and the testing data***

In [139]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
    test_size=0.3,
)

***We are going to train our model using the training data***

In [140]:
from sklearn.linear_model import LinearRegression

# Unregularized linear regression, used as the baseline for the
# Lasso / Ridge comparison further down.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [141]:
regression.score(X_train, y_train)

0.43387384266508233

In [142]:
regression.score(X_test, y_test)

0.42368867560856016

## ***In this case, our model is underfitted because both the training score and the testing score are low***

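***Here we can see that both the training and the testing scores are low, so we can say that our model is underfit. We will try Ridge and Lasso regression to see whether they help overcome the issue (note that regularization mainly addresses overfitting, so a large improvement is not guaranteed)***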

In [144]:
from sklearn.linear_model import Lasso

# L1-regularized linear regression.
# NOTE(review): the features are not scaled before fitting, so the penalty
# weighs columns unevenly; tol=0.1 and max_iter=100 are also much looser than
# the scikit-learn defaults - confirm these values are intentional.
lasso_params = {"alpha": 50, "max_iter": 100, "tol": 0.1}
lasso_rig = Lasso(**lasso_params)
lasso_rig.fit(X=X_train, y=y_train)

In [148]:
lasso_rig.score(X_train, y_train)

0.4338737231555141

In [146]:
lasso_rig.score(X_test, y_test)

0.4237081362810099

In [147]:
from sklearn.linear_model import Ridge

# L2-regularized linear regression with the same alpha as the Lasso run.
# NOTE(review): with the default solver on dense input, max_iter and tol may
# have no effect - check the Ridge `solver` documentation.
ridge_params = {"alpha": 50, "max_iter": 100, "tol": 0.1}
ridge_reg = Ridge(**ridge_params)
ridge_reg.fit(X=X_train, y=y_train)

In [149]:
ridge_reg.score(X_train, y_train)

0.43386300687283597

In [150]:
ridge_reg.score(X_test, y_test)

0.42388441641793373

# ***Here, we can see that there is virtually no change in the scores after applying Lasso and Ridge regression***

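Normally, Lasso (L1 penalty) is used to remove weakly related features by shrinking their coefficients to exactly zero, and Ridge (L2 penalty) is used to prevent overfitting by shrinking all coefficients towards zero. Because our baseline model is underfit rather than overfit, regularization has little to improve here, which is consistent with the unchanged scores.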