In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression , Lasso, Ridge

In [2]:
df = pd.read_csv("Melbourne_housing_FULL.csv")
df.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0


In [3]:
df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [4]:
df.shape

(34857, 21)

In [5]:
col_to_use = ["Suburb", "Rooms", "Type", "Method", "SellerG", "Regionname", "Propertycount", "Distance", "CouncilArea", "Bedroom2", "Bathroom", "Car", "Landsize", "BuildingArea", "Price" ]
df = df[col_to_use]
df.head(3)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0


In [6]:
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [7]:
cols_to_fill_zero = ["Propertycount", "Distance", "Bedroom2", "Bathroom", "Car"]
df[cols_to_fill_zero] = df[cols_to_fill_zero].fillna(value = 0)
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [8]:
df["Landsize"] = df["Landsize"].fillna(df["Landsize"].mean())
df["BuildingArea"] = df["Landsize"].fillna(df["Landsize"].mean())

In [9]:
df.dropna(inplace = True)
df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [10]:
df = pd.get_dummies(df, drop_first = True)
df.head(3)

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,202.0,1480000.0,False,...,False,False,False,False,False,False,False,False,True,False
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,156.0,1035000.0,False,...,False,False,False,False,False,False,False,False,True,False
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,134.0,1465000.0,False,...,False,False,False,False,False,False,False,False,True,False


In [11]:
X = df.drop("Price", axis = 1)
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)


In [12]:
model = LinearRegression()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
score

0.674755608118354

In [13]:
model.score(X_train, y_train)

0.6789573300878368

In [14]:
lasso_reg = Lasso(alpha = 50, max_iter = 1000, tol = 0.1)
lasso_reg.fit(X_train, y_train)

In [15]:
score = lasso_reg.score(X_test, y_test)
score

0.677819767395418

In [16]:
lasso_reg.score(X_train, y_train)

0.6747938769920818

In [17]:
ridge_reg = Ridge(alpha = 50, max_iter = 1000, tol = 0.1)
ridge_reg.fit(X_train, y_train)

In [18]:
score = ridge_reg.score(X_test, y_test)
score

0.6712051031791715

In [19]:
ridge_reg.score(X_train, y_train)

0.6631988324137128

In [20]:
from sklearn.model_selection import GridSearchCV
model_params = {
    "LinearRegression":{
        "model":LinearRegression(),
        "params":{}
    },
     "RidgeRegression":{
        "model":Ridge(max_iter = 10000),
        "params":{
            "tol":[0.1, 0.25, 0.5, 0.75, 1],
            "alpha":[10, 25, 50, 75, 100]
        }
    },
     "LassoRegression":{
        "model":Lasso(max_iter = 10000),
        "params":{
            "tol":[0.1, 0.25, 0.5, 0.75, 1],
            "alpha":[10, 25, 50, 75, 100]
        }
    }

    
}
score = []
for model_name, mp in model_params.items():
    grid_search = GridSearchCV(mp["model"], mp["params"], cv = 5, return_train_score=False)
    grid_search.fit(X, y)
    score.append({"model_name":[model_name], "best_score":[grid_search.best_score_], "best_params":[grid_search.best_params_]})
df = pd.DataFrame(score, columns = ["model_name", "best_score", "best_params"])

In [21]:
df

Unnamed: 0,model_name,best_score,best_params
0,[LinearRegression],[-6710510830.659923],[{}]
1,[RidgeRegression],[0.6532929539322745],"[{'alpha': 10, 'tol': 0.1}]"
2,[LassoRegression],[0.6526400593398922],"[{'alpha': 25, 'tol': 0.1}]"
