In [102]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

In [103]:
# Helper Functions
def load_csv(path):
    """Read the CSV at ``path`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(path)

def load_excel(path):
    """Read the Excel workbook at ``path`` with pandas and return the resulting DataFrame."""
    return pd.read_excel(path)

def split(X,y):
    """Split features and target into train and test partitions.

    20% of the rows are held out for testing and the shuffle is seeded with
    42, so repeated calls on the same input give the same partition.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(X, y, random_state=42, test_size=0.2)
    return tuple(partitions)

def scale(X_train, X_test):
    """Standardise both splits using statistics learned from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    inputs, so nothing from ``X_test`` influences the fitted statistics.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    standardizer = StandardScaler().fit(X_train)
    return standardizer.transform(X_train), standardizer.transform(X_test)

def one_hot_encode(df,column):
    """Replace ``column`` in ``df`` with one indicator column per category.

    Args:
        df: DataFrame containing ``column``.
        column: name of the categorical column to encode.

    Returns:
        A new DataFrame with ``column`` removed and the encoder's indicator
        columns appended on the right. ``df`` itself is not modified.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded_values = encoder.fit_transform(df[[column]])
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out([column]),
        # concat(axis=1) aligns on index labels. Without reusing df's index the
        # encoded frame gets a fresh 0..n-1 index, and any df that is filtered,
        # shuffled or otherwise re-indexed ends up with misaligned NaN rows.
        index=df.index,
    )
    return pd.concat([df.drop(columns=column), encoded_df], axis=1)

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    predictor = LogisticRegression()
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVC`` with the given kernel and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        kernel: kernel name forwarded to ``SVC``.

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    predictor = SVC(kernel=kernel)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVR`` with the given kernel and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.
        kernel: kernel name forwarded to ``SVR``.

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    predictor = SVR(kernel=kernel)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def decision_tree_classifier(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeClassifier`` and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        random_state: seed forwarded to the tree. It defaults to 42, the seed
            the other seeded models in this notebook use, so repeated runs
            build the same tree.

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    predictor = DecisionTreeClassifier(random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def decision_tree_regressor(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeRegressor`` and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.
        random_state: seed forwarded to the tree. It defaults to 42, the seed
            the other seeded models in this notebook use, so repeated runs
            build the same tree.

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    predictor = DecisionTreeRegressor(random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LinearRegression`` and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    predictor = LinearRegression()
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostClassifier`` and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        n_estimators: number of boosting rounds forwarded to the model.
        random_state: seed forwarded to the model (default 42, the value that
            was previously hard-coded).

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBClassifier`` and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        n_estimators: number of boosting rounds forwarded to the model.
        random_state: seed forwarded to the model.

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    predictor = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostClassifier`` and score it on the test split.

    Args:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for evaluation.
        iterations: number of boosting iterations forwarded to the model.
        random_state: seed forwarded to the model (default 42, the value that
            was previously hard-coded).

    Returns:
        tuple: ``(accuracy, cf, runtime)`` - test accuracy, the confusion
        matrix of test labels vs. predictions, and the seconds spent inside
        ``fit`` (prediction time is not included).
    """
    # verbose=0 is passed through unchanged from the original call.
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostRegressor`` and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.
        n_estimators: number of boosting rounds forwarded to the model.
        random_state: seed forwarded to the model (default 42, the value that
            was previously hard-coded).

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBRegressor`` and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.
        n_estimators: number of boosting rounds forwarded to the model.
        random_state: seed forwarded to the model (default 42, the value that
            was previously hard-coded); mirrors ``xgboost_classifier``.

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostRegressor`` and score it on the test split.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for evaluation.
        iterations: number of boosting iterations forwarded to the model.
        random_state: seed forwarded to the model (default 42, the value that
            was previously hard-coded).

    Returns:
        tuple: ``(r2, runtime)`` - the R-squared score on the test split and
        the seconds spent inside ``fit`` (prediction time is not included).
    """
    # verbose=0 is passed through unchanged from the original call.
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=random_state)
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    fit_started = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - fit_started
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

In [104]:
# Location of the Melbourne housing CSV. The original local path remains the
# default; set the MELB_DATA_PATH environment variable to run the notebook
# against a copy stored somewhere else.
path = os.environ.get(
    "MELB_DATA_PATH",
    r"G:\Study\iit kharagpur\ML Lab\ML_CLASS\datasets\melb_data.csv",
)

data = load_csv(path)
data.tail(10)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
13570,Wantirna South,34 Fewster Dr,3,h,970000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,674.0,,,,-37.8836,145.22805,Eastern Metropolitan,7082.0
13571,Wantirna South,15 Mara Cl,4,h,1330000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,717.0,191.0,1980.0,,-37.86887,145.22116,Eastern Metropolitan,7082.0
13572,Watsonia,76 Kenmare St,2,h,650000.0,PI,Morrison,26/08/2017,14.5,3087.0,...,1.0,1.0,210.0,79.0,2006.0,,-37.70657,145.07878,Northern Metropolitan,2329.0
13573,Werribee,5 Nuragi Ct,4,h,635000.0,S,hockingstuart,26/08/2017,14.7,3030.0,...,2.0,1.0,662.0,172.0,1980.0,,-37.89327,144.64789,Western Metropolitan,16166.0
13574,Westmeadows,9 Black St,3,h,582000.0,S,Red,26/08/2017,16.5,3049.0,...,2.0,2.0,256.0,,,,-37.67917,144.8939,Northern Metropolitan,2474.0
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0
13579,Yarraville,6 Agnes St,4,h,1285000.0,SP,Village,26/08/2017,6.3,3013.0,...,1.0,1.0,362.0,112.0,1920.0,,-37.81188,144.88449,Western Metropolitan,6543.0


### Data Pre-processing

In [105]:
# Size of the loaded frame as (rows, columns).
data.shape

(13580, 21)

In [106]:
# Per-column dtype and non-null count, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [107]:
# Row count per suburb; also shows how many distinct suburbs the column holds.
data["Suburb"].value_counts()

Suburb
Reservoir         359
Richmond          260
Bentleigh East    249
Preston           239
Brunswick         222
                 ... 
Sandhurst           1
Bullengarook        1
Croydon South       1
Montrose            1
Monbulk             1
Name: count, Length: 314, dtype: int64

In [108]:
# Row count per property type code.
data['Type'].value_counts()

Type
h    9449
u    3017
t    1114
Name: count, dtype: int64

In [109]:
# Number of missing values in each column.
data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [110]:
# Distinct values of Car (NaN shows up here when the column has missing entries).
data['Car'].unique()

array([ 1.,  0.,  2.,  6.,  5.,  4.,  3.,  8.,  7.,  9., 10., nan])

In [111]:
# Columns excluded from the feature set. BuildingArea, YearBuilt and CouncilArea
# are the ones with large numbers of missing values in the isna() summary.
dropped_columns = ['Address','Method','SellerG','Date','Propertycount','BuildingArea','YearBuilt','CouncilArea','Postcode','Lattitude','Longtitude']
# Rebind instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
data = data.drop(columns=dropped_columns, errors="ignore")
data.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,Regionname
0,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,Northern Metropolitan
1,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,Northern Metropolitan
2,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,Northern Metropolitan
3,Abbotsford,3,h,850000.0,2.5,3.0,2.0,1.0,94.0,Northern Metropolitan
4,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,Northern Metropolitan


In [112]:
# Expand the three categorical columns into indicator columns: Type through
# the one_hot_encode helper, Regionname and Suburb through pandas.get_dummies.
data = (
    data
    .pipe(one_hot_encode, column='Type')
    .pipe(pd.get_dummies, columns=['Regionname', "Suburb"])
)
data.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,Type_h,Type_t,Type_u,...,Suburb_Williams Landing,Suburb_Williamstown,Suburb_Williamstown North,Suburb_Windsor,Suburb_Wollert,Suburb_Wonga Park,Suburb_Wyndham Vale,Suburb_Yallambie,Suburb_Yarra Glen,Suburb_Yarraville
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [113]:
# Discard every row that still contains a missing value, then summarise what is left.
data = data.dropna(axis=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13518 entries, 0 to 13579
Columns: 332 entries, Rooms to Suburb_Yarraville
dtypes: bool(322), float64(9), int64(1)
memory usage: 5.3 MB


In [114]:
# Missing-value count per column; every count should be zero once rows with NaN are dropped.
data.isna().sum()

Rooms                  0
Price                  0
Distance               0
Bedroom2               0
Bathroom               0
                      ..
Suburb_Wonga Park      0
Suburb_Wyndham Vale    0
Suburb_Yallambie       0
Suburb_Yarra Glen      0
Suburb_Yarraville      0
Length: 332, dtype: int64

In [115]:
# Price is the regression target; every remaining column is a feature.
y = data["Price"]
X = data.drop(columns="Price")

X_train, X_test, y_train, y_test = split(X, y)

# Standardise the listed numeric features (fitted on the training split only);
# the indicator columns are left as they are.
# NOTE(review): 'Car' is also numeric but is not in this list - confirm that
# leaving it unscaled is intentional.
numerical_columns = ["Distance", 'Landsize', "Bedroom2", 'Bathroom', "Rooms"]
scaled_train, scaled_test = scale(X_train[numerical_columns], X_test[numerical_columns])
X_train[numerical_columns] = scaled_train
X_test[numerical_columns] = scaled_test

In [116]:
# Fit every regressor on the same train/test split and report its test-set
# R-square and fit time. Each entry maps a display name to a callable that
# takes (X_train, y_train, X_test, y_test) and returns (r2, runtime).
regressors = {
    "Linear Regression": linear_regression,
    "Decision Tree": decision_tree_regressor,
    # The SVR is the only model run with a non-default setting: a polynomial kernel.
    "Support Vector Regressor": lambda *split_data: support_vector_regressor(*split_data, 'poly'),
    "AdaBoost": adaboost_regressor,
    "XGBoost": xgboost_regressor,
    "CatBoost": catboost_regressor,
}

r2_scores = {}
for model_name, fit_and_score in regressors.items():
    r2, runtime = fit_and_score(X_train, y_train, X_test, y_test)
    r2_scores[model_name] = r2
    print(f"The R-square of {model_name} is {r2:.2f}")
    print(f"The runtime is {runtime:.2f}")

# Keep the per-model variables this cell has always defined, so anything that
# reads them later keeps working. Despite the acc_ prefix they hold R-square values.
acc_lr = r2_scores["Linear Regression"]
acc_dt = r2_scores["Decision Tree"]
acc_svm = r2_scores["Support Vector Regressor"]
acc_ada = r2_scores["AdaBoost"]
acc_xg = r2_scores["XGBoost"]
acc_cat = r2_scores["CatBoost"]

The R-square of Linear Regression is 0.68
The runtime is 0.21
The R-square of Decision Tree is 0.60
The runtime is 0.11
The R-square of Support Vector Classifier is -0.08
The runtime is 16.84
The R-square of AdaBoost is 0.09
The runtime is 4.73
The R-square of XGBoost is 0.79
The runtime is 0.76
The R-square of CatBoost is 0.81
The runtime is 0.82
