In [222]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

In [223]:
# Helper Functions
def load_csv(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` and return the DataFrame."""
    return pd.read_csv(path)

def load_excel(path):
    """Read the workbook at ``path`` with ``pd.read_excel`` and return the DataFrame."""
    return pd.read_excel(path)

def split(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test subsets.

    Parameters
    ----------
    X, y
        Features and target; passed unchanged to ``train_test_split``.
    test_size : default 0.2
        Forwarded to ``train_test_split``.
    random_state : default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):
    """Standardise both feature sets with a ``StandardScaler`` fitted on ``X_train``.

    The scaler is fitted on the training features only and then applied to the
    test features. Returns ``(X_train_scaled, X_test_scaled)``.
    """
    standardiser = StandardScaler()
    train_scaled = standardiser.fit_transform(X_train)
    test_scaled = standardiser.transform(X_test)
    return train_scaled, test_scaled

def one_hot_encode(df, column):
    """Return a copy of ``df`` with ``column`` replaced by one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the encoded columns, named
        by ``OneHotEncoder.get_feature_names_out([column])``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[[column]])
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and add NaN rows) whenever df's index
    # is not 0..n-1, e.g. after rows have been dropped.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )
    return pd.concat([df.drop(columns=column), encoded_df], axis=1)

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` on the training data.

    Returns ``accuracy_score(y_test, predictions)`` for the test features.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVC`` with the given ``kernel`` on the training data.

    Returns ``accuracy_score(y_test, predictions)`` for the test features.
    """
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVR`` with the given ``kernel`` on the training data.

    Returns ``r2_score(y_test, predictions)`` for the test features.
    """
    model = SVR(kernel=kernel)
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

def decision_tree_classifier(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeClassifier`` and return its test-set accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels used for scoring.
    random_state : default 42
        Seed passed to the tree so repeated runs give the same score, in line
        with the other seeded helpers in this notebook.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    predictor = DecisionTreeClassifier(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def decision_tree_regressor(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeRegressor`` and return its test-set R².

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for scoring.
    random_state : default 42
        Seed passed to the tree so repeated runs give the same score, in line
        with the other seeded helpers in this notebook.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)``.
    """
    predictor = DecisionTreeRegressor(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LinearRegression`` on the training data.

    Returns ``r2_score(y_test, predictions)`` for the test features.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostClassifier`` and return its test-set accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels used for scoring.
    n_estimators : default 200
        Forwarded to ``AdaBoostClassifier``.
    random_state : default 42
        Seed forwarded to ``AdaBoostClassifier``; exposed as a parameter to
        match ``xgboost_classifier``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBClassifier`` with the given ``n_estimators`` and ``random_state``.

    Returns ``accuracy_score(y_test, predictions)`` for the test features.
    """
    model = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostClassifier`` (silent training) and return its test-set accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels used for scoring.
    iterations : default 200
        Forwarded to ``CatBoostClassifier``.
    random_state : default 42
        Seed forwarded to ``CatBoostClassifier``; exposed as a parameter to
        match ``xgboost_classifier``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostRegressor`` and return its test-set R².

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for scoring.
    n_estimators : default 200
        Forwarded to ``AdaBoostRegressor``.
    random_state : default 42
        Seed forwarded to ``AdaBoostRegressor``; exposed as a parameter to
        match ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)``.
    """
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBRegressor`` and return its test-set R².

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for scoring.
    n_estimators : default 200
        Forwarded to ``XGBRegressor``.
    random_state : default 42
        Seed forwarded to ``XGBRegressor``; exposed as a parameter to match
        ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)``.
    """
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostRegressor`` (silent training) and return its test-set R².

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for scoring.
    iterations : default 200
        Forwarded to ``CatBoostRegressor``.
    random_state : default 42
        Seed forwarded to ``CatBoostRegressor``; exposed as a parameter to
        match ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)``.
    """
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

In [224]:
# Location of the Melbourne housing CSV. Set the MELB_DATA_PATH environment
# variable to run the notebook on another machine; otherwise the original
# local path is used.
path = os.environ.get(
    "MELB_DATA_PATH",
    r"G:\Study\iit kharagpur\ML Lab\Lab 1\ML_Class_Tutorial\Datasets\melb_data.csv",
)

data = load_csv(path)
data.tail(10)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
13570,Wantirna South,34 Fewster Dr,3,h,970000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,674.0,,,,-37.8836,145.22805,Eastern Metropolitan,7082.0
13571,Wantirna South,15 Mara Cl,4,h,1330000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,717.0,191.0,1980.0,,-37.86887,145.22116,Eastern Metropolitan,7082.0
13572,Watsonia,76 Kenmare St,2,h,650000.0,PI,Morrison,26/08/2017,14.5,3087.0,...,1.0,1.0,210.0,79.0,2006.0,,-37.70657,145.07878,Northern Metropolitan,2329.0
13573,Werribee,5 Nuragi Ct,4,h,635000.0,S,hockingstuart,26/08/2017,14.7,3030.0,...,2.0,1.0,662.0,172.0,1980.0,,-37.89327,144.64789,Western Metropolitan,16166.0
13574,Westmeadows,9 Black St,3,h,582000.0,S,Red,26/08/2017,16.5,3049.0,...,2.0,2.0,256.0,,,,-37.67917,144.8939,Northern Metropolitan,2474.0
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0
13579,Yarraville,6 Agnes St,4,h,1285000.0,SP,Village,26/08/2017,6.3,3013.0,...,1.0,1.0,362.0,112.0,1920.0,,-37.81188,144.88449,Western Metropolitan,6543.0


### Data Pre-processing

In [225]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [226]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [227]:
# Number of rows per suburb, to gauge how many distinct values the column has.
data["Suburb"].value_counts()

Suburb
Reservoir         359
Richmond          260
Bentleigh East    249
Preston           239
Brunswick         222
                 ... 
Sandhurst           1
Bullengarook        1
Croydon South       1
Montrose            1
Monbulk             1
Name: count, Length: 314, dtype: int64

In [228]:
# Number of rows per category of Type (this column is one-hot encoded further down).
data['Type'].value_counts()

Type
h    9449
u    3017
t    1114
Name: count, dtype: int64

In [229]:
# Count of missing values in each column.
data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [230]:
# Distinct values of Car, including nan for the rows where it is missing.
data['Car'].unique()

array([ 1.,  0.,  2.,  6.,  5.,  4.,  3.,  8.,  7.,  9., 10., nan])

In [231]:
# Columns left out of the model. BuildingArea, YearBuilt and CouncilArea have
# many missing values (see the isna() summary); the rest are identifiers,
# dates or location fields that are not used as features here.
# 'Postcode' was listed twice in the original list; the duplicate is removed.
dropped_columns = [
    'Suburb', 'Address', 'Method', 'SellerG', 'Date', 'Regionname',
    'Propertycount', 'BuildingArea', 'YearBuilt', 'CouncilArea',
    'Postcode', 'Lattitude', 'Longtitude',
]
# `columns=` already selects the axis, so `axis=1` is dropped; rebinding
# instead of inplace=True leaves any previously displayed frame untouched.
data = data.drop(columns=dropped_columns)
data.head()

Unnamed: 0,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize
0,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0
1,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0
2,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0
3,3,h,850000.0,2.5,3.0,2.0,1.0,94.0
4,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0


In [232]:
# Replace the categorical Type column with one indicator column per category.
data = one_hot_encode(data, column='Type')
data.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,Type_h,Type_t,Type_u
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,1.0,0.0,0.0
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,1.0,0.0,0.0
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,1.0,0.0,0.0
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,1.0,0.0,0.0
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,1.0,0.0,0.0


In [233]:
# Discard every row that still contains a missing value, then re-check the frame.
data = data.dropna(axis="index")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13518 entries, 0 to 13579
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rooms     13518 non-null  int64  
 1   Price     13518 non-null  float64
 2   Distance  13518 non-null  float64
 3   Bedroom2  13518 non-null  float64
 4   Bathroom  13518 non-null  float64
 5   Car       13518 non-null  float64
 6   Landsize  13518 non-null  float64
 7   Type_h    13518 non-null  float64
 8   Type_t    13518 non-null  float64
 9   Type_u    13518 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 1.1 MB


In [234]:
# Confirm that no missing values remain after dropping incomplete rows.
data.isna().sum()

Rooms       0
Price       0
Distance    0
Bedroom2    0
Bathroom    0
Car         0
Landsize    0
Type_h      0
Type_t      0
Type_u      0
dtype: int64

In [235]:
# Separate the regression target (Price) from the predictor columns.
y = data["Price"]
X = data.drop(columns=["Price"])

# Hold out a test set, then standardise the features with a scaler fitted on the training part.
X_train, X_test, y_train, y_test = split(X, y)
X_train, X_test = scale(X_train, X_test)

In [237]:
# Fit each regressor on the same train/test split and report its test-set R².
# The acc_* variables hold R² scores (not accuracies); the names are kept
# unchanged in case later cells refer to them.

# Linear Regression
acc_lr = linear_regression(X_train, y_train, X_test, y_test)
print(f"The R-square of Linear Regression is {acc_lr:.2f}")

# Decision Tree
acc_dt = decision_tree_regressor(X_train, y_train, X_test, y_test)
print(f"The R-square of Decision Tree is {acc_dt:.2f}")

# Support Vector Machine (regressor, polynomial kernel)
# NOTE(review): only X is standardised; SVR is fitted on the raw Price target,
# which may explain a weak score here — confirm.
acc_svm = support_vector_regressor(X_train, y_train, X_test, y_test, 'poly')
print(f"The R-square of Support Vector Regressor is {acc_svm:.2f}")

# AdaBoost
acc_ada  = adaboost_regressor(X_train, y_train, X_test, y_test)
print(f"The R-square of AdaBoost is {acc_ada:.2f}")

# XGBoost
acc_xg = xgboost_regressor(X_train, y_train, X_test, y_test)
print(f"The R-square of XGBoost is {acc_xg:.2f}")

# CatBoost
acc_cat = catboost_regressor(X_train, y_train, X_test, y_test)
print(f"The R-square of CatBoost is {acc_cat:.2f}")

The R-square of Linear Regression is 0.46
The R-square of Decision Tree is 0.30
The R-square of Support Vector Classifier is -0.08
The R-square of AdaBoost is 0.14
The R-square of XGBoost is 0.65
The R-square of CatBoost is 0.68
