# XGBoost Assignment - Paris Housing

## XGBoost on the dataset that yields 100% accuracy

In [1]:
import pandas as pd

# read the Paris housing classification data from CSV into a DataFrame
df = pd.read_csv('./dataset/ParisHousingClass.csv')
# preview the first rows of the DataFrame
df.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5,Basic
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5,Luxury
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1,Basic
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2,Basic
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0,Luxury


In [2]:
# dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(10000, 18)

In [3]:
# print a concise summary of the DataFrame: column names, non-null counts per column,
# data types and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              1000

In [4]:
# summary statistics for every column; include='all' also covers the non-numeric
# 'category' column, which is summarised by count/unique/top/freq
df.describe(include='all')

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000
unique,,,,,,,,,,,,,,,,,,2
top,,,,,,,,,,,,,,,,,,Basic
freq,,,,,,,,,,,,,,,,,,8735
mean,49870.1312,50.3584,0.5087,0.4968,50.2763,50225.4861,5.5101,5.5217,2005.4885,0.4991,0.4999,5033.1039,5028.0106,553.1212,0.503,4.9946,4993448.0,
std,28774.37535,28.816696,0.499949,0.500015,28.889171,29006.675799,2.872024,2.856667,9.30809,0.500024,0.500025,2876.729545,2894.33221,262.05017,0.500016,3.17641,2877424.0,
min,89.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1990.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,10313.5,
25%,25098.5,25.0,0.0,0.0,25.0,24693.75,3.0,3.0,1997.0,0.0,0.0,2559.75,2512.0,327.75,0.0,2.0,2516402.0,
50%,50105.5,50.0,1.0,0.0,50.0,50693.0,5.0,5.0,2005.5,0.0,0.0,5092.5,5045.0,554.0,1.0,5.0,5016180.0,
75%,74609.75,75.0,1.0,1.0,76.0,75683.25,8.0,8.0,2014.0,1.0,1.0,7511.25,7540.5,777.25,1.0,8.0,7469092.0,


In [5]:
# count the missing (NaN) entries in each column
df.isnull().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
category             0
dtype: int64

In [6]:
# show every row that is an exact duplicate of an earlier row (empty when there are none)
df.loc[df.duplicated()]

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category


In [7]:
# class distribution of the target column
df.loc[:, 'category'].value_counts()

category
Basic     8735
Luxury    1265
Name: count, dtype: int64

In [8]:
# importing LabelEncoder to convert the categorical target labels to integers
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# overwrite the string labels in place with their integer codes; in the outputs shown
# in this notebook 'Basic' becomes 0 and 'Luxury' becomes 1
df['category'] = encoder.fit_transform(df['category'])

In [9]:
# separate the target column from the predictors (every column except 'category')
y = df['category']
X = df.drop(columns='category')
X.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [10]:
# preview the first encoded target values
y.head()

0    0
1    1
2    0
3    0
4    1
Name: category, dtype: int32

In [11]:
# shapes of the feature matrix and the target vector, one per line
print(X.shape, y.shape, sep='\n')

(10000, 17)
(10000,)


In [12]:
# hold out 25% of the rows as test data; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# shapes of the train/test features followed by the train/test targets, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(7500, 17)
(2500, 17)
(7500,)
(2500,)


In [14]:
# 'StandardScaler' is used to rescale the feature columns
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics from the training data only
scaler = StandardScaler().fit(X_train)

# apply those training statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# inspect the last row of the scaled training data; a negative index avoids
# hard-coding the number of training rows
X_train_scaled[-1]

array([-1.60850369,  1.16068252,  0.99097407,  1.00883906,  0.78591487,
        1.30583422,  1.20838481, -1.23437844, -0.27623442, -0.99653932,
       -0.9968051 , -0.01339597,  0.72371049,  1.00950158, -1.00427581,
        1.59069655, -1.60721971])

In [16]:
# inspect the last row of the scaled test data; a negative index avoids
# hard-coding the number of test rows
X_test_scaled[-1]

array([ 0.36421822, -0.78365664, -1.00910814,  1.00883906, -1.1141882 ,
       -0.6329542 ,  1.20838481,  0.86252718,  1.65477464,  1.0034727 ,
        1.00320514,  1.16144241,  0.76057246, -1.23547445,  0.9957424 ,
       -0.30364131,  0.36324911])

In [17]:
# importing 'xgboost' and 'XGBClassifier'
# (only XGBClassifier is referenced in the cells shown in this notebook)
import xgboost
from xgboost import XGBClassifier
# classifier created with its default parameters
model = XGBClassifier()

# train the model on the scaled training features and the encoded training labels
model.fit(X_train_scaled, y_train)

In [18]:
# predict the class label for every row of the scaled test data
y_pred = model.predict(X_test_scaled)
# display the predicted labels
y_pred

array([0, 1, 0, ..., 0, 0, 0])

In [19]:
# evaluate the accuracy of the model on the held-out test data
from sklearn.metrics import accuracy_score

# compare the true test labels with the predictions and report the score as a percentage
test_accuracy = accuracy_score(y_test, y_pred)
print(f'The accuracy of Paris Housing XGBoost model is: {test_accuracy*100:.2f}%')

The accuracy of Paris HOusing XGBoost model is: 100.00%


In [20]:
# evaluate accuracy on the training dataset, for comparison with the test accuracy
train_pred = model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_pred)
print(f'The accuracy of Paris Housing XGBoost trained model is: {train_accuracy*100:.2f}%')

The accuracy of Paris HOusing XGBoost trained model is: 100.00%


In [21]:
# additional performance metrics computed from the test predictions
from sklearn.metrics import classification_report, confusion_matrix

# per-class precision / recall / f1-score / support as formatted text
classification_report_result = classification_report(y_test, y_pred)
# counts of true labels (rows) versus predicted labels (columns)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# label each output with the name of the metric it actually shows
print('Classification report: \n', classification_report_result)
print('Confusion matrix: \n', confusion_matrix_result)

Classification matrix: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2168
           1       1.00      1.00      1.00       332

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

Confusion matrix: 
 [[2168    0]
 [   0  332]]


## XGBoost on the 99.88% accuracy dataset

In [22]:
# read the second Paris housing CSV into its own DataFrame
data = pd.read_csv('./dataset/ParisHousingClass99.88.csv')
# preview the first rows of the DataFrame
data.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5,Basic
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5,Luxury
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1,Basic
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2,Luxury
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0,Luxury


In [23]:
# dimensions of the DataFrame as a (rows, columns) tuple
data.shape

(10000, 18)

In [24]:
# print a concise summary of the DataFrame: column names, non-null counts per column,
# data types and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              1000

In [25]:
# summary statistics; called without arguments, so only the numeric columns are
# described (the 'category' column is summarised separately)
data.describe()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49870.1312,50.3584,0.5087,0.4968,50.2763,50225.4861,5.5101,5.5217,2005.4885,0.4991,0.4999,5033.1039,5028.0106,553.1212,0.503,4.9946,4993448.0
std,28774.37535,28.816696,0.499949,0.500015,28.889171,29006.675799,2.872024,2.856667,9.30809,0.500024,0.500025,2876.729545,2894.33221,262.05017,0.500016,3.17641,2877424.0
min,89.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1990.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,10313.5
25%,25098.5,25.0,0.0,0.0,25.0,24693.75,3.0,3.0,1997.0,0.0,0.0,2559.75,2512.0,327.75,0.0,2.0,2516402.0
50%,50105.5,50.0,1.0,0.0,50.0,50693.0,5.0,5.0,2005.5,0.0,0.0,5092.5,5045.0,554.0,1.0,5.0,5016180.0
75%,74609.75,75.0,1.0,1.0,76.0,75683.25,8.0,8.0,2014.0,1.0,1.0,7511.25,7540.5,777.25,1.0,8.0,7469092.0
max,99999.0,100.0,1.0,1.0,100.0,99953.0,10.0,10.0,2021.0,1.0,1.0,10000.0,10000.0,1000.0,1.0,10.0,10006770.0


In [26]:
# summary statistics for object-dtype columns only (count, unique, top, freq);
# here that is the 'category' column
data.describe(include='O')

Unnamed: 0,category
count,10000
unique,2
top,Basic
freq,7470


In [27]:
# count the missing (NaN) entries in each column
data.isnull().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
category             0
dtype: int64

In [28]:
# show every row that is an exact duplicate of an earlier row (empty when there are none)
data.loc[data.duplicated()]

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category


In [29]:
# class distribution of the target column before encoding
data.loc[:, 'category'].value_counts()

category
Basic     7470
Luxury    2530
Name: count, dtype: int64

In [30]:
# encode the 'category' labels as integers in place; this refits the LabelEncoder
# instance created earlier in the notebook on this second dataset
data['category'] = encoder.fit_transform(data['category'])

In [31]:
# class distribution again, now keyed by the integer codes
data['category'].value_counts()

category
0    7470
1    2530
Name: count, dtype: int64

In [32]:
# separate the target column from the predictors (every column except 'category')
b = data['category']
A = data.drop(columns='category')
A.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [33]:
# preview the first encoded target values
b.head()

0    0
1    1
2    0
3    1
4    1
Name: category, dtype: int32

In [34]:
# shapes of the feature matrix and the target vector, one per line
print(A.shape, b.shape, sep='\n')

(10000, 17)
(10000,)


In [35]:
# hold out 25% of the rows as test data (this split uses random_state=40)
A_train, A_test, b_train, b_test = train_test_split(
    A,
    b,
    test_size=0.25,
    random_state=40,
)

In [36]:
# shapes of the train/test features followed by the train/test targets, one per line
print(A_train.shape, A_test.shape, b_train.shape, b_test.shape, sep='\n')

(7500, 17)
(2500, 17)
(7500,)
(2500,)


In [37]:
# use a dedicated scaler for the second dataset: refitting the shared `scaler` here
# would overwrite the statistics learned from X_train, which `model` was trained with
scaler_two = StandardScaler()

# fit and transform the training data
A_train_scaled = scaler_two.fit_transform(A_train)
# transform the testing data with the statistics learned from the training data
A_test_scaled = scaler_two.transform(A_test)

In [38]:
# train a second XGBClassifier, created with its default parameters, on the scaled
# training features and encoded training labels of the second dataset
model_two = XGBClassifier()
model_two.fit(A_train_scaled, b_train)

In [39]:
# predict the class label for every row of the scaled test data
b_pred = model_two.predict(A_test_scaled)
# print the predicted labels
print(b_pred)

[0 0 1 ... 0 0 0]


In [40]:
# compare the true test labels with model_two's predictions and report the score as a percentage
accuracy_test = accuracy_score(y_true=b_test, y_pred=b_pred)
print(f'The accuracy of XGBoost Paris Housing model is: {accuracy_test*100:.2f}%')

The accuracy of XGBoost Paris Housing model is: 99.84%


In [41]:
# accuracy of model_two on the data it was trained on, for comparison with the test accuracy
training_pred = model_two.predict(A_train_scaled)
accuracy_train = accuracy_score(y_true=b_train, y_pred=training_pred)
print(f'The accuracy of XGBoost Paris Housing training model is: {accuracy_train*100:.2f}%')

The accuracy of XGBoost Paris Housing training model is: 100.00%


In [42]:
# additional performance metrics computed from model_two's test predictions
# per-class precision / recall / f1-score / support as formatted text
report_classification = classification_report(b_test, b_pred)
# counts of true labels (rows) versus predicted labels (columns)
report_confusion = confusion_matrix(b_test, b_pred)

# label each output with the name of the metric it actually shows
print('Classification Report: \n', report_classification)
print('Confusion Matrix: \n', report_confusion)

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1869
           1       1.00      0.99      1.00       631

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

Confusion Report: 
 [[1869    0]
 [   4  627]]
