This is a synthetic (imaginary) dataset of house prices in an urban environment - Paris. There are two files: ParisHousingClass.csv, in which 100% of the instances are correctly classified, and ParisHousingClass99.88.csv, in which 99.88% of the instances are correctly classified. This is a binary classification problem with two classes - Basic and Luxury. Given data about a particular house, we have to predict whether it is a basic or a luxury house.

I will use this toy data to evaluate various classification models and compare them against a three-layer neural network.

# Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow
import keras
from keras import (layers, models, regularizers)
from keras import backend as K

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Load data

In [None]:
# Locations of the two dataset variants inside the Kaggle input directory.
data1_path = "/kaggle/input/paris-housing-classification/ParisHousingClass.csv"
data2_path = "/kaggle/input/paris-housing-classification/ParisHousingClass99.88.csv"

# Load each CSV into its own dataframe.
data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path)

In [None]:
# Print the (rows, columns) shape of each dataframe, data1 first.
for frame in (data1, data2):
    print(frame.shape)

In [None]:
# Preview the first five rows of the first dataframe.
data1.head(n=5)

In [None]:
# Preview the first five rows of the second dataframe.
data2.head(n=5)

In [None]:
# Check that both dataframes have the same columns in the same order.
# Index.equals returns False when the column counts differ, whereas an
# element-wise `==` raises on indexes of different lengths.
data1.columns.equals(data2.columns)

In [None]:
# Print the total number of NaN cells in each dataframe, data1 first.
for frame in (data1, data2):
    per_column_missing = frame.isna().sum()
    print(per_column_missing.sum())

Concatenate the two dataframes row-wise (along the index) into a single dataframe.

In [None]:
# Stack the two dataframes vertically and renumber the rows from 0.
data = pd.concat([data1, data2], axis=0, ignore_index=True)
data.shape

# Encoding target values

In [None]:
# Integer code assigned to each class label.
category = {'Basic':0, 'Luxury': 1}
# Replace the string labels with their integer codes; labels that are not
# keys of the mapping become NaN.
data = data.assign(category=data['category'].map(category))

# Distribution of target class

In [None]:
# Bar chart of how many rows fall into each target class.
plt.figure(figsize=(8,5))
# Pass the column as `x=`: a positional argument is bound to `data=` in
# current seaborn releases, which does not produce per-class counts.
sns.countplot(x=data['category'])
plt.title('Class distribution')
plt.show()

# Correlation between features and target

In [None]:
# Pairwise correlation between all columns (features and encoded target).
corr = data.corr()

# Draw the correlation matrix as a heatmap on a 15x8 inch figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr, ax=ax)
plt.show()

## Number of rooms

In [None]:
# Density histogram with a KDE overlay of the number of rooms.
# `distplot` is deprecated and removed in recent seaborn; `histplot` with
# kde=True and stat='density' is its documented replacement.
sns.histplot(x=data['numberOfRooms'], kde=True, stat='density')
plt.show()

## Distribution of construction year

In [None]:
# Bar chart of the number of houses per value of the 'made' column.
plt.figure(figsize=(8,5))
# Pass the column as `x=`: a positional argument is bound to `data=` in
# current seaborn releases, which does not produce per-value counts.
sns.countplot(x=data['made'])
# Rotate the tick labels so the values do not overlap.
plt.xticks(rotation=50)
plt.show()

## Distribution of price

In [None]:
# Density histogram with a KDE overlay of the price column.
# `distplot` is deprecated and removed in recent seaborn; `histplot` with
# kde=True and stat='density' is its documented replacement.
sns.histplot(x=data['price'], kde=True, stat='density')
# Render explicitly, as the other plotting cells do.
plt.show()

## Normalizing the numeric features

In [None]:
# Columns that get standardized before modelling.
numeric_features = [
    'squareMeters',
    'numberOfRooms',
    'floors',
    'cityCode',
    'cityPartRange',
    'numPrevOwners',
    'made',
    'basement',
    'attic',
    'garage',
    'hasGuestRoom',
    'price',
]

In [None]:
# Standardize every numeric feature to zero mean and unit standard deviation.
# NOTE(review): the mean/std are computed over the full dataframe, i.e. before
# the train/validation/test split performed later in this notebook, so the
# held-out rows contribute to the scaling statistics.
for column in numeric_features:
    values = data[column]
    data[column] = (values - values.mean()) / values.std()

# Splitting data into train, validation and test sets

In [None]:
# Shuffle all rows. The fixed seed makes the split (and therefore the model
# comparison) reproducible between runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Fractions of the rows reserved for the test and validation sets.
test_frac = .15
val_frac = .15

# Row counts of each partition; the training set takes whatever is left.
n_rows = len(data)
n_test = int(n_rows * test_frac)
n_val = int(n_rows * val_frac)
n_train = n_rows - n_val - n_test

# Positional slices in the order train | validation | test. Using iloc avoids
# relying on index labels and also works when a partition is empty.
train_data = data.iloc[:n_train]
val_data = data.iloc[n_train:n_train + n_val]
test_data = data.iloc[n_train + n_val:]

In [None]:
# Separate the target column from the features for each partition.
# The column is dropped by name so the result does not depend on 'category'
# being the last column of the dataframe.
train_labels = train_data['category']
train_data = train_data.drop(columns='category')
val_labels = val_data['category']
val_data = val_data.drop(columns='category')
test_labels = test_data['category']
test_data = test_data.drop(columns='category')

# Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the training set.
lr = LogisticRegression().fit(train_data, train_labels)

# Score the validation predictions with F1, rounded to four decimals.
y_pred_val = lr.predict(val_data)
val_f1 = f1_score(val_labels, y_pred_val)
np.round(val_f1, 4)

Confusion Matrix

In [None]:
# Confusion matrix of the most recent validation predictions.
confusion_matrix(y_true=val_labels, y_pred=y_pred_val)

# XGBoost

In [None]:
# Fit an XGBoost classifier with default settings on the training set.
xg = XGBClassifier().fit(train_data, train_labels)

# Score the validation predictions with F1, rounded to four decimals.
y_pred_val = xg.predict(val_data)
val_f1 = f1_score(val_labels, y_pred_val)
np.round(val_f1, 4)

# Neural Network

In [None]:
def recall_m(y_true, y_pred):
    """Return recall computed with Keras backend ops.

    Counts are obtained by clipping values to [0, 1], rounding, and summing:
    the numerator uses ``y_true * y_pred`` and the denominator uses
    ``y_true``. ``K.epsilon()`` is added to the denominator to avoid a
    division by zero.
    """
    def _rounded_count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hit_count = _rounded_count(y_true * y_pred)
    actual_positive_count = _rounded_count(y_true)
    return hit_count / (actual_positive_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision computed with Keras backend ops.

    Counts are obtained by clipping values to [0, 1], rounding, and summing:
    the numerator uses ``y_true * y_pred`` and the denominator uses
    ``y_pred``. ``K.epsilon()`` is added to the denominator to avoid a
    division by zero.
    """
    def _rounded_count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hit_count = _rounded_count(y_true * y_pred)
    predicted_positive_count = _rounded_count(y_pred)
    return hit_count / (predicted_positive_count + K.epsilon())

def f1_m(y_true, y_pred):
    """Return the F1 score built from ``precision_m`` and ``recall_m``.

    Computes ``2 * (p * r) / (p + r + K.epsilon())``; the epsilon keeps the
    denominator non-zero when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [None]:
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.001), input_shape=(17, )))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='softmax'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy',  metrics=['acc',f1_m, precision_m, recall_m])

In [None]:
history = model.fit(train_data, train_labels, epochs=100, batch_size=128, validation_data=(val_data, val_labels), verbose=0)

In [None]:
y_pred_val = model.predict(val_data)
np.round(f1_score(val_labels, y_pred_val),4)

# Support Vector Machines

In [None]:
sv = SVC()
sv.fit(train_data, train_labels)
y_pred_val = sv.predict(val_data)
np.round(f1_score(val_labels, y_pred_val),4)

# Random Forest Classifier

In [None]:
rf = RandomForestClassifier()
rf.fit(train_data, train_labels)
y_pred_val = rf.predict(val_data)
np.round(f1_score(val_labels, y_pred_val),4)

# Prediction on Test data

In [None]:
y_lr = lr.predict(test_data) #logistic regression
y_sv = sv.predict(test_data) #support vector machines
y_rf = rf.predict(test_data) #random forest
y_xg = xg.predict(test_data) #xgboost
y_nn = model.predict(test_data) #neural network

In [None]:
# Test-set predictions of every model, keyed by the name shown in the report.
# The labels fix the original output text ("Logical Regression", "Random").
test_predictions = {
    'Logistic Regression': y_lr,
    'SVM': y_sv,
    'Random Forest': y_rf,
    'XGBoost': y_xg,
    'NN': y_nn,
}

# Print each model's F1 score on the test set, rounded to four decimals.
for model_name, y_hat in test_predictions.items():
    print(f'\n{model_name} f1-score:\n')
    print(np.round(f1_score(test_labels, y_hat), 4))

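In the original run of this notebook, the simple neural network was the worst performer, and the linear models outperformed the non-linear models. These conclusions reflect that single run on toy data; re-check them against the scores printed above whenever the notebook is re-executed.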