## About Features

All attributes except the target `category` are numeric variables, and they are listed below:

- squareMeters
- numberOfRooms
- hasYard
- hasPool
- floors : number of floors
- cityCode : zip code
- cityPartRange : the higher the range, the more exclusive the neighbourhood is
- numPrevOwners : number of previous owners
- made : year
- isNewBuilt
- hasStormProtector
- basement : basement square meters
- attic : attic square meters
- garage : garage size
- hasStorageRoom
- hasGuestRoom : number of guest rooms
- price : price of a house
- category : Luxury or Basic

**Our task is to predict the 'category'**

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset CSV into a DataFrame; the path is relative to the current working directory.
df = pd.read_csv('../input/paris-housing-classification/ParisHousingClass.csv')

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

Good! We have no null values.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

# EDA

In [None]:
# Shared face colour applied to every figure and axes in the EDA cells below.
background_color = '#F8EDF4'

### Countplot of Target Feature (Category)

In [None]:
# Two-panel figure for the target: a text caption on the left and the
# class counts of 'category' on the right.
fig = plt.figure(figsize=(14, 6))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 2, wspace=0.3)
ax0, ax1 = (fig.add_subplot(gs[0, col]) for col in range(2))
axes = [ax0, ax1]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, 'Countplot of Category\n____________',
         ha='center', va='center',
         fontsize=18, fontfamily='serif', fontweight='bold')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# Count panel: one bar per class with a dotted horizontal grid.
sns.countplot(x='category', data=df, ax=ax1, palette='spring_r')
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
ax1.set(xlabel='', ylabel='')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Number of rows per target class.
df['category'].value_counts()

### Countplots of Categorical Features

In [None]:
# Three-panel figure for 'hasYard': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'hasYard'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='hasYard', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. hasYard (with row/column totals),
# shaded by cell value.
yard_table = pd.crosstab(df['category'], df['hasYard'], margins=True)
yard_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'hasPool': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'hasPool'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='hasPool', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. hasPool (with row/column totals),
# shaded by cell value.
pool_table = pd.crosstab(df['category'], df['hasPool'], margins=True)
pool_table.style.background_gradient(cmap='Wistia')

In [None]:
# Three-panel figure for 'cityPartRange': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'cityPartRange'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='cityPartRange', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. cityPartRange (with row/column totals),
# shaded by cell value.
city_part_table = pd.crosstab(df['category'], df['cityPartRange'], margins=True)
city_part_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'numPrevOwners': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'numPrevOwners'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='numPrevOwners', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. numPrevOwners (with row/column totals),
# shaded by cell value.
prev_owners_table = pd.crosstab(df['category'], df['numPrevOwners'], margins=True)
prev_owners_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'isNewBuilt': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'isNewBuilt'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='isNewBuilt', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. isNewBuilt (with row/column totals),
# shaded by cell value.
new_built_table = pd.crosstab(df['category'], df['isNewBuilt'], margins=True)
new_built_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'hasStormProtector': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'hasStormProtector'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='hasStormProtector', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. hasStormProtector (with row/column totals),
# shaded by cell value.
storm_protector_table = pd.crosstab(df['category'], df['hasStormProtector'], margins=True)
storm_protector_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'hasStorageRoom': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'hasStorageRoom'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='hasStorageRoom', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. hasStorageRoom (with row/column totals),
# shaded by cell value.
storage_room_table = pd.crosstab(df['category'], df['hasStorageRoom'], margins=True)
storage_room_table.style.background_gradient(cmap="Wistia")

In [None]:
# Three-panel figure for 'hasGuestRoom': caption | overall counts | counts by category.
fig = plt.figure(figsize=(16, 5))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1, 3, hspace=0.2, wspace=0.3)
ax0, ax1, ax2 = (fig.add_subplot(gs[0, col]) for col in range(3))
axes = [ax0, ax1, ax2]

# Caption panel: centred text only, so ticks, tick labels and the bottom
# spine are removed.
ax0.text(0.5, 0.5, "Countplot of 'hasGuestRoom'\n_________________",
         ha='center', va='center',
         fontsize=18, fontweight='bold', fontfamily='serif')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
ax0.spines['bottom'].set_visible(False)

# ax1 shows the overall counts, ax2 the same counts split by target class;
# both share the dotted horizontal grid and have their axis labels cleared.
for panel, extra in ((ax1, {}), (ax2, {'hue': 'category'})):
    sns.countplot(x='hasGuestRoom', data=df, ax=panel, palette='spring_r', **extra)
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
    panel.set_xlabel('')
    panel.set_ylabel('')

# Common look: page background colour, no top/right/left frame lines.
for ax in axes:
    ax.set_facecolor(background_color)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

In [None]:
# Contingency table of category vs. hasGuestRoom (with row/column totals),
# shaded by cell value.
guest_room_table = pd.crosstab(df['category'], df['hasGuestRoom'], margins=True)
guest_room_table.style.background_gradient(cmap="Wistia")

What I can see from the graphs:


- All of the categorical features are quite balanced
- Every 'Luxury' house has Yard and Pool.

### Distributions of Continuous Features

In [None]:
# Columns plotted as class-conditional densities in the next two cells:
# the first cell draws the first five entries, the second cell the remaining six.
cont_features = ['squareMeters', 'numberOfRooms', 'floors', 'cityPartRange', 'numPrevOwners', 'made', 'basement', 'attic', 'garage', 'hasGuestRoom', 'price']

In [None]:
# Part 1 of 2: class-conditional density plots for the first five entries of
# cont_features (the plots are split over two cells to keep each cell short).

# Only two rows of panels are drawn, so the grid has two rows and the figure
# height is sized for two rows; reserving four rows left half the canvas blank.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(2, 3)
gs.update(wspace=0.3, hspace=0.3)

ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])
ax3 = fig.add_subplot(gs[1, 0])
ax4 = fig.add_subplot(gs[1, 1])
ax5 = fig.add_subplot(gs[1, 2])

axes = [ax0, ax1, ax2, ax3, ax4, ax5]
fig.patch.set_facecolor(background_color)


# Title panel (top-left): heading plus a manual colour key, since the
# per-plot legends are switched off below.
ax0.text(0.5, 0.5, 'Distribution of Continuous Features\n by Category\n ___________________\n',
        fontsize=18, fontfamily='serif', fontweight='bold',
        horizontalalignment='center',
        verticalalignment='center')

# NOTE(review): the colour names assume 'spring_r' renders Basic as orange and
# Luxury as red -- confirm against the rendered figure.
ax0.text(0.5, 0.3, 'Orange : Basic\n Red : Luxury',
        fontsize=14, fontfamily='serif', fontweight='bold',
        horizontalalignment='center',
        verticalalignment='center')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
# Plain string key, matching the other cells (list indexing of spines needs
# matplotlib >= 3.4).
ax0.spines['bottom'].set_visible(False)


# Shared look for every panel, title panel included.
for ax in axes:
    for s in ['top', 'right', 'left']:
        ax.spines[s].set_visible(False)
    ax.set_facecolor(background_color)


# Graphs: the five panels after the title panel get the first five features.
for ax, feature in zip(axes[1:], cont_features[:5]):
    ax.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    sns.kdeplot(x=feature, data=df, ax=ax, hue='category',
                palette='spring_r', fill=True, legend=False)
    ax.set_xlabel('')
    ax.set_ylabel('')

In [None]:
# Part 2 of 2: class-conditional density plots for the remaining entries of
# cont_features (index 5 onwards).

# Only two rows of panels are drawn, so the grid has two rows and the figure
# height is sized for two rows; reserving four rows left half the canvas blank.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(2, 3)
gs.update(wspace=0.3, hspace=0.3)

ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])
ax3 = fig.add_subplot(gs[1, 0])
ax4 = fig.add_subplot(gs[1, 1])
ax5 = fig.add_subplot(gs[1, 2])

axes = [ax0, ax1, ax2, ax3, ax4, ax5]
fig.patch.set_facecolor(background_color)


# Graphs: pair each panel with its feature instead of using index arithmetic.
for ax, feature in zip(axes, cont_features[5:]):
    for s in ['top', 'right', 'left']:
        ax.spines[s].set_visible(False)

    ax.set_facecolor(background_color)
    ax.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    # Legends are off here; the colour key is shown in the title panel of part 1.
    sns.kdeplot(x=feature, data=df, ax=ax, hue='category',
                palette='spring_r', fill=True, legend=False)
    ax.set_xlabel('')
    ax.set_ylabel('')

I expected the distributions of 'Basic' and 'Luxury' to be quite different, but that turned out to be wrong: they look very similar.

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string target with integer class ids. The fitted encoder is kept
# in `label` so the ids can be mapped back to the original names later.
label = LabelEncoder()
label.fit(df['category'])
df['category'] = label.transform(df['category'])

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Features are every column except the encoded target.
X = df.drop('category', axis=1)
y = df['category']

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the train and test sets, so the reported accuracy
# does not depend on how the classes happened to fall in the random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The columns have very different magnitudes, and convergence warnings are
# hidden by warnings.filterwarnings('ignore'). Standardising the features and
# raising max_iter lets the solver converge instead of stopping early.
# `logreg` is a Pipeline; the fitted LogisticRegression is its last step (logreg[-1]).
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Accuracy Score of Logistic Regression : ', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# The default RBF kernel is distance-based, so unscaled columns with large
# values dominate the kernel; standardise the features before fitting.
# `svc` is a Pipeline; the fitted SVC is its last step (svc[-1]).
svc = make_pipeline(StandardScaler(), SVC())
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print('Accuracy Score of Support Vector Machine : ', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed (same value as the train/test split) so the forest, and the
# score printed below, is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Accuracy Score of Random Forest : ', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed (same value as the train/test split) so tie-breaking between
# equally good splits, and the score printed below, is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('Accuracy Score of Decision Tree : ', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

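I have no idea why some models got a perfect 1.0 accuracy score. A likely explanation (not verified here) is that `category` is derived deterministically from the other features (for example, every 'Luxury' house has a yard and a pool), so tree-based models can recover the rule exactly.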

## Please upvote if you like my notebook!
## Thank you!