# Paris Housing Classification Problem
## Dataset loading

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns # statistical plots (heatmaps); !pip install seaborn
from plotly.subplots import make_subplots
from sklearn.preprocessing import OrdinalEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix





# Read the Paris housing classification CSV from the Kaggle input directory.
dataset_path = '/kaggle/input/paris-housing-classification/ParisHousingClass.csv'
df = pd.read_csv(dataset_path)

# Column dtypes and non-null counts first, then a preview of the first rows.
df.info()
df.head()

## Missing or incomplete data check

In [None]:
# Count missing entries per column. In pandas `isnull` and `isna` are aliases,
# so one computation backs both reports printed below.
missing_per_column = df.isnull().sum()

print("Null values:")
print(missing_per_column)
print("\n\nNaN values:")
print(missing_per_column)

# Visual check: the boolean mask is drawn as a heatmap, so missing cells (True)
# are coloured differently from present ones (False).
sns.heatmap(df.isnull(), cbar=False);

## Dataset description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Box plots

In [None]:
# Lay the box plots out on a grid that is `n_grid_cols` wide and as tall as
# needed, so the figure follows the number of DataFrame columns instead of
# assuming a fixed 6 x 3 layout.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows = n_grid_rows,
    cols = n_grid_cols,
    subplot_titles = list(df.columns)
)

# Only every 500th row is plotted; the slice is taken once, outside the loop.
df_sample = df[::500]

for position, col in enumerate(df.columns):
    # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, n_grid_cols)
    # `add_trace` with row/col replaces the deprecated `append_trace`.
    fig.add_trace(go.Box(y=df_sample[col]), row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    autosize=False,
    width=1200,
    height=2000,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.show()

## Data distribution

In [None]:
# Show how many records fall into each housing category.
fig_hist, ax_hist = plt.subplots(figsize=(10,6))
ax_hist.hist(df['category'].values, bins=3)
ax_hist.set_xlabel('Category')
ax_hist.set_ylabel('Number of records')
plt.show()

## Category Encoding

In [None]:
# Replace the textual `category` labels with ordinal codes, fitting and
# transforming in a single step.
ordinal_encoder = OrdinalEncoder()
category_codes = ordinal_encoder.fit_transform(df[["category"]])
df[["category"]] = category_codes

# Hard-coded legend for the codes written above.
print("0 = basic", "1 = luxury")
df.head()

## Correlation Matrix

In [None]:
# Pairwise correlation between all columns (the target is numeric after encoding).
corr = df.corr()

# Render as a colour-graded table with two decimals. `Styler.set_precision`
# was removed in pandas 2.0; `Styler.format(precision=...)` is its replacement.
corr.style.background_gradient(cmap='plasma').format(precision=2)


In [None]:
# Rank every column by its correlation with the encoded target, highest first.
category_corr = corr['category']
category_corr.sort_values(ascending=False)

The `hasPool`, `isNewBuilt`, and `hasYard` variables have the highest correlation with the `category` variable.

## Train/test data split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
np_data = df.to_numpy(copy=True)

# Locate the target by name and use every other column as a feature. The
# previous positional slice `[:, :16]` stopped one column short of the target
# at index 17, silently leaving the column at index 16 out of the model.
target_idx = df.columns.get_loc("category")
feature_idx = [k for k in range(np_data.shape[1]) if k != target_idx]

X_train, X_test, y_train, y_test = train_test_split(
    np_data[:, feature_idx],
    np_data[:, target_idx],
    test_size=0.30,
    random_state=42,
)

## Normalization

In [None]:
# Scale the feature matrices.
# NOTE(review): with its defaults `normalize` rescales each row (sample) to
# unit L2 norm, not each feature column — confirm this is the intended scaling.
X_train_norm = normalize(X_train)
X_test_norm = normalize(X_test)

# Class labels must not be normalised: L2-normalising a single-column array
# maps every non-zero label to 1, which would merge classes if more than two
# existed. The `*_norm` names and the column-vector shape are kept so the
# cells that consume them run unchanged.
y_train_norm = y_train.reshape(-1, 1)
y_test_norm = y_test.reshape(-1, 1)

## Logistic regression

In [None]:
# Multinomial (softmax) logistic regression, fitted on the normalised training
# features and the flattened training labels.
softmax_model = LogisticRegression(random_state=0, max_iter=500, multi_class='multinomial')
regressor = softmax_model.fit(X_train_norm, y_train_norm.ravel())


### Accuracy

#### Train

In [None]:
# Mean accuracy of the fitted classifier on the training split.
train_accuracy = regressor.score(X_train_norm, y_train_norm)
train_accuracy

#### Test

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
test_accuracy = regressor.score(X_test_norm, y_test_norm)
test_accuracy

#### Coefficients

In [None]:
# Show the learned feature weights followed by the intercept term.
for fitted_param in (regressor.coef_, regressor.intercept_):
    print(fitted_param)

### Predictions

In [None]:
# Hard class predictions for the training split (reused by the confusion matrix below).
pred_train = regressor.predict(X_train_norm)
pred_train

In [None]:
# Per-class predicted probabilities for the training split (one column per class).
regressor.predict_proba(X_train_norm)

In [None]:
# Hard class predictions for the held-out test split.
pred_test = regressor.predict(X_test_norm)
pred_test

#### Loss

In [None]:
# Log loss is defined on predicted class probabilities. Passing hard 0/1
# predictions (as before) treats each one as a certain probability, so every
# misclassification gets the maximum clipped penalty and model confidence is lost.
log_loss(
    np.ravel(y_train_norm),
    regressor.predict_proba(X_train_norm),
    labels=regressor.classes_,  # pin the column order of the probability matrix
)


In [None]:
# Log loss is defined on predicted class probabilities. Passing hard 0/1
# predictions (as before) treats each one as a certain probability, so every
# misclassification gets the maximum clipped penalty and model confidence is lost.
log_loss(
    np.ravel(y_test_norm),
    regressor.predict_proba(X_test_norm),
    labels=regressor.classes_,  # pin the column order of the probability matrix
)

### Confusion Matrix

In [None]:
# Confusion matrix of the training-split predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_train_norm, pred_train)
ax= plt.subplot()

# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d', ax = ax);

# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels');

ax.set_title('Confusion Matrix');
class_names = ['basic', 'luxury']
ax.xaxis.set_ticklabels(class_names);
ax.yaxis.set_ticklabels(class_names);

## Most and least relevant features

Most relevant features:
1. Price
2. isNewBuilt

Least relevant features:
1. made
2. garage
3. squareMeters