In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.metrics import confusion_matrix
# import plotly.express as px

In [None]:
# Read the Paris housing classification CSV, then preview five randomly drawn rows.
train_db = pd.read_csv(filepath_or_buffer=r"../input/paris-housing-classification/ParisHousingClass.csv")
train_db.sample(n=5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Work on a copy so the frame loaded from disk (`train_db`) is left untouched.
train = train_db.copy()

# Replace the 'category' column with its ordinal codes (fit and transform in one step).
encoder = OrdinalEncoder()
train[['category']] = encoder.fit_transform(train[['category']])

# Target as an (n, 1) column array; features are every remaining column.
y = train['category'].to_numpy(copy=True).reshape(-1, 1)
X = train.drop(columns=['category']).copy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training features.
X_train.describe()

In [None]:
# Number of missing values in each training feature column.
X_train.isna().sum()

In [None]:
# Plot one 50-bin histogram per column of the training features on a 20x15 figure.
X_train.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Box plots of the continuous features, one subplot per feature, to look for outliers.
cont_features = ['squareMeters','numberOfRooms','floors','cityCode','made','basement','attic','garage','price']

# Grid layout: five subplots per row, with as many rows as the feature list needs.
n_cols = 5
n_rows = -(-len(cont_features) // n_cols)  # ceiling division

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=cont_features
)

for position, feature in enumerate(cont_features):
    grid_row, grid_col = divmod(position, n_cols)
    # add_trace(row=, col=) replaces the deprecated append_trace; plotly grid positions are 1-based.
    # Naming the trace labels it with the feature instead of the generic "trace N".
    fig.add_trace(go.Box(y=X_train[feature], name=feature), row=grid_row + 1, col=grid_col + 1)

fig.show()

In [None]:
# Prints the analyst's conclusion (in Spanish): "There are no outliers in the continuous variables".
print('No hay outliers en las variables continuas')

# Correlation Matrix

In [None]:
# Pairwise correlations (pandas default: Pearson) over the columns of `train`.
corr = train.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format with a format string gives the same two-decimal display on all versions.
corr.style.background_gradient(cmap='plasma').format("{:.2f}")

In [None]:
# Prints the analyst's conclusion (in Spanish): "Only the variables 'hasYard', 'hasPool'
# and 'isNewBuilt' show significant correlation".
print('Unicamente las variables "hasYard", "hasPool" y "isNewBuilt" muestran correlacion significativa')

# Logistic regression model (raw & un-regularized)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Plain NumPy arrays with a 1-D target vector, which is what the estimators below are given.
Xtrain = np.array(X_train)
ytrain = np.array(y_train).ravel()
Xtest = np.array(X_test)
ytest = np.array(y_test).ravel()

# Baseline fitted on the raw (unscaled) features with scikit-learn's default settings.
# NOTE: LogisticRegression defaults to penalty='l2' with C=1.0, so this baseline still
# carries the library's default regularization.
logreg_model = LogisticRegression(random_state=0).fit(Xtrain, ytrain)

accuracy = logreg_model.score(Xtrain, ytrain)
test_accuracy = logreg_model.score(Xtest, ytest)
# Cross-entropy is defined on predicted probabilities. Passing the hard labels from
# predict() would treat every prediction as 0%/100% confident and distort the loss.
bce_error = log_loss(ytrain, logreg_model.predict_proba(Xtrain))

print(f"accuracy: {accuracy:.3f}, test_accuracy: {test_accuracy:.3f}, bce error: {bce_error:.3f}")

In [None]:
# Plot the confusion matrix of the raw-feature model on the training split
predictions = logreg_model.predict(Xtrain)
cm = confusion_matrix(ytrain, predictions)
# confusion_matrix orders its rows/columns by sorted encoded label, which is the order
# of encoder.categories_[0]. Series.unique() returns labels in order of first appearance
# and could therefore attach the class names to the wrong rows/columns.
class_names = encoder.categories_[0]

ax = plt.subplot()
# annot=True writes the count in each cell; fmt='d' prints whole numbers rather than
# the default compact float format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax);

# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels');

ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(class_names);
ax.yaxis.set_ticklabels(class_names);

# Analyst's conclusion (in Spanish): "The model wrongly predicts the 'category' variable".
print('El modelo predice erroneamente la variable "category"')

# Logistic regression model (normalized & un-regularized)

In [None]:
# Rescale every feature to the [0, 1] range. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
Xtrain_norm, Xtest_norm = (scaler.transform(split) for split in (X_train, X_test))

In [None]:
# Same default LogisticRegression as the baseline, now fitted on the min-max scaled features.
logreg_model_norm = LogisticRegression(random_state=0).fit(Xtrain_norm, ytrain)

accuracy = logreg_model_norm.score(Xtrain_norm, ytrain)
test_accuracy = logreg_model_norm.score(Xtest_norm, ytest)
# Cross-entropy needs class probabilities (predict_proba), not the hard labels from predict().
bce_error = log_loss(ytrain, logreg_model_norm.predict_proba(Xtrain_norm))

print(f"accuracy: {accuracy:.3f}, test_accuracy: {test_accuracy:.3f}, bce error: {bce_error:.3f}")

In [None]:
# Training-split confusion matrix for the model fitted on the scaled features.
predictions = logreg_model_norm.predict(Xtrain_norm)
cm = confusion_matrix(y_true=ytrain, y_pred=predictions)
cm

# Logistic regression model (normalized & Lasso regularization)

In [None]:
# Sweep C for the L1 (Lasso) penalty. In scikit-learn, C is the INVERSE of the
# regularization strength: smaller C means stronger regularization.
c_vals = [100, 10, 0.05, 0.005]

for c in c_vals:
    logreg_l1_model = LogisticRegression(C=c, penalty="l1", solver="liblinear", max_iter=50)
    # Fit on the training split only, so that the test accuracy reported below is
    # measured on data the model has never seen.
    logreg_l1_model.fit(Xtrain_norm, ytrain)
    accuracy = logreg_l1_model.score(Xtrain_norm, ytrain)
    test_accuracy = logreg_l1_model.score(Xtest_norm, ytest)

    print(f"c: {c}, accuracy: {accuracy}, test_accuracy: {test_accuracy}")

In [None]:
# Logistic regression model L1 with c=1
c = 1
# Pass `c` itself so the fitted model always matches the value printed below.
logreg_l1_model = LogisticRegression(C=c, penalty="l1", solver="liblinear", max_iter=50)
logreg_l1_model.fit(Xtrain_norm, ytrain)

accuracy_l1 = logreg_l1_model.score(Xtrain_norm, ytrain)
test_accuracy_l1 = logreg_l1_model.score(Xtest_norm, ytest)
# Cross-entropy needs class probabilities (predict_proba), not the hard labels from predict().
bce_error_l1 = log_loss(ytrain, logreg_l1_model.predict_proba(Xtrain_norm))

print(f"c: {c}, accuracy: {accuracy_l1:.3f}, test_accuracy: {test_accuracy_l1:.3f}, bce error: {bce_error_l1:.3f}")

# Logistic regression model (normalized & Ridge regularization)

In [None]:
# Try several values of C with the L2 (Ridge) penalty and report train/test accuracy for each.
c_vals = [100, 10, 0.05, 0.005]

for c in c_vals:
    logreg_l2_model = LogisticRegression(penalty="l2", C=c, solver="lbfgs", max_iter=50).fit(Xtrain_norm, ytrain)
    accuracy, test_accuracy = (
        logreg_l2_model.score(features, labels)
        for features, labels in ((Xtrain_norm, ytrain), (Xtest_norm, ytest))
    )

    print(f"c: {c}, accuracy: {accuracy}, test_accuracy: {test_accuracy}")

In [None]:
# Logistic regression model L2 with c=1
c = 1
# Pass `c` itself so the fitted model always matches the value printed below.
logreg_l2_model = LogisticRegression(C=c, penalty="l2", solver="lbfgs", max_iter=50)
logreg_l2_model.fit(Xtrain_norm, ytrain)

accuracy_l2 = logreg_l2_model.score(Xtrain_norm, ytrain)
test_accuracy_l2 = logreg_l2_model.score(Xtest_norm, ytest)
# Cross-entropy needs class probabilities (predict_proba), not the hard labels from predict().
bce_error_l2 = log_loss(ytrain, logreg_l2_model.predict_proba(Xtrain_norm))

print(f"c: {c}, accuracy: {accuracy_l2:.3f}, test_accuracy: {test_accuracy_l2:.3f}, bce error: {bce_error_l2:.3f}")

# Comparison of coefficients across regression models

In [None]:
# Feature names in the exact column order the models were fitted on. Reading them from
# X_train avoids assuming that the target happens to be the last column of `train`.
cols = list(X_train.columns)

# Flatten each model's coefficient array into a 1-D vector.
log = logreg_model.coef_.ravel()
log_norm = logreg_model_norm.coef_.ravel()
l1 = logreg_l1_model.coef_.ravel()
l2 = logreg_l2_model.coef_.ravel()

# One row per feature, one column per model.
df = pd.DataFrame(
    {'un_reg': log, 'log_norm': log_norm, 'l1_reg': l1, 'l2_reg': l2},
    index=cols,
)

def highlight_max(s):
    """Return one CSS string per cell of `s`: light green where |value| > 10.90, else empty."""
    is_max = s.abs() > 10.90
    return ['background: lightgreen' if cell else '' for cell in is_max]

print('Regression coeficients')
# Highlighting is applied to the 'l1_reg' column only.
df.style.apply(highlight_max, subset=['l1_reg'])

In [None]:
# Prints the analyst's conclusions (in Spanish):
#   "The (3) most relevant features are 'hasYard', 'hasPool' and 'isNewBuilt'"
#   "The (3) least relevant features are 'hasStormProtector', 'hasStorageRoom' and 'price'"
print('Los (3) features mas relevantes son "hasYard", "hasPool" y "isNewBuilt"')
print('Los (3) features menos relevantes "hasStormProtector", "hasStorageRoom" y"price"')