In [None]:
import numpy as np
import pandas as pd

import os

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Data

In [None]:
# Read the Paris housing classification CSV from the Kaggle input directory
# and preview the first rows.
DATA_PATH = "/kaggle/input/paris-housing-classification/ParisHousingClass.csv"
paris = pd.read_csv(DATA_PATH)
paris.head()

In [None]:
# Summary statistics for the dataset (pandas' default `describe()` output).
paris.describe()

# NaN values

In [None]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = paris.isna().sum()
print(missing_per_column)

# Histograms

In [None]:
# Draw one histogram per column of `paris` on a grid N_HIST_COLS wide.
N_HIST_COLS = 3

# Ceiling division gives exactly as many rows as the columns require.
# The earlier `n // 3 + 1` formula allocated an extra, empty row whenever the
# number of columns was a multiple of 3. `max(1, ...)` keeps the grid valid
# even for a frame without columns.
n_hist_rows = max(1, (paris.shape[1] + N_HIST_COLS - 1) // N_HIST_COLS)

fig = make_subplots(rows=n_hist_rows, cols=N_HIST_COLS)

for i, col in enumerate(paris.columns):
    # divmod maps the flat column position to 0-based grid coordinates;
    # plotly subplot rows/cols are 1-based, hence the +1.
    grid_row, grid_col = divmod(i, N_HIST_COLS)
    fig.add_trace(
        go.Histogram(x=paris[col], name=col),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(height=1500)

fig.show()

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the labels in "category" with integer codes. The fitted encoder is
# kept in `encoder` so the codes can be mapped back to the original labels.
encoder = LabelEncoder()
paris["category"] = encoder.fit_transform(paris["category"])
paris

# Correlation

In [None]:
# Pairwise correlation between all columns (pandas' default method).
corr = paris.corr()

# Render the correlation matrix as an interactive heatmap.
fig = px.imshow(corr)
fig.show()

# Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns that are standardised (zero mean, unit variance) before modelling.
need_scaling = ['squareMeters', 'numberOfRooms', 'floors', 'cityCode', 'made', 'basement', 'attic', 'garage', 'price']

# Fit the scaler on the TRAINING rows only. Fitting on the whole frame lets
# the mean/std of the held-out rows leak into preprocessing and makes the
# test score optimistic.
#
# train_test_split chooses rows from the number of samples and the random
# state alone, so splitting positional indices with the same settings yields
# the same training rows as the train/test split cell further down.
# NOTE: these two values must stay in sync with that cell.
SPLIT_TEST_SIZE = 0.30
SPLIT_RANDOM_STATE = 42
train_rows, held_out_rows = train_test_split(
    np.arange(len(paris)),
    test_size=SPLIT_TEST_SIZE,
    random_state=SPLIT_RANDOM_STATE,
)

standard_scaler = StandardScaler()
standard_scaler.fit(paris.iloc[train_rows][need_scaling])

# Apply the training-set statistics to every row (train and test alike).
paris[need_scaling] = standard_scaler.transform(paris[need_scaling])

# X and Y

In [None]:
# Separate the target ("category") from the predictor columns.
Y = paris["category"]
X = paris.drop(columns=["category"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyperparameters for the classifier, collected in one place:
# class weights are balanced, the newton-cg solver is used, and C=0.001
# (C is the inverse regularisation strength, so this is a strong penalty).
logreg_params = {
    "class_weight": "balanced",
    "solver": "newton-cg",
    "max_iter": 10000,
    "C": 0.001,
}

logreg_model = LogisticRegression(**logreg_params)
logreg_model.fit(X_train, y_train)

# Evaluation

In [None]:
# Mean accuracy of the fitted model on the training and the held-out split.
acc_train, acc_test = (
    logreg_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"accuracy: {acc_train}, accuracy (test): {acc_test}")

# Top Features

In [None]:
# Use the absolute value of each coefficient in the first row of `coef_` as a
# relevance score, paired with the matching column name of X.
coef_magnitude = np.abs(logreg_model.coef_[0])

features_relevancy = pd.DataFrame({'feature': X.columns, 'coef': coef_magnitude})

In [None]:
# Most relevant features: the two rows with the largest absolute coefficient.
print("Most relevant features")

by_coef_descending = features_relevancy.sort_values(by='coef', ascending=False)
by_coef_descending.head(2)

In [None]:
# Least relevant features: the two rows with the smallest absolute coefficient.
print("Least relevant features")

by_coef_ascending = features_relevancy.sort_values(by='coef')
by_coef_ascending.head(2)