In [84]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

%matplotlib inline

df = pd.read_csv('clean_data/data_set_CLEAN.csv')
features = df.columns
# Define the order of categories
energy_class_order = ['A+', 'A', 'B', 'C', 'D', 'E', 'F']

# Convert the "Energy Class" column to categorical with the specified order
df['Energy class'] = pd.Categorical(df['Energy class'], categories=energy_class_order, ordered=True)

df['price_sqm'] = df['Price (euro)'] / df['Living surface (sqm)']

print(df.info())

KeyError: 'Energy Class'

In [83]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assume pcadf is already defined as a DataFrame

# Drop rows with missing values
pcadf = pd.DataFrame(df).dropna()

# Drop specified columns
columns_to_drop = ['Property ID', 'price_sqm', 'Price (euro)', 'Garden boolean', 'Parking boolean', 'Terrace boolean',
                   'Bathrooms total nb boolean', 'Bathrooms', 'Shower rooms', 'Outdoor parking spaces', 
                   'Covered parking spaces', 'Postal code']
pcadf.drop(columns=columns_to_drop, inplace=True)

# Selecting numerical columns for PCA
numerical_columns = pcadf.select_dtypes(include=['float64', 'int64']).columns

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pcadf[numerical_columns])

# Perform PCA
pca = PCA()
pca.fit(scaled_data)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Extracting column names and their explained variance
column_names = pcadf[numerical_columns].columns
variance_explained = [(column_names[i], explained_variance_ratio[i] * 100) for i in range(len(column_names))]

# Grouping the column names and their explained variance
num_groups = 7  # Change the number of groups here
groupings = {}
for i, (column, variance) in enumerate(variance_explained):
    group = f'Group {i // (len(variance_explained) // num_groups) + 1}'
    if group not in groupings:
        groupings[group] = []
    groupings[group].append((column, variance))

for group, columns in groupings.items():
    print(f"{group}:\n")
    for column, variance in columns:
        print(f"{column}: {variance:.2f}%")
    print("\n")


Group 1:

Construction year: 31.90%
New Construction boolean: 12.40%


Group 2:

Building condition boolean: 10.50%
Double glazing boolean: 7.13%


Group 3:

Elevator boolean: 6.82%
Accessible for disabled people boolean: 5.66%


Group 4:

Living surface (sqm): 4.80%
Furnished boolean: 4.36%


Group 5:

Nb of Bedrooms: 3.98%
Bathrooms total nb: 3.26%


Group 6:

Kitchen equipped boolean: 3.09%
Open fire: 2.40%


Group 7:

Number of frontages: 1.74%
Swimming pool boolean: 0.80%


Group 8:

Plot surface (sqm): 0.60%
Terrace surface (sqm): 0.56%


Group 9:

Garden surface (sqm): 0.00%
Parking tot nb: 0.00%


Group 10:

Flood safe boolean: 0.00%


