# 0. Libraries and importing the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import itertools
import math
from scipy import stats

We import the data from the final EDA dataset:

In [2]:
df3 = pd.read_csv('1_EDA_datasets/EDA_dataset_df3.csv')
df3

Unnamed: 0,id,num_rooms,num_baths,square_meters,orientation,year_built,is_furnished,has_pool,neighborhood,num_crimes,has_ac,accepts_pets,num_supermarkets,price,floor
0,9255,1.0,1.0,,,1956.0,False,False,Sant Martí,2.0,True,True,,1096,3.0
1,1562,4.0,1.0,133.0,west,1960.0,False,False,Sants,4.0,False,False,2.0,1396,7.0
2,1671,2.0,3.0,137.0,,2000.0,False,True,Eixample,0.0,False,False,,1263,1.0
3,6088,1.0,2.0,41.0,,2002.0,False,True,Sants,5.0,False,False,,1290,6.0
4,6670,2.0,1.0,70.0,,1979.0,True,False,Gràcia,0.0,False,True,3.0,962,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,5735,2.0,,95.0,south,2021.0,False,False,Sant Martí,0.0,False,False,,1066,9.0
7996,5192,4.0,3.0,166.0,,1995.0,False,False,Nou Barris,9.0,False,False,,1255,9.0
7997,5391,4.0,1.0,89.0,east,2015.0,True,False,Sant Martí,0.0,True,False,1.0,1220,9.0
7998,861,4.0,2.0,167.0,west,1977.0,False,True,Gràcia,0.0,True,False,,1547,8.0


In [3]:
# Convert the 'floor' column to string, removing the decimal point but keeping NaN values
df3['floor'] = df3['floor'].apply(lambda x: str(int(x)) if pd.notnull(x) else x)

In [4]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8000 non-null   int64  
 1   num_rooms         8000 non-null   float64
 2   num_baths         7840 non-null   float64
 3   square_meters     7845 non-null   float64
 4   orientation       5659 non-null   object 
 5   year_built        7830 non-null   float64
 6   is_furnished      7835 non-null   object 
 7   has_pool          7844 non-null   object 
 8   neighborhood      7835 non-null   object 
 9   num_crimes        7840 non-null   float64
 10  has_ac            7831 non-null   object 
 11  accepts_pets      7845 non-null   object 
 12  num_supermarkets  1411 non-null   float64
 13  price             8000 non-null   int64  
 14  floor             7851 non-null   object 
dtypes: float64(6), int64(2), object(7)
memory usage: 937.6+ KB


## Preprocessing dataset 3: no OHE applied, no listwise deletion, no num_supermarkets and keeping outliers of num_rooms
Same dataset as the EDA dataset but without the number of supermarkets column!

In [4]:
df6 = df3.copy()
# We drop the number of supermarkets column, as it has too many null values
df6 = df6.drop(labels = 'num_supermarkets', axis = 1)

Now we check how null values are stored.

In [5]:
# Check if None is used as missing values
print(df6.isin([None]).sum())

# Check for specific placeholders like -999
print(df6.isin([-999]).sum())

# Check for empty strings (which might indicate missing values in string columns)
print(df6.isin(['']).sum())

id               0
num_rooms        0
num_baths        0
square_meters    0
orientation      0
year_built       0
is_furnished     0
has_pool         0
neighborhood     0
num_crimes       0
has_ac           0
accepts_pets     0
price            0
floor            0
dtype: int64
id               0
num_rooms        0
num_baths        0
square_meters    0
orientation      0
year_built       0
is_furnished     0
has_pool         0
neighborhood     0
num_crimes       0
has_ac           0
accepts_pets     0
price            0
floor            0
dtype: int64
id               0
num_rooms        0
num_baths        0
square_meters    0
orientation      0
year_built       0
is_furnished     0
has_pool         0
neighborhood     0
num_crimes       0
has_ac           0
accepts_pets     0
price            0
floor            0
dtype: int64


In [6]:
# Are they stored as np.nan?
print(df6.isin([np.nan]).sum())

id                  0
num_rooms           0
num_baths         160
square_meters     155
orientation      2341
year_built        170
is_furnished      165
has_pool          156
neighborhood      165
num_crimes        160
has_ac            169
accepts_pets      155
price               0
floor             149
dtype: int64


Missing values are stored as np.nan, so we're good for applying scikit-learn's imputers with this dataset!

In [9]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             8000 non-null   int64  
 1   num_rooms      8000 non-null   float64
 2   num_baths      7840 non-null   float64
 3   square_meters  7845 non-null   float64
 4   orientation    5659 non-null   object 
 5   year_built     7830 non-null   float64
 6   is_furnished   7835 non-null   object 
 7   has_pool       7844 non-null   object 
 8   neighborhood   7835 non-null   object 
 9   num_crimes     7840 non-null   float64
 10  has_ac         7831 non-null   object 
 11  accepts_pets   7845 non-null   object 
 12  price          8000 non-null   int64  
 13  floor          7851 non-null   object 
dtypes: float64(5), int64(2), object(7)
memory usage: 875.1+ KB


In [9]:
# df6.to_csv('2_preprocessed_datasets/v3_prepr-no_ohe-no_num_supm-with_num_rooms_outliers.csv', index = False)

## Preprocessing dataset 4: no OHE applied, no listwise deletion, no num_supermarkets and imputing outliers of num_rooms
Same dataset as the EDA dataset but without the number of supermarkets column, and with imputation of the outliers in `num_rooms`!

In [9]:
df7 = df6.copy()

Now, we impute the outliers in `num_rooms` with the median:

In [11]:
def detect_outliers_iqr(data, column_name, threshold=1.5):
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    outliers = data[(data[column_name] < lower_bound) | (data[column_name] > upper_bound)]

    return outliers

def impute_outliers_with_median(data, column_name, outliers):
    # Calculate the median excluding outliers
    filtered_data = data[~data.index.isin(outliers.index)]
    median_without_outliers = filtered_data[column_name].median()

    # Impute the mean to the outliers
    data.loc[outliers.index, column_name] = median_without_outliers

    return data, median_without_outliers

In [18]:
# Detect outliers in column 'num_rooms'
outliers = detect_outliers_iqr(df7, 'num_rooms')

# Impute outliers with the mean of column 'A' (excluding outliers)
data, median_num_rooms_no_outliers = impute_outliers_with_median(df7, 'num_rooms', outliers)

data

Unnamed: 0,id,num_rooms,num_baths,square_meters,orientation,year_built,is_furnished,has_pool,neighborhood,num_crimes,has_ac,accepts_pets,price,floor
0,9255,1.0,1.0,,,1956.0,False,False,Sant Martí,2.0,True,True,1096,3
1,1562,4.0,1.0,133.0,west,1960.0,False,False,Sants,4.0,False,False,1396,7
2,1671,2.0,3.0,137.0,,2000.0,False,True,Eixample,0.0,False,False,1263,1
3,6088,1.0,2.0,41.0,,2002.0,False,True,Sants,5.0,False,False,1290,6
4,6670,2.0,1.0,70.0,,1979.0,True,False,Gràcia,0.0,False,True,962,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,5735,2.0,,95.0,south,2021.0,False,False,Sant Martí,0.0,False,False,1066,9
7996,5192,4.0,3.0,166.0,,1995.0,False,False,Nou Barris,9.0,False,False,1255,9
7997,5391,4.0,1.0,89.0,east,2015.0,True,False,Sant Martí,0.0,True,False,1220,9
7998,861,4.0,2.0,167.0,west,1977.0,False,True,Gràcia,0.0,True,False,1547,8


And this is the median of the number of rooms without the number of outliers (which must be applied to the test dataset):

In [19]:
median_num_rooms_no_outliers

2.0

In [20]:
# df7.to_csv('2_preprocessed_datasets/v4_prepr-no_ohe-no_num_supm-no_num_rooms_outliers.csv', index = False)

# 2. Preprocessing

## 2.1. Feature creation
Create features that may be useful for improving the predictions (before splitting the data *in case that feature creation is NOT done taking into consideration the specific values of the dataset*). See notebook `2_Data_leakage.ipynb`. Suggestions:
- Column 'door' could be transformed into another feature: already done in 1.1.3!

## 2.2. Dealing with categorical variables
Be wary of possible data leakage between the training and validation set. 

Given that there are not many features and there is a significant number of observations (excluding the variable of the number of supermarkets if it is deleted, there are more than a 5000 observations), probably the best way to proceed is to apply one-hot-encoding to the categorical variables.

Only `neighborhood`, `floor` and `orientation` must be encoded, as the other categorical variables are already binary. In principle, there should be no issue with doing this and data leakage, as will not be chanelling any information from the training to the validation dataset.

In [5]:
def ohe_join(dataframe: pd.DataFrame, variables_to_encode: list, na_column: bool):
    for column in variables_to_encode:
        # We encode OHE each variable, creating an additional column which stores NaN
        ohe = pd.get_dummies(data = dataframe[column], drop_first = False, dummy_na = na_column)
        
        # Rename columns to avoid potential conflicts
        ohe.columns = [f"{column}_{str(col)}" for col in ohe.columns]
        
        # Join the encoded variable to the input dataframe
        dataframe = dataframe.join(ohe)
        
        # Drop the original variable from the dataframe
        dataframe = dataframe.drop(labels=column, axis=1)
    
    return dataframe

In [6]:
df3_ohe_test = ohe_join(dataframe = df3, variables_to_encode = ['orientation', 'neighborhood', 'floor'], na_column = True)
df3_ohe_test

Unnamed: 0,id,num_rooms,num_baths,square_meters,year_built,is_furnished,has_pool,num_crimes,has_ac,accepts_pets,...,floor_10,floor_2,floor_3,floor_4,floor_5,floor_6,floor_7,floor_8,floor_9,floor_nan
0,9255,1.0,1.0,,1956.0,False,False,2.0,True,True,...,False,False,True,False,False,False,False,False,False,False
1,1562,4.0,1.0,133.0,1960.0,False,False,4.0,False,False,...,False,False,False,False,False,False,True,False,False,False
2,1671,2.0,3.0,137.0,2000.0,False,True,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
3,6088,1.0,2.0,41.0,2002.0,False,True,5.0,False,False,...,False,False,False,False,False,True,False,False,False,False
4,6670,2.0,1.0,70.0,1979.0,True,False,0.0,False,True,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,5735,2.0,,95.0,2021.0,False,False,0.0,False,False,...,False,False,False,False,False,False,False,False,True,False
7996,5192,4.0,3.0,166.0,1995.0,False,False,9.0,False,False,...,False,False,False,False,False,False,False,False,True,False
7997,5391,4.0,1.0,89.0,2015.0,True,False,0.0,True,False,...,False,False,False,False,False,False,False,False,True,False
7998,861,4.0,2.0,167.0,1977.0,False,True,0.0,True,False,...,False,False,False,False,False,False,False,True,False,False


Be mindful that, ***for filtering null values of each of the categorical variables to which OHE has been applied, we must consider their NaN column, where "True" denotes that the corresponding observation had a null value in the categorical variable***. See the example below, where we subtract 8000 (the total number of observations) to the number of `True`s in a column which keeps track of NaN.

In [7]:
8000 - df3_ohe_test['floor_nan'].sum()

7851

In [8]:
print(df3_ohe_test.columns)
len(df3_ohe_test.columns)

Index(['id', 'num_rooms', 'num_baths', 'square_meters', 'year_built',
       'is_furnished', 'has_pool', 'num_crimes', 'has_ac', 'accepts_pets',
       'num_supermarkets', 'price', 'orientation_east', 'orientation_north',
       'orientation_south', 'orientation_west', 'orientation_nan',
       'neighborhood_Ciutat Vella', 'neighborhood_Eixample',
       'neighborhood_Gràcia', 'neighborhood_Horta', 'neighborhood_Les Cors',
       'neighborhood_Nou Barris', 'neighborhood_Sant Andreu',
       'neighborhood_Sant Martí', 'neighborhood_Sants',
       'neighborhood_Sarrià-Sant Gervasi', 'neighborhood_nan', 'floor_1',
       'floor_10', 'floor_2', 'floor_3', 'floor_4', 'floor_5', 'floor_6',
       'floor_7', 'floor_8', 'floor_9', 'floor_nan'],
      dtype='object')


39

---------

In [9]:
df3_ohe_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 39 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                8000 non-null   int64  
 1   num_rooms                         8000 non-null   float64
 2   num_baths                         7840 non-null   float64
 3   square_meters                     7845 non-null   float64
 4   year_built                        7830 non-null   float64
 5   is_furnished                      7835 non-null   object 
 6   has_pool                          7844 non-null   object 
 7   num_crimes                        7840 non-null   float64
 8   has_ac                            7831 non-null   object 
 9   accepts_pets                      7845 non-null   object 
 10  num_supermarkets                  1411 non-null   float64
 11  price                             8000 non-null   int64  
 12  orient

## 2.3. Splitting the data into a training and a validation set
Consider whether to apply n-fold cross validation here or when training the model. ***See training notebook!!***

## 2.4. Cleaning the data

### 2.4.1. Handling missing values

#### i. Testing whether data is missing completely at random (MCAR), missing at random (MAR) or missing not at random (MNAR)

References:
- https://stefvanbuuren.name/fimd/sec-MCAR.html
- https://campus.datacamp.com/courses/scalable-data-processing-in-r/case-study-a-preliminary-analysis-of-the-housing-data?ex=4

Types of missing data:
- **MCAR**: When data are missing completely at random there is no way to predict where in the data set we'll see a missing value. In an analysis this can often be handled by simply dropping rows of a data set with missing values. 
- **MAR**: When missingness is associated with other variables we call it missing at random. This name is a misnomer. We really mean that *conditioned on some of the variables in the data set, the data are missing completely at random*. To deal with MAR data we generally predict values for the missing data several times (i.e., multiple imputations) to create multiple data sets that capture the statistical structure of the relationships between the variables and then perform an analysis on the data sets. This procedure is called multiple imputation. 
- **MNAR**: The last category, missing not at random is for the case where data is neither MAR nor MCAR. It is usually caused by deterministic relationships between missingness and other measurements. 

There is no direct way to check if the data are MCAR, so, we are going to check if the data are MAR, and if they are not, we will assume that the data are missing completely at random. To check if your data are MAR:
1) Take each column with missingness and recode it as one if it is missing and zero otherwise. 
2) Then regress each of the the other variables onto it using a logistic regression.
3) A significant p-value indicates an association between the regressor and missingness, meaning your data are MAR. 
4) If none are significant, then it's plausible that the data are missing completely at random. Because you are testing multiple hypotheses you will likely get some p-values that are small by chance. As a result you may need to adjust your cutoff for significance based on how many regressions you perform. 

Therefore, as in this case 12 out of 13 regressors have missing values, we should estimate 12 regressions where each feature with missing values is regressed on by the rest of the features, including the price.

Another way to test this is to check whether the columns with nulls (excluding the number of supermarkets) have an average price different than the observations which don't have nulls (Miguel Conner). If there is a significant difference, this could signal that rows with null values are following a different pattern than those that don't have null values.

In [10]:
# Identify columns that may contain nulls (excluding 'num_supermarkets')
columns_with_nulls = ['num_baths', 'square_meters', 'orientation', 'year_built', 
                      'is_furnished', 'has_pool', 'neighborhood', 
                      'num_crimes', 'has_ac', 'accepts_pets', 'floor']

# Create a boolean mask for rows with any nulls in the specified columns
mask_with_nulls = df3[columns_with_nulls].isnull().any(axis=1)

# Separate the DataFrame into two groups
group_with_nulls = df3[mask_with_nulls]
group_without_nulls = df3[~mask_with_nulls]

# Calculate average prices
avg_price_with_nulls = group_with_nulls['price'].mean()
avg_price_without_nulls = group_without_nulls['price'].mean()

# Display the results
print(f"Average price of observations with nulls: {avg_price_with_nulls}")
print(f"Average price of observations without nulls: {avg_price_without_nulls}")

Average price of observations with nulls: 1107.3341232227488
Average price of observations without nulls: 1093.981401384083


We can now check if the difference is statistically significant:

In [11]:
# Perform a t-test
t_stat, p_value = stats.ttest_ind(
    group_with_nulls['price'],
    group_without_nulls['price'],
    equal_var=False  # Use False if you assume unequal variance
)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Determine significance level (e.g., alpha = 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference in average prices is statistically significant at alpha", alpha)
else:
    print("The difference in average prices is not statistically significant at alpha", alpha)

T-statistic: 2.164010231366908, P-value: 0.030496436700082827
The difference in average prices is statistically significant at alpha 0.05


***Since the difference in average prices of observations with null values and those that don't have null values is statistically significant, data cannot be assumed to be MCAR (so listwise deletion is not the way to go)!***

What happens if we exclude the columns `num_supermarkets` AND `orientation`?

In [12]:
# Identify columns that may contain nulls (excluding 'num_supermarkets')
columns_with_nulls = ['num_baths', 'square_meters', 'year_built', 
                      'is_furnished', 'has_pool', 'neighborhood', 
                      'num_crimes', 'has_ac', 'accepts_pets', 'floor']

# Create a boolean mask for rows with any nulls in the specified columns
mask_with_nulls = df3[columns_with_nulls].isnull().any(axis=1)

# Separate the DataFrame into two groups
group_with_nulls = df3[mask_with_nulls]
group_without_nulls = df3[~mask_with_nulls]

# Calculate average prices
avg_price_with_nulls = group_with_nulls['price'].mean()
avg_price_without_nulls = group_without_nulls['price'].mean()

# Display the results
print(f"Average price of observations with nulls: {avg_price_with_nulls}")
print(f"Average price of observations without nulls: {avg_price_without_nulls}")

Average price of observations with nulls: 1114.2196610169492
Average price of observations without nulls: 1096.3150957854407


In [13]:
# Perform a t-test
t_stat, p_value = stats.ttest_ind(
    group_with_nulls['price'],
    group_without_nulls['price'],
    equal_var=False  # Use False if you assume unequal variance
)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Determine significance level (e.g., alpha = 0.05)
alpha = 0.05
if p_value < alpha:
    print("The difference in average prices is statistically significant at alpha", alpha)
else:
    print("The difference in average prices is not statistically significant at alpha", alpha)

T-statistic: 2.290535539337071, P-value: 0.02208471166633584
The difference in average prices is statistically significant at alpha 0.05


***We get the same result, so removing orientation does NOT solve the problem!!!***

#### ii. Listwise deletion
Only applicable if data is MCAR (if not, deletion could be biased). If all rows with NaNs except for the ones in num_supermarkets are dropped, is the number of observations reduced too much?

In [14]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8000 non-null   int64  
 1   num_rooms         8000 non-null   float64
 2   num_baths         7840 non-null   float64
 3   square_meters     7845 non-null   float64
 4   orientation       5659 non-null   object 
 5   year_built        7830 non-null   float64
 6   is_furnished      7835 non-null   object 
 7   has_pool          7844 non-null   object 
 8   neighborhood      7835 non-null   object 
 9   num_crimes        7840 non-null   float64
 10  has_ac            7831 non-null   object 
 11  accepts_pets      7845 non-null   object 
 12  num_supermarkets  1411 non-null   float64
 13  price             8000 non-null   int64  
 14  floor             7851 non-null   object 
dtypes: float64(6), int64(2), object(7)
memory usage: 937.6+ KB


In [15]:
def listwise_deletion(df: pd.DataFrame, column_to_exclude: str):
        df_listwise_deleted = df.dropna(subset=[col for col in df.columns if col != column_to_exclude])
        print(f'Number of dropped observations: {df.shape[0] - df_listwise_deleted.shape[0]}, or {(df.shape[0] - df_listwise_deleted.shape[0]) * 100 / df.shape[0]}%')
        return df_listwise_deleted

First, we apply listwise deletion to the data frame before applying OHE, so that we can more easily delete columns than after applying OHE:

In [16]:
df3_listwise_deleted = listwise_deletion(df = df3, column_to_exclude = 'num_supermarkets')
df3_listwise_deleted

Number of dropped observations: 3376, or 42.2%


Unnamed: 0,id,num_rooms,num_baths,square_meters,orientation,year_built,is_furnished,has_pool,neighborhood,num_crimes,has_ac,accepts_pets,num_supermarkets,price,floor
1,1562,4.0,1.0,133.0,west,1960.0,False,False,Sants,4.0,False,False,2.0,1396,7
5,5934,4.0,2.0,77.0,west,1987.0,True,True,Eixample,0.0,False,False,3.0,760,1
8,3509,1.0,1.0,59.0,south,1969.0,True,False,Gràcia,0.0,False,False,,933,5
10,5867,3.0,3.0,101.0,south,2014.0,False,False,Gràcia,9.0,False,True,,1124,7
11,169,3.0,2.0,115.0,south,2018.0,False,False,Sarrià-Sant Gervasi,0.0,False,True,,1005,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7992,4427,3.0,2.0,125.0,north,1996.0,False,False,Sant Martí,4.0,True,False,,1263,10
7993,467,4.0,1.0,157.0,south,1992.0,True,False,Eixample,0.0,False,True,,1709,6
7994,6266,3.0,3.0,62.0,south,1979.0,True,True,Sarrià-Sant Gervasi,0.0,True,True,,928,2
7997,5391,4.0,1.0,89.0,east,2015.0,True,False,Sant Martí,0.0,True,False,1.0,1220,9


42.2% of the observations are dropped if all null values of the features (except for the number of supermarkets) are dropped. If we assume that the number of supermarkets close is not an important predicting feature (which will be checked later), we can drop this column as there are too much null values.

In [17]:
df3_listwise_deleted_nosupm = df3_listwise_deleted.dropna(axis = 1)
df3_listwise_deleted_nosupm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4624 entries, 1 to 7998
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4624 non-null   int64  
 1   num_rooms      4624 non-null   float64
 2   num_baths      4624 non-null   float64
 3   square_meters  4624 non-null   float64
 4   orientation    4624 non-null   object 
 5   year_built     4624 non-null   float64
 6   is_furnished   4624 non-null   object 
 7   has_pool       4624 non-null   object 
 8   neighborhood   4624 non-null   object 
 9   num_crimes     4624 non-null   float64
 10  has_ac         4624 non-null   object 
 11  accepts_pets   4624 non-null   object 
 12  price          4624 non-null   int64  
 13  floor          4624 non-null   object 
dtypes: float64(5), int64(2), object(7)
memory usage: 541.9+ KB


Now, we apply OHE to the resulting data frame:

In [18]:
df4 = ohe_join(dataframe = df3_listwise_deleted_nosupm,
               variables_to_encode = ['orientation', 'neighborhood', 'floor'],
               na_column = False)

We set the dummy columns to booleans:

In [19]:
for column in ['is_furnished', 'has_pool', 'has_ac', 'accepts_pets']:
        df4[column] = df4[column].astype(dtype = bool)

In [20]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4624 entries, 1 to 7998
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                4624 non-null   int64  
 1   num_rooms                         4624 non-null   float64
 2   num_baths                         4624 non-null   float64
 3   square_meters                     4624 non-null   float64
 4   year_built                        4624 non-null   float64
 5   is_furnished                      4624 non-null   bool   
 6   has_pool                          4624 non-null   bool   
 7   num_crimes                        4624 non-null   float64
 8   has_ac                            4624 non-null   bool   
 9   accepts_pets                      4624 non-null   bool   
 10  price                             4624 non-null   int64  
 11  orientation_east                  4624 non-null   bool   
 12  orientation

Now, we could create a new list with the categorical features of df4, to access them more easily:

In [21]:
# Lists to store column names
floor_dummies = [f'floor_{str(x)}' for x in range(1, 11)]
neighborhood_dummies = ['neighborhood_Ciutat Vella', 'neighborhood_Eixample', 'neighborhood_Gràcia', 'neighborhood_Horta', 
                        'neighborhood_Les Cors', 'neighborhood_Nou Barris',
                        'neighborhood_Sant Andreu', 'neighborhood_Sant Martí', 
                        'neighborhood_Sants', 'neighborhood_Sarrià-Sant Gervasi']
orientation_dummies = ['orientation_east', 'orientation_north', 'orientation_south', 'orientation_west']

categorical_features_df4 = ['is_furnished', 'has_pool', 'has_ac','accepts_pets', 
                            orientation_dummies, neighborhood_dummies,floor_dummies]

categorical_features_df4

['is_furnished',
 'has_pool',
 'has_ac',
 'accepts_pets',
 ['orientation_east',
  'orientation_north',
  'orientation_south',
  'orientation_west'],
 ['neighborhood_Ciutat Vella',
  'neighborhood_Eixample',
  'neighborhood_Gràcia',
  'neighborhood_Horta',
  'neighborhood_Les Cors',
  'neighborhood_Nou Barris',
  'neighborhood_Sant Andreu',
  'neighborhood_Sant Martí',
  'neighborhood_Sants',
  'neighborhood_Sarrià-Sant Gervasi'],
 ['floor_1',
  'floor_2',
  'floor_3',
  'floor_4',
  'floor_5',
  'floor_6',
  'floor_7',
  'floor_8',
  'floor_9',
  'floor_10']]

In [22]:
print(len(categorical_features_df4))

7


And we also subtract the `num_supermarkets` variable from the numerical_features list:

In [23]:
numerical_features_df4 = ['num_rooms',
 'num_baths',
 'square_meters',
 'year_built',
 'num_crimes']
numerical_features_df4

['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes']

Finally, we create a list which contains all of the features:

In [24]:
features_df4 = numerical_features_df4 + categorical_features_df4
print(features_df4)
print(len(features_df4))

['num_rooms', 'num_baths', 'square_meters', 'year_built', 'num_crimes', 'is_furnished', 'has_pool', 'has_ac', 'accepts_pets', ['orientation_east', 'orientation_north', 'orientation_south', 'orientation_west'], ['neighborhood_Ciutat Vella', 'neighborhood_Eixample', 'neighborhood_Gràcia', 'neighborhood_Horta', 'neighborhood_Les Cors', 'neighborhood_Nou Barris', 'neighborhood_Sant Andreu', 'neighborhood_Sant Martí', 'neighborhood_Sants', 'neighborhood_Sarrià-Sant Gervasi'], ['floor_1', 'floor_2', 'floor_3', 'floor_4', 'floor_5', 'floor_6', 'floor_7', 'floor_8', 'floor_9', 'floor_10']]
12


##### Preprocessing dataset 1: OHE applied, listwise deletion with no num_supermarkets and keeping outliers of num_rooms 

In [30]:
# df4.to_csv('2_preprocessed_datasets/v1_prepr_ohe-listwise_deletion-no_num_supm-with_num_rooms_outliers.csv', index = False)

##### Preprocessing dataset 2: OHE applied, no listwise deletion, no num_supermarkets and keeping outliers of num_rooms

In [26]:
df5 = df3.copy()

df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8000 non-null   int64  
 1   num_rooms         8000 non-null   float64
 2   num_baths         7840 non-null   float64
 3   square_meters     7845 non-null   float64
 4   orientation       5659 non-null   object 
 5   year_built        7830 non-null   float64
 6   is_furnished      7835 non-null   object 
 7   has_pool          7844 non-null   object 
 8   neighborhood      7835 non-null   object 
 9   num_crimes        7840 non-null   float64
 10  has_ac            7831 non-null   object 
 11  accepts_pets      7845 non-null   object 
 12  num_supermarkets  1411 non-null   float64
 13  price             8000 non-null   int64  
 14  floor             7851 non-null   object 
dtypes: float64(6), int64(2), object(7)
memory usage: 937.6+ KB


In [27]:
# Drop number of supermarkets column
df5.drop(labels = 'num_supermarkets', axis = 1, inplace = True)

# Apply OHE
df5 = ohe_join(dataframe = df5,
               variables_to_encode = ['orientation', 'neighborhood', 'floor'],
               na_column = True)

df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 38 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                8000 non-null   int64  
 1   num_rooms                         8000 non-null   float64
 2   num_baths                         7840 non-null   float64
 3   square_meters                     7845 non-null   float64
 4   year_built                        7830 non-null   float64
 5   is_furnished                      7835 non-null   object 
 6   has_pool                          7844 non-null   object 
 7   num_crimes                        7840 non-null   float64
 8   has_ac                            7831 non-null   object 
 9   accepts_pets                      7845 non-null   object 
 10  price                             8000 non-null   int64  
 11  orientation_east                  8000 non-null   bool   
 12  orient

In [31]:
# df5.to_csv('2_preprocessed_datasets/v2_prepr_ohe_with_na_dummies-no_num_supm-with_num_rooms_outliers.csv', index = False)

#### iii. Imputing the median
If done, done inside the pipeline of training (after splitting). See training notebook.

#### iv. KNN imputer
If done, done inside the pipeline of training (after splitting). See training notebook.

### 2.4.2. Handling outliers
Remember to handle outliers of `num_rooms`!!

## 2.5. Feature scaling and normalization
To be able to compare the magnitudes of the coefficients of the linear regression model and to select features based on this criteria, it is necessary to standardize the data. According to Anna, it is NOT necessary to standardize dummy variables. Apply the same transformation of the training data to the test data, but separately!
 - Standardization is also essential before performing regularization methods like LASSO and Ridge.
 - `StandardScaler()` from scikit-learn.

### 2.5.1. Natural log transformation
This transformation works well for:
- right-skewed data,
- compressed data, and 
- data with large outliers. 

After we log transform our data, one large benefit is that it will allow the data to be closer to a “normal”  distribution. It also changes the scale so our data points will drastically reduce the range of their values. But leep in mind, just because your data is skewed does not mean that a log transformation is the best answer. You would not want to log transform your feature if:

1. You have values less than 0. The natural logarithm of a negative number is undefined.
2. You have left-skewed data. That data may call for a square or cube transformation.
3. You have non-parametric data.

No log transformation applied here.

### 2.5.2. Standardization
If done, done inside the pipeline of training (after splitting). See training notebook.