In [102]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FunctionTransformer


In [103]:
# Load dataset
df = pd.read_csv('portugal_listinigs2.csv')

# Target variable
y = df['Price']
X = df.drop(columns=['Price'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  df = pd.read_csv('portugal_listinigs2.csv')


Missing Values

In [104]:
def handle_missing_values(X, y=None):
    """Handles missing values and ensures y_train is updated when rows are dropped."""

    # Drop columns with > 58% missing values
    cols_to_drop = ['Floor', 'GrossArea', 'PublishDate', 'NumberOfBedrooms', 'NumberOfWC', 
                    'ConservationStatus', 'LotSize', 'BuiltArea']
    X = X.drop(columns=[col for col in cols_to_drop if col in X.columns], errors='ignore')

    # Fill missing ConstructionYear with random values (1960-2025)
    if 'ConstructionYear' in X.columns:
        X.loc[X['ConstructionYear'].isna(), 'ConstructionYear'] = np.random.randint(1960, 2025, X['ConstructionYear'].isna().sum())

    # Fill numerical missing values with median
    num_cols = ['LivingArea', 'TotalRooms', 'TotalArea', 'NumberOfBathrooms', 'Parking']
    for col in num_cols:
        if col in X.columns:
            X[col] = X[col].fillna(X[col].median())

    # Fill categorical missing values with 'Missing'
    cat_cols = ['EnergyEfficiencyLevel', 'Garage', 'ElectricCarsCharging', 'Town', 'Type', 'EnergyCertificate']
    for col in cat_cols:
        if col in X.columns:
            X[col] = X[col].fillna('Missing')

    # Ensure y_train matches X_train
    if y is not None:
        valid_rows = X.notnull().all(axis=1)
        X = X.loc[valid_rows]
        y = y.loc[valid_rows]
        return X, y

    return X


Feature Selection

In [105]:
def feature_selection(X, y=None):
    """Removes unwanted features and ensures y_train stays in sync."""
    if 'HasParking' in X.columns:
        X = X.drop(columns=['HasParking'], errors='ignore')

    # Drop rows where <10% of values are missing
    threshold = 0.10 * len(X)
    missing_counts = X.isnull().sum()
    cols_to_drop_rows = missing_counts[missing_counts <= threshold].index
    X = X.dropna(subset=cols_to_drop_rows)

    if y is not None:
        y = y.loc[X.index]  # Ensure y_train matches X_train
        return X, y

    return X


Outliers Handling

In [106]:
def handle_outliers(X, y=None):
    """Caps extreme values for numerical columns and ensures y_train stays in sync."""
    outlier_bounds = {
        'NumberOfBathrooms': (1, 15),
        'TotalRooms': (1, 25),
        'LivingArea': (0, 312),
        'TotalArea': (0, 1072),
        'ConstructionYear': (1900, 2025)
    }

    for col, (lower, upper) in outlier_bounds.items():
        if col in X.columns:
            X[col] = X[col].clip(lower=lower, upper=upper)

    if y is not None:
        y = y.loc[X.index]
        return X, y
    return X


Encode Categorical Features

In [107]:
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Encodes categorical features with One-Hot and Ordinal Encoding."""
    
    def __init__(self):
        self.encoder = None
        self.feature_names = None

    def fit(self, X, y=None):
        self.nominal = ['District', 'City', 'Town', 'Type', 'Garage', 'Elevator', 'ElectricCarsCharging']
        self.ordinal = ['EnergyCertificate', 'EnergyEfficiencyLevel']

        self.encoder = ColumnTransformer([
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), self.nominal),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), self.ordinal)
        ])

        self.encoder.fit(X)

        # Store feature names
        self.feature_names = self.encoder.get_feature_names_out()

        return self

    def transform(self, X):
        X_transformed = self.encoder.transform(X)
        return pd.DataFrame(X_transformed, columns=self.feature_names, index=X.index)


Pipeline

In [108]:
full_pipeline = Pipeline([
    ('missing_values', FunctionTransformer(handle_missing_values, validate=False)),
    ('feature_selection', FunctionTransformer(feature_selection, validate=False)),
    ('outlier_handling', FunctionTransformer(handle_outliers, validate=False)),
    ('encoding', FeatureEncoder()),
    ('scaler', StandardScaler(with_mean=False)),  
    ('pca', PCA(n_components=0.95))
])


In [109]:
X_train, y_train = handle_missing_values(X_train, y_train)
X_train, y_train = feature_selection(X_train, y_train)
X_train, y_train = handle_outliers(X_train, y_train)

X_train_transformed = full_pipeline.fit_transform(X_train, y_train)
X_test_transformed = full_pipeline.transform(X_test)
