In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

import lightgbm as lgb


# Add the parent directory to the import path so the `config` and `utils`
# imports that follow can be resolved.
sys.path.append('../')
# Show every column of displayed frames and format floats with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)


from config.paths import PROCESSED_DATA_PATH, CONFIG_PATH
from utils.file_management import read_yaml

In [None]:
# Random seed for reproducibility; used as `random_state` for the LightGBM model.
seed = 42

In [None]:
# Load the preprocessed dataset and the feature configuration file.
df = pd.read_parquet(f'{PROCESSED_DATA_PATH}/preprocessed_dataset.parquet')
features = read_yaml(f'{CONFIG_PATH}/features.yaml')
# The target column is hard-coded; reading it from the YAML config is disabled.
# NOTE(review): `features` is not referenced again in the visible cells — confirm
# whether the config is still needed.
target = 'price' #features['target']

# Train test split

In [None]:
# Keep 80% of the rows for training and hold out the remainder as the test set.
pctg_train = 0.8
n_train = int(len(df) * pctg_train)
# Use the notebook-wide seed so there is a single place to change it.
train_idx = df.sample(n=n_train, random_state=seed).index
train = df.loc[train_idx]
# Everything that was not sampled for training becomes the test set.
test = df.drop(index=train_idx)

In [None]:
def drop_features(df_):
    """Return ``df_`` restricted to the whitelisted modelling columns.

    The whitelist is the module-level ``target`` followed by a fixed list of
    column names. Whitelisted names that are absent from ``df_`` are skipped
    silently, and the surviving columns are returned in whitelist order.

    Args:
        df_: DataFrame to filter.

    Returns:
        A DataFrame containing only the whitelisted columns found in ``df_``.
    """
    whitelist = (
        target, 'id', 'city', 'county', 'lotAreaUnits', 'parking',
        'garageSpaces', 'hasGarage', 'pool', 'spa', 'homeType',
        'livingAreaMts_log', 'yearBuilt', 'mapped_bathrooms',
        'city_median_price', 'city_mean_price', 'county_median_price',
        'county_mean_price', 'bedrooms', 'levels', '5_knn_mean_price',
        '5_knn_median_price', '25_knn_mean_price', '25_knn_median_price',
    )

    # Keep only the whitelisted columns that actually exist in the frame.
    present = df_.columns
    kept = []
    for name in whitelist:
        if name in present:
            kept.append(name)

    return df_[kept]

In [None]:
def calculate_statistic(df, feature, statistic):
    """Attach a per-group price aggregate to ``df`` as a new column.

    Rows are grouped by ``feature`` and ``statistic`` is applied to the
    ``price`` column of each group. The group value is broadcast back to every
    row and stored under ``'{feature}_{statistic}_price'``.

    Args:
        df: DataFrame with ``feature`` and ``price`` columns. Modified in place.
        feature: Name of the column to group by.
        statistic: Aggregation passed to ``GroupBy.transform`` (e.g. ``'mean'``).

    Returns:
        The same ``df`` object with the new column added.
    """
    new_column = f'{feature}_{statistic}_price'
    prices_by_group = df.groupby(feature)['price']
    df[new_column] = prices_by_group.transform(statistic)
    return df

In [None]:
import pandas as pd
from sklearn.neighbors import BallTree

def knn_property_price(df_, n_neighbors=3, statistic='mean'):
    """Add a price statistic computed over each property's nearest neighbours.

    Neighbours are found with a ``BallTree`` using the haversine metric on the
    ``latitude``/``longitude`` columns. The coordinates are converted to
    radians first because the haversine metric works on radians
    (assumes the columns hold degrees — TODO confirm against the dataset).
    A row is never counted among its own neighbours, even when other rows
    share exactly the same coordinates.

    ``df_`` is modified in place: ``latitude`` and ``longitude`` are cast to
    float and the result is stored in ``'{n_neighbors}_knn_{statistic}_price'``.

    NOTE(review): the feature is derived from the ``price`` column of the frame
    that is passed in, so it depends on the prices of that same frame.

    Args:
        df_: DataFrame with ``latitude``, ``longitude`` and ``price`` columns.
        n_neighbors: Number of neighbours to aggregate. When the frame has
            fewer than ``n_neighbors + 1`` rows, all other rows are used.
        statistic: One of ``'mean'``, ``'median'`` or ``'std'``.

    Returns:
        The same ``df_`` object with the new column added. The column is NaN
        when no neighbour is available (fewer than two rows).

    Raises:
        ValueError: If ``statistic`` is not one of the supported names.
    """
    supported_statistics = ('mean', 'median', 'std')
    if statistic not in supported_statistics:
        raise ValueError(
            f"Unsupported statistic {statistic!r}; expected one of {supported_statistics}"
        )

    column = f'{n_neighbors}_knn_{statistic}_price'

    df_['latitude'] = df_['latitude'].astype(float)
    df_['longitude'] = df_['longitude'].astype(float)

    n_rows = len(df_)
    # One extra neighbour is requested because the query set equals the indexed
    # set, so each row is (normally) returned as its own nearest neighbour.
    k = min(n_neighbors + 1, n_rows)
    if k < 2:
        # No other row to aggregate over.
        df_[column] = np.nan
        return df_

    # The haversine metric expects [latitude, longitude] pairs in radians.
    coords = np.radians(df_[['latitude', 'longitude']].to_numpy(dtype=float))

    # Ball tree for efficient nearest-neighbour look-ups.
    tree = BallTree(coords, leaf_size=15, metric='haversine')
    _, indices = tree.query(coords, k=k)
    indices = np.asarray(indices)

    # Remove each row from its own neighbour list by position rather than by
    # assuming it comes first: with duplicate coordinates the row itself may
    # appear anywhere among the zero-distance hits, or not at all.
    is_self = indices == np.arange(n_rows)[:, None]
    # Rows that did not find themselves drop their farthest hit instead, so
    # every row keeps exactly k - 1 neighbours.
    is_self[~is_self.any(axis=1), -1] = True
    neighbour_idx = indices[~is_self].reshape(n_rows, k - 1)

    # Row-wise aggregation with pandas semantics (NaN skipped, sample std).
    prices = df_['price'].astype(float).to_numpy()
    neighbour_prices = pd.DataFrame(prices[neighbour_idx])
    df_[column] = getattr(neighbour_prices, statistic)(axis=1).to_numpy()

    return df_

In [None]:
def encode_categorical_variables(df):
    """Replace every object-dtype column of ``df`` with integer category codes.

    Each object column is converted to the pandas ``category`` dtype and then
    to its ``cat.codes``. The codes are derived from the values present in the
    frame that is passed in, so two frames with different sets of values can
    map the same label to different codes.

    Args:
        df: The dataset. Modified in place.

    Returns:
        The same ``df`` object with its object columns encoded.
    """
    # Collect the names first so the dtype checks all see the original columns.
    object_columns = [name for name in df.columns if df[name].dtype.name == "object"]

    for name in object_columns:
        as_category = df[name].astype("category")
        df[name] = as_category.cat.codes

    return df


In [None]:
# Wrap each feature-engineering function in a FunctionTransformer so the steps
# can be chained in a scikit-learn Pipeline.
city_median_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'city', 'statistic': 'median'})
city_mean_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'city', 'statistic': 'mean'})
county_median_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'county', 'statistic': 'median'})
county_mean_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'county', 'statistic': 'mean'})
_5_knn_median_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':5, 'statistic':'median'})
_5_knn_mean_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':5, 'statistic':'mean'})
_25_knn_median_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':25, 'statistic':'median'})
_25_knn_mean_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':25, 'statistic':'mean'})
encoding = FunctionTransformer(encode_categorical_variables, validate=False)
# The transformer gets its own name: assigning it to `drop_features` would
# replace the function with a FunctionTransformer, and executing this cell a
# second time would then wrap a transformer instead of the function.
drop_features_step = FunctionTransformer(drop_features, validate=False)


# Define the pipeline. `drop_features` must come last: the k-NN steps read the
# latitude/longitude columns, which are not part of the column whitelist.
pipeline = Pipeline([
    ('city_median_price', city_median_price),
    ('city_mean_price', city_mean_price),
    ('county_median_price', county_median_price),
    ('county_mean_price', county_mean_price),
    ('5_knn_median_price', _5_knn_median_price),
    ('5_knn_mean_price', _5_knn_mean_price),
    ('25_knn_median_price', _25_knn_median_price),
    ('25_knn_mean_price', _25_knn_mean_price),
    ('encoding', encoding),
    ('drop_features', drop_features_step)

])

# Apply the pipeline to a copy of the training data, because the steps add
# columns to the frame they receive.
train_transformed = pipeline.transform(train.copy())

# Model

In [None]:
# Display the engineered training frame produced by the pipeline.
train_transformed

In [None]:
# Spot-check of the rows matching one county code and one exact price.
# NOTE(review): 38 and 499888 are unexplained literals; if `county` was encoded
# by the pipeline, the code 38 depends on the values present in the frame.
train_transformed[(train_transformed['county']==38) & (train_transformed['price']==499888.000)]

In [None]:
# Separate the features from the target, using the shared `target` name
# instead of repeating the column literal.
# NOTE(review): `id` is kept by the column whitelist and therefore ends up in
# X as a feature — confirm that this is intended.
X = train_transformed.drop(columns=target)
y = train_transformed[target]

In [None]:
# Split the training frame again into fit and hold-out parts. The split is
# seeded so the reported metrics are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [None]:
# Baseline LightGBM regressor: only the seed and the number of trees are set,
# every other hyperparameter keeps its library default.
model = lgb.LGBMRegressor(random_state=seed, n_estimators=1000)
model.fit(X_train, y_train)

# Model evaluation

In [None]:
# Predict the hold-out part of the training frame with the baseline model.
y_pred = model.predict(X_test)

In [None]:
# Error metrics on the hold-out part of the training frame.
residuals = y_test - y_pred
mse = np.mean(residuals**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(residuals))
median_absolute_error = np.median(np.abs(residuals))
# Coefficient of determination, 1 - SS_res / SS_tot. The squared Pearson
# correlation is not used because it ignores systematic bias and scale errors
# in the predictions.
r2 = 1 - np.sum(residuals**2) / np.sum((y_test - np.mean(y_test))**2)

print(f'RMSE: {int(rmse)}')
print(f'MSE : {int(mse)}')
print(f'MAE : {int(mae)}')
print(f'MeAE: {int(median_absolute_error)}')
print(f'R2  : {round(r2, 3)}')

In [None]:
# NOTE(review): this cell computes the same hold-out metrics as the cell before
# it; one of the two can likely be removed.
residuals = y_test - y_pred
mse = np.mean(residuals**2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(residuals))
median_absolute_error = np.median(np.abs(residuals))
# Coefficient of determination, 1 - SS_res / SS_tot. The squared Pearson
# correlation is not used because it ignores systematic bias and scale errors
# in the predictions.
r2 = 1 - np.sum(residuals**2) / np.sum((y_test - np.mean(y_test))**2)

print(f'RMSE: {int(rmse)}')
print(f'MSE : {int(mse)}')
print(f'MAE : {int(mae)}')
print(f'MeAE: {int(median_absolute_error)}')
print(f'R2  : {round(r2, 3)}')

In [None]:
# Histogram of the residuals on the hold-out part of the training frame.
errors = y_test - y_pred
fig, ax = plt.subplots()
ax.hist(errors)
ax.set_title("Distribution of the error")

# Turn off scientific notation on both axes.
ax.ticklabel_format(style='plain', axis='both')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Apply pipeline to test

In [None]:
# Run the same feature pipeline on a copy of the held-out test frame.
# NOTE(review): the pipeline steps compute the city/county and k-NN price
# statistics from the `price` column of whatever frame they receive, so here
# they are built from the test set's own prices; the categorical codes are
# likewise derived from the test frame alone. Confirm that this is intended.
test_transformed = pipeline.transform(test.copy())

In [None]:
# Score the held-out test frame, using the shared `target` name instead of
# repeating the column literal.
test_preds = model.predict(test_transformed.drop(columns=target))
test_actual = test_transformed[target]

In [None]:
# Error metrics on the held-out test frame.
test_residuals = test_actual - test_preds
test_mse = np.mean(test_residuals**2)
test_rmse = np.sqrt(test_mse)
test_mae = np.mean(np.abs(test_residuals))
test_median_absolute_error = np.median(np.abs(test_residuals))
# Coefficient of determination, 1 - SS_res / SS_tot. The squared Pearson
# correlation is not used because it ignores systematic bias and scale errors
# in the predictions.
test_r2 = 1 - np.sum(test_residuals**2) / np.sum((test_actual - np.mean(test_actual))**2)

print(f'RMSE: {int(test_rmse)}')
print(f'MSE : {int(test_mse)}')
print(f'MAE : {int(test_mae)}')
print(f'MeAE: {int(test_median_absolute_error)}')
print(f'R2  : {round(test_r2, 3)}')

# Optuna

In [None]:
from utils.optimizer import optimize_lightgbm_params
from utils.train import train_lightgbm_model

In [None]:
# Search LightGBM hyperparameters on the training frame.
# Features and target are both taken from `train_transformed` so their rows
# line up; the unsplit `df` also contains the rows held out for testing.
best_params = optimize_lightgbm_params(train_transformed.drop(target, axis=1), train_transformed[target], n_trials=25)
print(best_params)

In [None]:
# Train a model on the full training frame with the tuned hyperparameters.
# NOTE(review): this rebinds `model`, replacing the baseline regressor fitted
# earlier in the notebook.
model = train_lightgbm_model(train_transformed, target, best_params)

In [None]:
# Display the engineered training frame again.
# NOTE(review): same output as the earlier display cell — confirm it is still needed.
train_transformed

In [None]:
# Re-score the test frame with the tuned model; otherwise the metrics below
# would describe predictions made before `model` was retrained.
# NOTE(review): assumes the object returned by train_lightgbm_model exposes
# `predict` and was trained on every non-target column — confirm.
test_preds = model.predict(test_transformed.drop(columns='price'))
test_actual = test_transformed['price']

test_residuals = test_actual - test_preds
test_mse = np.mean(test_residuals**2)
test_rmse = np.sqrt(test_mse)
test_mae = np.mean(np.abs(test_residuals))
test_median_absolute_error = np.median(np.abs(test_residuals))
# Coefficient of determination, 1 - SS_res / SS_tot. The squared Pearson
# correlation is not used because it ignores systematic bias and scale errors
# in the predictions.
test_r2 = 1 - np.sum(test_residuals**2) / np.sum((test_actual - np.mean(test_actual))**2)

print(f'RMSE: {int(test_rmse)}')
print(f'MSE : {int(test_mse)}')
print(f'MAE : {int(test_mae)}')
print(f'MeAE: {int(test_median_absolute_error)}')
print(f'R2  : {round(test_r2, 3)}')