In [None]:
import pandas as pd
import sys 
sys.path.append('../')
pd.set_option('display.max_columns', None)

from utils.processing import *
from utils.file_management import read_yaml
from utils.train import train_lightgbm_model
from utils.evaluation import mape_score
from config.paths import CONFIG_PATH, PROCESSED_DATA_PATH
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [2]:
df = import_raw_dataset()
features = read_yaml(f'{CONFIG_PATH}/features.yaml')

Successfully import /Users/robertogarces/data-science/projects/real-state/data/raw/RealEstate_California.csv


In [3]:
target = features['target'][0]
target

'price'

In [4]:
df = remove_duplicated_ids(df)
df = remove_price_outliers(df, lower_bound=2.5, upper_bound=97.5)

29500.0 5500000.0


In [5]:
pctg_train = 0.8
n_train = int(len(df) * pctg_train)
train_idx = df.sample(n=n_train, random_state=42).index
train = df.loc[train_idx]
test = df.loc[~df.index.isin(train_idx)]

In [6]:
train_idx

Int64Index([11623, 20094, 35289, 27607, 14422, 13527, 29286, 11156,  7968,
            11055,
            ...
            11272, 25538, 13652,  6993,  3610, 26674, 17474,  7145, 28817,
             7398],
           dtype='int64', length=23556)

# Preprocessing

In [7]:
transform_price_log = FunctionTransformer(transform_price_log, validate=False)
transform_area_units = FunctionTransformer(transform_area_units, validate=False)
categorize_bedrooms = FunctionTransformer(categorize_bedrooms, validate=False)
categorize_bathrooms = FunctionTransformer(categorize_bathrooms, validate=False)
categorize_yearBuilt = FunctionTransformer(categorize_yearBuilt, validate=False)
remove_garageSpaces_outliers = FunctionTransformer(remove_garageSpaces_outliers, validate=False)
map_levels = FunctionTransformer(map_levels, validate=False)
process_homeType = FunctionTransformer(process_homeType, validate=False)
impute_hasGarage = FunctionTransformer(impute_hasGarage, validate=False)
city_median_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'city', 'statistic': 'median'})
city_mean_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'city', 'statistic': 'mean'})
county_median_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'county', 'statistic': 'median'})
county_mean_price = FunctionTransformer(calculate_statistic, validate=False, kw_args={'feature': 'county', 'statistic': 'mean'})
_5_knn_median_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':5, 'statistic':'median'})
_5_knn_mean_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':5, 'statistic':'mean'})
_25_knn_median_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':25, 'statistic':'median'})
_25_knn_mean_price = FunctionTransformer(knn_property_price, validate=False, kw_args={'n_neighbors':25, 'statistic':'mean'})
encoding = FunctionTransformer(encode_categorical_variables, validate=False)
drop_features = FunctionTransformer(drop_features, validate=False, kw_args={'target':target})

In [8]:
preprocessing_steps = [
    ('transform_price_log', transform_price_log),
    ('transform_area_units', transform_area_units),
    ('categorize_bedrooms', categorize_bedrooms),
    ('categorize_bathrooms', categorize_bathrooms),
    ('categorize_yearBuilt', categorize_yearBuilt),
    ('remove_garageSpaces_outliers', remove_garageSpaces_outliers),
    ('map_levels', map_levels),
    ('process_homeType', process_homeType),
    ('impute_hasGarage', impute_hasGarage),
    ('city_median_price', city_median_price),
    ('city_mean_price', city_mean_price),
    ('county_median_price', county_median_price),
    ('county_mean_price', county_mean_price),
    ('5_knn_median_price', _5_knn_median_price),
    ('5_knn_mean_price', _5_knn_mean_price),
    ('25_knn_median_price', _25_knn_median_price),
    ('25_knn_mean_price', _25_knn_mean_price),
    ('encoding', encoding),
    ('drop_features', drop_features)
]

# Pipelines

In [9]:
# Definir la pipeline
preprocessing_pipeline = Pipeline(preprocessing_steps)

# Aplicar la pipeline a tus datos
train_transformed = preprocessing_pipeline.fit_transform(train.copy())

train_transformed.to_parquet(f'{PROCESSED_DATA_PATH}/transformed_dataset.parquet', index=False)

# Modeling

In [10]:
model = train_lightgbm_model(train_transformed, target)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2679
[LightGBM] [Info] Number of data points in the train set: 18844, number of used features: 22
[LightGBM] [Info] Start training from score 947213.509340
RMSE: 466212
MSE : 217353982251
MAE : 253510
MeAE: 120081
MAPE: 0
R2  : 0.717


# Evaluate on testing

In [11]:
test_transformed = preprocessing_pipeline.transform(test.copy())

In [12]:
test_preds = model.predict(test_transformed.drop(columns='price'))
test_actual = test_transformed['price']

In [13]:
test_rmse = np.sqrt(np.mean((test_actual - test_preds)**2))
test_mse = np.mean((test_actual - test_preds)**2)
test_mae = np.mean(np.abs(test_actual - test_preds))
test_median_absolute_error = np.median(np.abs(test_actual - test_preds))
test_mape = mape_score(test_actual, test_preds)
test_r2 = np.corrcoef(test_actual, test_preds)[0, 1]**2

print(f'RMSE: {int(test_rmse)}')
print(f'MSE : {int(test_mse)}')
print(f'MAE : {int(test_mae)}')
print(f'MeAE: {int(test_median_absolute_error)}')
print(f'MAPE: {round(test_mape, 3)}')
print(f'R2  : {round(test_r2, 3)}')

RMSE: 488202
MSE : 238341751162
MAE : 300885
MeAE: 174574
MAPE: 0.798
R2  : 0.701


# Optuna

In [14]:
from utils.optimizer import optimize_lightgbm_params
from utils.train import train_lightgbm_model

In [15]:
# Uso de la función
# Supongamos que tienes un DataFrame df y el objetivo es predecir la columna 'target'
best_params = optimize_lightgbm_params(train_transformed.drop(target, axis=1), df[target], n_trials=25)
print(best_params)

[I 2024-01-03 16:12:33,500] A new study created in memory with name: no-name-1dcf19ec-2b8d-4230-826e-c8c46b8977c3
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.2),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-9, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-9, 10.0),
[I 2024-01-03 16:13:00,436] Trial 0 finished with value: 777338484113.9822 and parameters: {'num_leaves': 220, 'learning_rate': 0.017136310694721785, 'feature_fraction': 0.18170982265262337, 'bagging_fraction': 0.5130962283448438, 'bagging_freq': 3, 'reg_alpha': 0.0010983976345443557, 'reg_lambda': 0.0345532459300555}. Best is trial 0 with value: 777338484113.9822.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.2),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
  'bagging_

{'num_leaves': 76, 'learning_rate': 0.006811460350462389, 'feature_fraction': 0.8159980668088358, 'bagging_fraction': 0.8410815831569212, 'bagging_freq': 7, 'reg_alpha': 1.6696230103822778e-07, 'reg_lambda': 0.000519735746205558}


In [16]:
model = train_lightgbm_model(train_transformed, target, best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2679
[LightGBM] [Info] Number of data points in the train set: 18844, number of used features: 22
[LightGBM] [Info] Start training from score 947213.509340
RMSE: 470836
MSE : 221686669962
MAE : 254149
MeAE: 122043
MAPE: 0
R2  : 0.711


In [20]:
X_test = test_transformed.drop(columns='price')
y_test = test_transformed['price']

In [21]:
test_preds = model.predict(X_test)
test_rmse = np.sqrt(np.mean((test_actual - test_preds)**2))
test_mse = np.mean((test_actual - test_preds)**2)
test_mae = np.mean(np.abs(test_actual - test_preds))
test_median_absolute_error = np.median(np.abs(test_actual - test_preds))
test_r2 = np.corrcoef(test_actual, test_preds)[0, 1]**2

print(f'RMSE: {int(test_rmse)}')
print(f'MSE : {int(test_mse)}')
print(f'MAE : {int(test_mae)}')
print(f'MeAE: {int(test_median_absolute_error)}')
print(f'R2  : {round(test_r2, 3)}')

RMSE: 465997
MSE : 217153513420
MAE : 270717
MeAE: 139207
R2  : 0.721
