In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Def modeling: regresiones, modelos ML

In [2]:
def modeling(data,models,version,iteracion):
    results_dict = {
    'Version_data': version,
    'Version_iteracion': iteracion,
    'State': [],
    'Modelo': [],
    'Parameters': [],
    'R2': [],
    'MAE': [],
    'RMSE': []
}
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data_state.drop(['MedianPrice', 'State'], axis=1)
        y = data_state['MedianPrice'] 
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        for name, model, params in models:
            pipeline = Pipeline([
                ('scaler', StandardScaler()), 
                ('model', model)
            ])

            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
            grid_search.fit(X_scaled, y)

            y_pred = grid_search.predict(X_scaled)
            mae = mean_absolute_error(y, y_pred)
            rmse = mean_squared_error(y, y_pred, squared=False)

            results_dict['State'].append(state)
            results_dict['Modelo'].append(name)
            results_dict['Parameters'].append(grid_search.best_params_)
            results_dict['R2'].append(grid_search.best_score_)
            results_dict['MAE'].append(mae)
            results_dict['RMSE'].append(rmse)

    results_df = pd.DataFrame(results_dict)
    best_models_df = results_df.loc[results_df.groupby('State')['R2'].idxmax()]

    return best_models_df

In [3]:
models = [
    ('SVR', SVR(), {'model__C': [0.1, 1, 10], 'model__gamma': [0.1, 0.01, 0.001]}),
    ('KNeighborsRegressor', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7],'model__weights': ['uniform', 'distance']}),
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [4]:
data_pca = pd.read_csv('../data/process_data/all_pca.csv')

In [5]:
models_pca = modeling(data = data_pca,models = models,version = 'all_pca', iteracion='iter1-normal')
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 150},0.402307,14906.688172,22780.574768
59,all_pca,iter1-normal,GA,RandomForest,{'model__n_estimators': 50},0.781493,6901.503185,9199.355751
32,all_pca,iter1-normal,NC,RandomForest,{'model__n_estimators': 100},0.588117,10209.858586,13807.430095
12,all_pca,iter1-normal,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
24,all_pca,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 50},0.801728,5695.664416,7399.592826
44,all_pca,iter1-normal,SC,XGBoost,{'model__n_estimators': 100},0.701486,0.017188,0.024122
50,all_pca,iter1-normal,VA,RandomForest,{'model__n_estimators': 50},0.843262,9677.146067,14954.267899


# 2. Models all log

In [6]:
data_log = pd.read_csv('../data/process_data/all_log.csv')

In [7]:
models_log = modeling(data = data_log,models = models,version = 'all_log',iteracion='iter1-normal')
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.781811,125.193771,159.814824
57,all_log,iter1-normal,GA,LinearRegression,{},0.825087,13157.467104,16222.984727
30,all_log,iter1-normal,NC,LinearRegression,{},0.868397,10242.503225,13541.98527
16,all_log,iter1-normal,NJ,AdaBoost,{'model__n_estimators': 150},0.576942,4269.217949,6320.105363
24,all_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 50},0.884365,2432.939133,3007.131967
41,all_log,iter1-normal,SC,RandomForest,{'model__n_estimators': 150},0.811089,6352.02963,10452.443832
51,all_log,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 50},0.868932,4293.007636,5515.274653


# 3. Models new values

In [8]:
data_new_values = pd.read_csv('../data/process_data/new_values.csv')

In [9]:
models_new_values = modeling(data = data_new_values,models = models,version = 'new_values',iteracion='iter1-normal')
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.779057,168.839807,220.776706
56,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878223,12165.527847,15611.276165
30,new_values,iter1-normal,NC,LinearRegression,{},0.861908,11048.683667,14440.910867
16,new_values,iter1-normal,NJ,AdaBoost,{'model__n_estimators': 50},0.511197,5057.678571,7364.584352
24,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.89658,130.374807,163.83732
38,new_values,iter1-normal,SC,Lasso,{'model__alpha': 1.0},0.858107,7297.583055,9056.313126
52,new_values,iter1-normal,VA,AdaBoost,{'model__n_estimators': 100},0.864547,13328.582656,15573.857944


# 4. Models new values log

In [12]:
data_new_values_log = pd.read_csv('../data/process_data/new_values_log.csv')

In [13]:
models_new_values_log = modeling(data = data_new_values_log,models = models,version = 'new_values_log',
iteracion='iter1-normal')
models_new_values_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,new_values_log,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.774959,8436.806452,17191.242285
59,new_values_log,iter1-normal,GA,RandomForest,{'model__n_estimators': 150},0.816276,6063.605096,8093.136806
29,new_values_log,iter1-normal,NC,Lasso,{'model__alpha': 1.0},0.847543,12747.336619,16393.351508
16,new_values_log,iter1-normal,NJ,AdaBoost,{'model__n_estimators': 100},0.510045,4656.982143,6669.691264
24,new_values_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 100},0.893538,678.748875,848.142881
41,new_values_log,iter1-normal,SC,RandomForest,{'model__n_estimators': 50},0.808274,6157.955556,9534.816043
51,new_values_log,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 150},0.869694,655.799287,814.995361


# 5. Get results

In [14]:
frames = [models_pca, models_log, models_new_values, models_new_values_log]
models_result = pd.concat(frames)

models_result = models_result.reset_index(drop=True).sort_values(by=['State','R2'], ascending = [True,False])

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
7,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.781811,125.193771,159.814824
14,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.779057,168.839807,220.776706
21,new_values_log,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.774959,8436.806452,17191.242285
0,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 150},0.402307,14906.688172,22780.574768
15,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878223,12165.527847,15611.276165
8,all_log,iter1-normal,GA,LinearRegression,{},0.825087,13157.467104,16222.984727
22,new_values_log,iter1-normal,GA,RandomForest,{'model__n_estimators': 150},0.816276,6063.605096,8093.136806
1,all_pca,iter1-normal,GA,RandomForest,{'model__n_estimators': 50},0.781493,6901.503185,9199.355751
9,all_log,iter1-normal,NC,LinearRegression,{},0.868397,10242.503225,13541.98527
16,new_values,iter1-normal,NC,LinearRegression,{},0.861908,11048.683667,14440.910867


In [15]:
models_result.to_excel('../data/final_data/models_results.xlsx', index = False)