In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Def modeling: regresiones, modelos ML

In [2]:
def modeling(data,models,version,iteracion):
    results_dict = {
    'Version_data': version,
    'Version_iteracion': iteracion,
    'State': [],
    'Modelo': [],
    'Parameters': [],
    'R2': [],
    'MAE': [],
    'RMSE': []
}
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data_state.drop(['MedianPrice', 'State'], axis=1)
        y = data_state['MedianPrice'] 
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        for name, model, params in models:
            pipeline = Pipeline([
                ('scaler', StandardScaler()), 
                ('model', model)
            ])

            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
            grid_search.fit(X_scaled, y)

            y_pred = grid_search.predict(X_scaled)
            mae = mean_absolute_error(y, y_pred)
            rmse = mean_squared_error(y, y_pred, squared=False)

            results_dict['State'].append(state)
            results_dict['Modelo'].append(name)
            results_dict['Parameters'].append(grid_search.best_params_)
            results_dict['R2'].append(grid_search.best_score_)
            results_dict['MAE'].append(mae)
            results_dict['RMSE'].append(rmse)

    results_df = pd.DataFrame(results_dict)
    best_models_df = results_df.loc[results_df.groupby('State')['R2'].idxmax()]

    return best_models_df

In [3]:
models = [
    ('SVR', SVR(), {'model__C': [0.1, 1, 10], 'model__gamma': [0.1, 0.01, 0.001]}),
    ('KNeighborsRegressor', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7],'model__weights': ['uniform', 'distance']}),
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [4]:
data_pca = pd.read_csv('../data/process_data/all_pca.csv')

In [5]:
models_pca = modeling(data = data_pca,models = models,version = 'all_pca', iteracion='iter1-normal')
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.375347,15149.451613,24454.906091
59,all_pca,iter1-normal,GA,RandomForest,{'model__n_estimators': 100},0.780612,6906.866242,9479.686978
32,all_pca,iter1-normal,NC,RandomForest,{'model__n_estimators': 150},0.579183,10092.545455,13717.572463
12,all_pca,iter1-normal,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
24,all_pca,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 50},0.800524,5695.664416,7399.592826
44,all_pca,iter1-normal,SC,XGBoost,{'model__n_estimators': 100},0.701486,0.017188,0.024122
50,all_pca,iter1-normal,VA,RandomForest,{'model__n_estimators': 50},0.841174,9015.595506,14142.744028


# 2. Models all log

In [6]:
data_log = pd.read_csv('../data/process_data/all_log.csv')

In [7]:
models_log = modeling(data = data_log,models = models,version = 'all_log',iteracion='iter1-normal')
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.796257,125.193771,159.814824
57,all_log,iter1-normal,GA,LinearRegression,{},0.825087,13157.467104,16222.984727
30,all_log,iter1-normal,NC,LinearRegression,{},0.868397,10242.503225,13541.98527
16,all_log,iter1-normal,NJ,AdaBoost,{'model__n_estimators': 100},0.53402,4898.027778,6671.579904
24,all_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.885118,181.423916,227.74502
41,all_log,iter1-normal,SC,RandomForest,{'model__n_estimators': 50},0.805469,6818.4,10484.321925
51,all_log,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 50},0.866787,4293.007636,5515.274653


# 3. Models new values

In [8]:
data_new_values = pd.read_csv('../data/process_data/new_values.csv')

In [9]:
models_new_values = modeling(data = data_new_values,models = models,version = 'new_values',iteracion='iter1-normal')
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.79793,126.93948,149.012802
56,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726
30,new_values,iter1-normal,NC,LinearRegression,{},0.911957,10052.628324,12590.826233
10,new_values,iter1-normal,NJ,KNeighborsRegressor,"{'model__n_neighbors': 3, 'model__weights': 'u...",0.452062,31431.666667,37522.367403
24,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 100},0.899687,370.935292,443.085844
38,new_values,iter1-normal,SC,Lasso,{'model__alpha': 1.0},0.835408,8479.319918,10491.318644
52,new_values,iter1-normal,VA,AdaBoost,{'model__n_estimators': 150},0.887729,14866.883071,17302.998528


# 4. Models new values log

In [10]:
data_new_values_log = pd.read_csv('../data/process_data/new_values_log.csv')

In [11]:
models_new_values_log = modeling(data = data_new_values_log,models = models,version = 'new_values_log',
iteracion='iter1-normal')
models_new_values_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.792581,231.957811,281.239279
56,new_values_log,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.810443,16130.218802,20637.769007
29,new_values_log,iter1-normal,NC,Lasso,{'model__alpha': 1.0},0.843991,14464.243178,18002.906382
16,new_values_log,iter1-normal,NJ,AdaBoost,{'model__n_estimators': 50},0.380502,4658.988095,7524.143128
24,new_values_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.899639,171.698639,221.469462
41,new_values_log,iter1-normal,SC,RandomForest,{'model__n_estimators': 100},0.803412,6399.4,9715.786308
50,new_values_log,iter1-normal,VA,RandomForest,{'model__n_estimators': 150},0.775796,10474.613636,15769.429278


# 5. Get results

In [12]:
frames = [models_pca, models_log, models_new_values, models_new_values_log]
models_result = pd.concat(frames)

models_result = models_result.reset_index(drop=True).sort_values(by=['State','R2'], ascending = [True,False])

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
14,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.79793,126.93948,149.012802
7,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.796257,125.193771,159.814824
21,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.792581,231.957811,281.239279
0,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.375347,15149.451613,24454.906091
15,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726
8,all_log,iter1-normal,GA,LinearRegression,{},0.825087,13157.467104,16222.984727
22,new_values_log,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.810443,16130.218802,20637.769007
1,all_pca,iter1-normal,GA,RandomForest,{'model__n_estimators': 100},0.780612,6906.866242,9479.686978
16,new_values,iter1-normal,NC,LinearRegression,{},0.911957,10052.628324,12590.826233
9,all_log,iter1-normal,NC,LinearRegression,{},0.868397,10242.503225,13541.98527


In [13]:
models_result.to_excel('../data/final_data/models_results.xlsx', index = False)