In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
import time

# Def modeling

In [18]:
def modeling(data,models,version,iteracion):
    results_dict = {
    'Version_data': version,
    'Version_iteracion': iteracion,
    'State': [],
    'Modelo': [],
    'Mejores Parámetros': [],
    'Mejor Puntuación (r2)': [],
    'Tiempo de Ejecución': []
}
    states = data['State'].unique()
    X = data.drop(['MedianPrice', 'State'], axis=1)
    y = data['MedianPrice']
    
    for state in states:
        X_state = X[data['State'] == state]
        y_state = y[data['State'] == state]

        for name, model, params in models:
            start_time = time.time()  # Tiempo inicial
            pipeline = Pipeline([
                ('model', model)
            ])
            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
            grid_search.fit(X, y)
            end_time = time.time()  # Tiempo final
            elapsed_time = end_time - start_time  # Tiempo transcurrido

            results_dict['State'].append(state)
            results_dict['Modelo'].append(name)
            results_dict['Mejores Parámetros'].append(grid_search.best_params_)
            results_dict['Mejor Puntuación (r2)'].append(grid_search.best_score_)
            results_dict['Tiempo de Ejecución'].append(elapsed_time)
            
    results_df = pd.DataFrame(results_dict)
    best_models_df = results_df.loc[results_df.groupby('State')['Mejor Puntuación (r2)'].idxmax()]

    return best_models_df

In [19]:
models = [
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [20]:
data_pca = pd.read_csv('../data/process_data/all_pca.csv')

In [22]:
models_pca = modeling(data = data_pca,models = models,version = 'all_pca', iteracion='iter1-normal')
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,all_pca,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 50},0.646406,0.726299
46,all_pca,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 50},0.648214,0.70365
25,all_pca,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 50},0.647627,0.708228
11,all_pca,iter1-normal,NJ,GradientBoosting,{'model__n_estimators': 50},0.648352,0.835715
18,all_pca,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 50},0.647684,0.729811
32,all_pca,iter1-normal,SC,GradientBoosting,{'model__n_estimators': 50},0.648452,0.77721
39,all_pca,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 50},0.646546,0.739217


# 2. Models all log

In [23]:
data_log = pd.read_csv('../data/process_data/all_log.csv')

In [24]:
models_log = modeling(data = data_log,models = models,version = 'all_log',iteracion='iter1-normal')
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.860846,3.134164
46,all_log,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 150},0.862765,3.397843
25,all_log,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 150},0.861048,3.114346
11,all_log,iter1-normal,NJ,GradientBoosting,{'model__n_estimators': 100},0.862638,3.006616
18,all_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.862695,3.10111
32,all_log,iter1-normal,SC,GradientBoosting,{'model__n_estimators': 150},0.862045,3.271307
39,all_log,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 150},0.861768,3.273246


# 3. Models new values

In [25]:
data_new_values = pd.read_csv('../data/process_data/new_values.csv')

In [26]:
models_new_values = modeling(data = data_new_values,models = models,version = 'new_values',iteracion='iter1-normal')
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.853817,2.327777
46,new_values,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 100},0.852047,2.113591
25,new_values,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 100},0.848437,2.251486
11,new_values,iter1-normal,NJ,GradientBoosting,{'model__n_estimators': 100},0.848907,2.177605
18,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 100},0.849977,2.09667
32,new_values,iter1-normal,SC,GradientBoosting,{'model__n_estimators': 100},0.849707,2.20126
39,new_values,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 100},0.85253,2.104668


# 4. Models new values log

In [27]:
data_new_values_log = pd.read_csv('../data/process_data/new_values_log.csv')

In [28]:
models_new_values_log = modeling(data = data_new_values_log,models = models,version = 'new_values_log',
iteracion='iter1-normal')
models_new_values_log

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.838053,1.797416
46,new_values_log,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 150},0.83565,1.774749
25,new_values_log,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 150},0.837281,1.780355
11,new_values_log,iter1-normal,NJ,GradientBoosting,{'model__n_estimators': 50},0.832451,1.662374
18,new_values_log,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 100},0.837638,1.711322
32,new_values_log,iter1-normal,SC,GradientBoosting,{'model__n_estimators': 100},0.835284,1.716173
39,new_values_log,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 150},0.834602,1.76315


# 5. Get results

In [29]:
frames = [models_pca, models_log, models_new_values, models_new_values_log]
models_result = pd.concat(frames)

models_result = models_result.reset_index(drop=True)

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
0,all_pca,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 50},0.646406,0.726299
1,all_pca,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 50},0.648214,0.70365
2,all_pca,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 50},0.647627,0.708228
3,all_pca,iter1-normal,NJ,GradientBoosting,{'model__n_estimators': 50},0.648352,0.835715
4,all_pca,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 50},0.647684,0.729811
5,all_pca,iter1-normal,SC,GradientBoosting,{'model__n_estimators': 50},0.648452,0.77721
6,all_pca,iter1-normal,VA,GradientBoosting,{'model__n_estimators': 50},0.646546,0.739217
7,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.860846,3.134164
8,all_log,iter1-normal,GA,GradientBoosting,{'model__n_estimators': 150},0.862765,3.397843
9,all_log,iter1-normal,NC,GradientBoosting,{'model__n_estimators': 150},0.861048,3.114346


In [30]:
models_result.to_excel('../data/final_data/bests_models_results.xlsx', index = False)