In [10]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
import time

# 1. Get Data: all pca

In [2]:
data = pd.read_csv('../data/process_data/all_pca.csv')

## 1.1 Definir las variables

In [3]:
X = data.drop(['MedianPrice', 'State'], axis=1)
y = data['MedianPrice']

In [4]:
states = data['State'].unique()

# 2. Definir modelos

In [5]:
models = [
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

In [23]:
results_dict = {
    'State': [],
    'Modelo': [],
    'Mejores Parámetros': [],
    'Mejor Puntuación (r2)': [],
    'Tiempo de Ejecución': []
}

# 3. Modeling

In [24]:
for state in states:
    X_state = X[data['State'] == state]
    y_state = y[data['State'] == state]

    for name, model, params in models:
        start_time = time.time()  # Tiempo inicial
        pipeline = Pipeline([
            ('model', model)
        ])
        grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
        grid_search.fit(X, y)
        end_time = time.time()  # Tiempo final
        elapsed_time = end_time - start_time  # Tiempo transcurrido

        results_dict['State'].append(state)
        results_dict['Modelo'].append(name)
        results_dict['Mejores Parámetros'].append(grid_search.best_params_)
        results_dict['Mejor Puntuación (r2)'].append(grid_search.best_score_)
        results_dict['Tiempo de Ejecución'].append(elapsed_time)

In [27]:
results_df = pd.DataFrame(results_dict)
results_df.head()

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
0,FL,Lasso,{'model__alpha': 1.0},0.516362,0.059416
1,FL,LinearRegression,{},0.51635,0.019102
2,FL,KNN,{'model__n_neighbors': 7},0.624617,0.046111
3,FL,RandomForest,{'model__n_estimators': 100},0.624884,2.120383
4,FL,GradientBoosting,{'model__n_estimators': 50},0.648862,0.703188


In [26]:
best_models_df = results_df.loc[results_df.groupby('State')['Mejor Puntuación (r2)'].idxmax()]
best_models_df

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,FL,GradientBoosting,{'model__n_estimators': 50},0.648862,0.703188
46,GA,GradientBoosting,{'model__n_estimators': 50},0.648683,0.704511
25,NC,GradientBoosting,{'model__n_estimators': 50},0.648704,0.713197
11,NJ,GradientBoosting,{'model__n_estimators': 50},0.647144,0.713036
18,NY,GradientBoosting,{'model__n_estimators': 50},0.647918,0.707374
32,SC,GradientBoosting,{'model__n_estimators': 50},0.645786,0.717439
39,VA,GradientBoosting,{'model__n_estimators': 50},0.647695,0.704045


# 2. Get data: all log

In [28]:
data = pd.read_csv('../data/process_data/all_log.csv')

In [29]:
X = data.drop(['MedianPrice', 'State'], axis=1)
y = data['MedianPrice']

In [30]:
for state in states:
    X_state = X[data['State'] == state]
    y_state = y[data['State'] == state]

    for name, model, params in models:
        start_time = time.time()  # Tiempo inicial
        pipeline = Pipeline([
            ('model', model)
        ])
        grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
        grid_search.fit(X, y)
        end_time = time.time()  # Tiempo final
        elapsed_time = end_time - start_time  # Tiempo transcurrido

        results_dict['State'].append(state)
        results_dict['Modelo'].append(name)
        results_dict['Mejores Parámetros'].append(grid_search.best_params_)
        results_dict['Mejor Puntuación (r2)'].append(grid_search.best_score_)
        results_dict['Tiempo de Ejecución'].append(elapsed_time)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [31]:
results_df = pd.DataFrame(results_dict)
results_df.head()

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
0,FL,Lasso,{'model__alpha': 1.0},0.516362,0.059416
1,FL,LinearRegression,{},0.51635,0.019102
2,FL,KNN,{'model__n_neighbors': 7},0.624617,0.046111
3,FL,RandomForest,{'model__n_estimators': 100},0.624884,2.120383
4,FL,GradientBoosting,{'model__n_estimators': 50},0.648862,0.703188


In [32]:
best_models_df = results_df.loc[results_df.groupby('State')['Mejor Puntuación (r2)'].idxmax()]
best_models_df

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
53,FL,GradientBoosting,{'model__n_estimators': 150},0.860244,3.204129
95,GA,GradientBoosting,{'model__n_estimators': 150},0.862373,3.159211
74,NC,GradientBoosting,{'model__n_estimators': 100},0.859925,3.062707
60,NJ,GradientBoosting,{'model__n_estimators': 100},0.861847,3.400519
67,NY,GradientBoosting,{'model__n_estimators': 100},0.860331,3.600708
81,SC,GradientBoosting,{'model__n_estimators': 150},0.862052,3.164395
88,VA,GradientBoosting,{'model__n_estimators': 100},0.861049,3.045958


# 3. Get data: new values

In [43]:
data = pd.read_csv('../data/process_data/new_values.csv')

In [39]:
X = data.drop(['MedianPrice', 'State'], axis=1)
y = data['MedianPrice']

In [40]:
for state in states:
    X_state = X[data['State'] == state]
    y_state = y[data['State'] == state]

    for name, model, params in models:
        start_time = time.time()  # Tiempo inicial
        pipeline = Pipeline([
            ('model', model)
        ])
        grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
        grid_search.fit(X, y)
        end_time = time.time()  # Tiempo final
        elapsed_time = end_time - start_time  # Tiempo transcurrido

        results_dict['State'].append(state)
        results_dict['Modelo'].append(name)
        results_dict['Mejores Parámetros'].append(grid_search.best_params_)
        results_dict['Mejor Puntuación (r2)'].append(grid_search.best_score_)
        results_dict['Tiempo de Ejecución'].append(elapsed_time)

In [41]:
results_df = pd.DataFrame(results_dict)
results_df.head()

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
0,FL,Lasso,{'model__alpha': 1.0},0.516362,0.059416
1,FL,LinearRegression,{},0.51635,0.019102
2,FL,KNN,{'model__n_neighbors': 7},0.624617,0.046111
3,FL,RandomForest,{'model__n_estimators': 100},0.624884,2.120383
4,FL,GradientBoosting,{'model__n_estimators': 50},0.648862,0.703188


In [42]:
best_models_df = results_df.loc[results_df.groupby('State')['Mejor Puntuación (r2)'].idxmax()]
best_models_df

Unnamed: 0,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
53,FL,GradientBoosting,{'model__n_estimators': 150},0.860244,3.204129
95,GA,GradientBoosting,{'model__n_estimators': 150},0.862373,3.159211
74,NC,GradientBoosting,{'model__n_estimators': 100},0.859925,3.062707
60,NJ,GradientBoosting,{'model__n_estimators': 100},0.861847,3.400519
67,NY,GradientBoosting,{'model__n_estimators': 100},0.860331,3.600708
81,SC,GradientBoosting,{'model__n_estimators': 150},0.862052,3.164395
88,VA,GradientBoosting,{'model__n_estimators': 100},0.861049,3.045958
