In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Modeling: regresiones, modelos ML con datos cov

In [2]:
def modeling(data,models,version,iteracion):
    results_dict = {
    'Version_data': version,
    'Version_iteracion': iteracion,
    'State': [],
    'Modelo': [],
    'Parameters': [],
    'R2': [],
    'MAE': [],
    'RMSE': []
}
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data_state.drop(['MedianPrice', 'State'], axis=1)
        y = data_state['MedianPrice'] 
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        for name, model, params in models:
            pipeline = Pipeline([
                ('scaler', StandardScaler()), 
                ('model', model)
            ])

            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
            grid_search.fit(X_scaled, y)

            y_pred = grid_search.predict(X_scaled)
            mae = mean_absolute_error(y, y_pred)
            rmse = mean_squared_error(y, y_pred, squared=False)

            results_dict['State'].append(state)
            results_dict['Modelo'].append(name)
            results_dict['Parameters'].append(grid_search.best_params_)
            results_dict['R2'].append(grid_search.best_score_)
            results_dict['MAE'].append(mae)
            results_dict['RMSE'].append(rmse)

    results_df = pd.DataFrame(results_dict)
    best_models_df = results_df.loc[results_df.groupby('State')['R2'].idxmax()]

    return best_models_df

In [3]:
models = [
    ('SVR', SVR(), {'model__C': [0.1, 1, 10], 'model__gamma': [0.1, 0.01, 0.001]}),
    ('KNeighborsRegressor', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7],'model__weights': ['uniform', 'distance']}),
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [4]:
data_pca = pd.read_csv('../data/process_data/cov_all_pca.csv')

In [5]:
models_pca = modeling(data = data_pca,models = models,version = 'all_pca', iteracion='iter2-cov')
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
3,all_pca,iter2-cov,FL,LinearRegression,{},0.359127,41654.981228,61610.112726
14,all_pca,iter2-cov,GA,RandomForest,{'model__n_estimators': 100},0.783189,6890.929936,9632.263812
23,all_pca,iter2-cov,NC,RandomForest,{'model__n_estimators': 150},0.593113,10096.289562,13555.228835
30,all_pca,iter2-cov,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
42,all_pca,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 150},0.71415,905.391914,1168.007783
53,all_pca,iter2-cov,SC,XGBoost,{'model__n_estimators': 100},0.701486,0.017188,0.024122
59,all_pca,iter2-cov,VA,RandomForest,{'model__n_estimators': 100},0.843227,9131.94382,15302.51075


# 2. Models all log

In [6]:
data_log = pd.read_csv('../data/process_data/cov_all_log.csv')

In [7]:
models_log = modeling(data = data_log,models = models,version = 'all_log',iteracion='iter2-cov')
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.790839,125.193771,159.814824
12,all_log,iter2-cov,GA,LinearRegression,{},0.851904,12861.848128,15928.141442
21,all_log,iter2-cov,NC,LinearRegression,{},0.868628,10247.423473,13541.573294
34,all_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 150},0.5004,4779.97619,7428.495002
42,all_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 150},0.873477,116.930643,148.92499
50,all_log,iter2-cov,SC,RandomForest,{'model__n_estimators': 100},0.828457,6059.755556,8819.669937
62,all_log,iter2-cov,VA,XGBoost,{'model__n_estimators': 150},0.877512,0.02218,0.033156


# 3. Models new values

In [8]:
data_new_values = pd.read_csv('../data/process_data/cov_new_values.csv')

In [9]:
models_new_values = modeling(data = data_new_values,models = models,version = 'new_values',iteracion='iter2-cov')
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,new_values,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.767749,8558.709677,13502.383622
11,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878223,12165.527847,15611.276165
21,new_values,iter2-cov,NC,LinearRegression,{},0.832725,11094.437692,14529.458861
34,new_values,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.523675,4109.75,6892.741777
42,new_values,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 100},0.898738,502.854126,604.001534
47,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.858107,7297.583055,9056.313126
56,new_values,iter2-cov,VA,Lasso,{'model__alpha': 1.0},0.9056,13941.22119,17586.490043


# 4. Models new values log

In [10]:
data_new_values_log = pd.read_csv('../data/process_data/cov_new_values_log.csv')

In [11]:
models_new_values_log = modeling(data = data_new_values_log,models = models,version = 'new_values_log',
iteracion='iter2-cov')
models_new_values_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,new_values_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.79911,1972.90581,2408.717196
11,new_values_log,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.881413,11931.292662,15234.031189
21,new_values_log,iter2-cov,NC,LinearRegression,{},0.840917,11190.1559,14542.430386
34,new_values_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.557757,5126.651515,7297.975488
42,new_values_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 100},0.8961,502.854126,604.001534
47,new_values_log,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.858107,7297.583055,9056.313126
56,new_values_log,iter2-cov,VA,Lasso,{'model__alpha': 1.0},0.916111,13800.867247,17486.793986


# 5. Get results

In [12]:
frames = [models_pca, models_log, models_new_values, models_new_values_log]
models_result = pd.concat(frames)

In [13]:
previous_models_result = pd.read_excel('../data/final_data/models_results.xlsx')
models_result = pd.concat([models_result, previous_models_result], ignore_index = True)

models_result = models_result.reset_index(drop=True).sort_values(by=['State','R2'], ascending = [True,False])

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
21,new_values_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.79911,1972.90581,2408.717196
7,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.790839,125.193771,159.814824
28,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.781811,125.193771,159.814824
29,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.779057,168.839807,220.776706
30,new_values_log,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.774959,8436.806452,17191.242285
14,new_values,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.767749,8558.709677,13502.383622
31,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 150},0.402307,14906.688172,22780.574768
0,all_pca,iter2-cov,FL,LinearRegression,{},0.359127,41654.981228,61610.112726
22,new_values_log,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.881413,11931.292662,15234.031189
15,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878223,12165.527847,15611.276165


In [14]:
models_result.to_excel('../data/final_data/models_results.xlsx', index = False)

In [15]:
indices_max_r2 = models_result.groupby(['State'])['R2'].idxmax()

bests_models = models_result.loc[indices_max_r2]
bests_models

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
21,new_values_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.79911,1972.90581,2408.717196
22,new_values_log,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.881413,11931.292662,15234.031189
9,all_log,iter2-cov,NC,LinearRegression,{},0.868628,10247.423473,13541.573294
3,all_pca,iter2-cov,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
18,new_values,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 100},0.898738,502.854126,604.001534
19,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.858107,7297.583055,9056.313126
27,new_values_log,iter2-cov,VA,Lasso,{'model__alpha': 1.0},0.916111,13800.867247,17486.793986


In [16]:
bests_models.to_excel('../data/final_data/bests_models_results.xlsx', index = False)

# Save data version for bests models

In [17]:
data_FL = data_new_values_log.loc[data_new_values_log['State'] == 'FL']
data_FL.to_csv('../data/final_data/data_FL.csv', index = False)
data_FL.shape

(61, 21)

In [19]:
data_GA = data_new_values_log.loc[data_new_values_log['State'] == 'GA']
data_GA.to_csv('../data/final_data/data_GA.csv', index = False)
data_GA.shape

(154, 21)

In [20]:
data_NC = data_log.loc[data_log['State'] == 'NC']
data_NC.to_csv('../data/final_data/data_NC.csv', index = False)
data_NC.shape

(99, 33)

In [21]:
data_NJ = data_pca.loc[data_pca['State'] == 'NJ']
data_NJ.to_csv('../data/final_data/data_NJ.csv', index = False)
data_NJ.shape

(20, 5)

In [25]:
#new_values = pd.read_csv('../data/process_data/new_values.csv')
data_NY = data_new_values.loc[data_new_values['State'] == 'NY']
data_NY.to_csv('../data/final_data/data_NY.csv', index = False)
data_NY.shape

(56, 21)

In [23]:
data_SC = data_new_values.loc[data_new_values['State'] == 'SC']
data_SC.to_csv('../data/final_data/data_SC.csv', index = False)
data_SC.shape

(45, 21)

In [24]:
data_VA = data_new_values_log.loc[data_new_values_log['State'] == 'VA']
data_VA.to_csv('../data/final_data/data_VA.csv', index = False)
data_VA.shape

(88, 21)