In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Modeling: regresiones, modelos ML con datos cov

In [2]:
def modeling(data,models,version,iteracion):
    results_dict = {
    'Version_data': version,
    'Version_iteracion': iteracion,
    'State': [],
    'Modelo': [],
    'Parameters': [],
    'R2': [],
    'MAE': [],
    'RMSE': []
}
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data_state.drop(['MedianPrice', 'State'], axis=1)
        y = data_state['MedianPrice'] 
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        for name, model, params in models:
            pipeline = Pipeline([
                ('scaler', StandardScaler()), 
                ('model', model)
            ])

            grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='r2')
            grid_search.fit(X_scaled, y)

            y_pred = grid_search.predict(X_scaled)
            mae = mean_absolute_error(y, y_pred)
            rmse = mean_squared_error(y, y_pred, squared=False)

            results_dict['State'].append(state)
            results_dict['Modelo'].append(name)
            results_dict['Parameters'].append(grid_search.best_params_)
            results_dict['R2'].append(grid_search.best_score_)
            results_dict['MAE'].append(mae)
            results_dict['RMSE'].append(rmse)

    results_df = pd.DataFrame(results_dict)
    best_models_df = results_df.loc[results_df.groupby('State')['R2'].idxmax()]

    return best_models_df

In [3]:
models = [
    ('SVR', SVR(), {'model__C': [0.1, 1, 10], 'model__gamma': [0.1, 0.01, 0.001]}),
    ('KNeighborsRegressor', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7],'model__weights': ['uniform', 'distance']}),
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [4]:
data_pca = pd.read_csv('../data/process_data/cov_all_pca.csv')

In [5]:
models_pca = modeling(data = data_pca,models = models,version = 'all_pca', iteracion='iter2-cov')
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,all_pca,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.40919,14708.048387,22543.734863
14,all_pca,iter2-cov,GA,RandomForest,{'model__n_estimators': 150},0.77902,6861.568376,9343.132818
20,all_pca,iter2-cov,NC,Lasso,{'model__alpha': 1.0},0.549864,24906.677634,33257.927037
30,all_pca,iter2-cov,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
42,all_pca,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.813786,5819.39792,7679.942552
53,all_pca,iter2-cov,SC,XGBoost,{'model__n_estimators': 100},0.701486,0.017188,0.024122
56,all_pca,iter2-cov,VA,Lasso,{'model__alpha': 1.0},0.830683,23217.920939,34028.015829


# 2. Models all log

In [6]:
data_log = pd.read_csv('../data/process_data/cov_all_log.csv')

In [7]:
models_log = modeling(data = data_log,models = models,version = 'all_log',iteracion='iter2-cov')
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.783872,125.193771,159.814824
12,all_log,iter2-cov,GA,LinearRegression,{},0.824765,13154.660034,16221.991812
21,all_log,iter2-cov,NC,LinearRegression,{},0.868628,10247.423473,13541.573294
34,all_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.554114,4502.916667,7269.256276
42,all_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.86947,2271.366454,2823.442958
50,all_log,iter2-cov,SC,RandomForest,{'model__n_estimators': 150},0.816218,5966.355556,8828.182552
60,all_log,iter2-cov,VA,GradientBoosting,{'model__n_estimators': 50},0.854896,4293.007636,5515.274653


# 3. Models new values

In [8]:
data_new_values = pd.read_csv('../data/process_data/cov_new_values.csv')

In [9]:
models_new_values = modeling(data = data_new_values,models = models,version = 'new_values',iteracion='iter2-cov')
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
6,new_values,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 100},0.798669,468.969574,553.694517
11,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726
21,new_values,iter2-cov,NC,LinearRegression,{},0.911957,10052.628324,12590.826233
28,new_values,iter2-cov,NJ,KNeighborsRegressor,"{'model__n_neighbors': 3, 'model__weights': 'u...",0.452062,31431.666667,37522.367403
42,new_values,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.891087,2008.395984,2567.204918
47,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835408,8479.319918,10491.318644
57,new_values,iter2-cov,VA,LinearRegression,{},0.921119,12934.661538,16856.425698


# 4. Models new values log

In [10]:
data_new_values_log = pd.read_csv('../data/process_data/cov_new_values_log.csv')

In [11]:
models_new_values_log = modeling(data = data_new_values_log,models = models,version = 'new_values_log',
iteracion='iter2-cov')
models_new_values_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
5,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826246,6985.360656,9883.787697
11,new_values_log,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.808714,16070.501895,20573.793801
20,new_values_log,iter2-cov,NC,Lasso,{'model__alpha': 1.0},0.843991,14464.243178,18002.906382
34,new_values_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 100},0.455004,5754.222222,8846.489002
42,new_values_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.896933,2573.887605,3186.684345
50,new_values_log,iter2-cov,SC,RandomForest,{'model__n_estimators': 100},0.815648,6078.955556,8306.092745
55,new_values_log,iter2-cov,VA,KNeighborsRegressor,"{'model__n_neighbors': 5, 'model__weights': 'd...",0.766963,0.0,0.0


# 5. Get results

In [12]:
frames = [models_pca, models_log, models_new_values, models_new_values_log]
models_result = pd.concat(frames)

In [13]:
previous_models_result = pd.read_excel('../data/final_data/models_results.xlsx')
models_result = pd.concat([models_result, previous_models_result], ignore_index = True)

models_result = models_result.reset_index(drop=True).sort_values(by=['State','R2'], ascending = [True,False])

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
21,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826246,6985.360656,9883.787697
14,new_values,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 100},0.798669,468.969574,553.694517
28,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.79793,126.93948,149.012802
29,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.796257,125.193771,159.814824
30,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.792581,231.957811,281.239279
7,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.783872,125.193771,159.814824
0,all_pca,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.40919,14708.048387,22543.734863
31,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 50},0.375347,15149.451613,24454.906091
15,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726
32,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726


In [14]:
models_result.to_excel('../data/final_data/models_results.xlsx', index = False)

In [15]:
indices_max_r2 = models_result.groupby(['State'])['R2'].idxmax()

bests_models = models_result.loc[indices_max_r2]
bests_models

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Parameters,R2,MAE,RMSE
21,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826246,6985.360656,9883.787697
15,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878805,12267.590107,15930.705726
16,new_values,iter2-cov,NC,LinearRegression,{},0.911957,10052.628324,12590.826233
3,all_pca,iter2-cov,NJ,LinearRegression,{},0.595462,31778.200052,35635.501147
44,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 100},0.899687,370.935292,443.085844
19,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835408,8479.319918,10491.318644
20,new_values,iter2-cov,VA,LinearRegression,{},0.921119,12934.661538,16856.425698


In [16]:
bests_models.to_excel('../data/final_data/bests_models_results.xlsx', index = False)

# Save data version for bests models

In [17]:
data_FL = data_new_values_log.loc[data_new_values_log['State'] == 'FL']
data_FL.to_csv('../data/final_data/data_FL.csv', index = False)
data_FL.shape

(61, 16)

In [18]:
data_GA = data_new_values.loc[data_new_values['State'] == 'GA']
data_GA.to_csv('../data/final_data/data_GA.csv', index = False)
data_GA.shape

(157, 19)

In [19]:
data_NC = data_new_values.loc[data_new_values['State'] == 'NC']
data_NC.to_csv('../data/final_data/data_NC.csv', index = False)
data_NC.shape

(99, 19)

In [20]:
data_NJ = data_pca.loc[data_pca['State'] == 'NJ']
data_NJ.to_csv('../data/final_data/data_NJ.csv', index = False)
data_NJ.shape

(20, 5)

In [21]:
new_values = pd.read_csv('../data/process_data/new_values.csv')
data_NY = new_values.loc[new_values['State'] == 'NY']
data_NY.to_csv('../data/final_data/data_NY.csv', index = False)
data_NY.shape

(56, 19)

In [22]:
data_SC = data_new_values.loc[data_new_values['State'] == 'SC']
data_SC.to_csv('../data/final_data/data_SC.csv', index = False)
data_SC.shape

(45, 19)

In [23]:
data_VA = data_new_values.loc[data_new_values['State'] == 'VA']
data_VA.to_csv('../data/final_data/data_VA.csv', index = False)
data_VA.shape

(87, 19)