In [18]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import time
import ast
from collections import defaultdict

# Modeling: Transfer learning in ML

In [14]:
# Load the model-comparison table stored in bests_models_results.xlsx (the same
# file the last cell of this notebook writes) and preview its first rows.
bests_models = pd.read_excel('../data/final_data/bests_models_results.xlsx')
bests_models.head()

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826589,1.463412
1,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.797439,0.655855
2,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.787347,0.637175
3,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.785308,0.471472
4,new_values,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.779604,0.483782


In [15]:
# Row label of the highest r2 score within each state.
indices_max_r2 = bests_models.groupby(['State'])['Mejor Puntuación (r2)'].idxmax()

# Keep only those winning rows and shorten the two long column names.
# NOTE: because `bests_models` is rebound here, this cell is single-shot: running
# it a second time without re-running the load cell fails, since the renamed
# score column no longer exists.
bests_models = (
    bests_models
    .loc[indices_max_r2]
    .rename(
        columns={
            'Mejores Parámetros': 'parametros',
            'Mejor Puntuación (r2)': 'r2',
        }
    )
)
bests_models

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2,Tiempo de Ejecución
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826589,1.463412
8,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851,0.108074
16,new_values,iter2-cov,NC,LinearRegression,{},0.911957,0.022385
24,all_pca,iter2-cov,NJ,Lasso,{'model__alpha': 1.0},0.595469,0.048842
32,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.906065,0.490726
40,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835556,0.096264
48,new_values,iter2-cov,VA,LinearRegression,{},0.921119,0.022802


In [16]:
# One record (dict) per state with just the fields needed to rebuild its model.
mejores_modelos = (
    bests_models
    .loc[:, ['State', 'Modelo', 'parametros']]
    .to_dict(orient='records')
)
mejores_modelos


[{'State': 'FL',
  'Modelo': 'RandomForest',
  'parametros': "{'model__n_estimators': 100}"},
 {'State': 'GA', 'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 {'State': 'NC', 'Modelo': 'LinearRegression', 'parametros': '{}'},
 {'State': 'NJ', 'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 {'State': 'NY',
  'Modelo': 'GradientBoosting',
  'parametros': "{'model__n_estimators': 150}"},
 {'State': 'SC', 'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 {'State': 'VA', 'Modelo': 'LinearRegression', 'parametros': '{}'}]

In [25]:
# Florida rows only. The file mirrors FL's row in the best-models table
# (Version_data = new_values_log, Version_iteracion = iter2-cov).
data_FL = (
    pd.read_csv('../data/process_data/cov_new_values_log.csv')
    .loc[lambda frame: frame['State'] == 'FL']
)
data_FL.shape

(61, 16)

In [27]:
# Georgia rows only. The file mirrors GA's row in the best-models table
# (Version_data = new_values, Version_iteracion = iter2-cov).
data_GA = (
    pd.read_csv('../data/process_data/cov_new_values.csv')
    .loc[lambda frame: frame['State'] == 'GA']
)
data_GA.shape

(157, 19)

In [29]:
# North Carolina rows only. The file mirrors NC's row in the best-models table
# (Version_data = new_values, Version_iteracion = iter2-cov).
data_NC = (
    pd.read_csv('../data/process_data/cov_new_values.csv')
    .loc[lambda frame: frame['State'] == 'NC']
)
data_NC.shape

(99, 19)

In [30]:
# New Jersey rows only. The file mirrors NJ's row in the best-models table
# (Version_data = all_pca, Version_iteracion = iter2-cov).
data_NJ = (
    pd.read_csv('../data/process_data/cov_all_pca.csv')
    .loc[lambda frame: frame['State'] == 'NJ']
)
data_NJ.shape

(20, 5)

In [31]:
# New York rows only. NY's best row in the table is new_values / iter1-normal,
# presumably why this is the only load that uses the file without the `cov_`
# prefix -- TODO confirm the file/iteration naming convention.
data_NY = (
    pd.read_csv('../data/process_data/new_values.csv')
    .loc[lambda frame: frame['State'] == 'NY']
)
data_NY.shape

(56, 19)

In [32]:
# South Carolina rows only. The file mirrors SC's row in the best-models table
# (Version_data = new_values, Version_iteracion = iter2-cov).
data_SC = (
    pd.read_csv('../data/process_data/cov_new_values.csv')
    .loc[lambda frame: frame['State'] == 'SC']
)
data_SC.shape

(45, 19)

In [None]:
# Virginia rows only. VA is the one state in the best-models table that had no
# data frame of its own; its best row is new_values / iter2-cov, i.e. the same
# CSV that is used for GA, NC and SC.
data_VA = pd.read_csv('../data/process_data/cov_new_values.csv')
data_VA = data_VA.loc[data_VA['State'] == 'VA']
data_VA.shape

In [22]:
# Re-key the per-state records by state code:
#   {state: {'Modelo': <model name>, 'parametros': <parameter string>}}
modelos_dict = {
    registro.get('State'): {
        campo: registro.get(campo) for campo in ('Modelo', 'parametros')
    }
    for registro in mejores_modelos
}
modelos_dict

{'FL': {'Modelo': 'RandomForest',
  'parametros': "{'model__n_estimators': 100}"},
 'GA': {'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 'NC': {'Modelo': 'LinearRegression', 'parametros': '{}'},
 'NJ': {'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 'NY': {'Modelo': 'GradientBoosting',
  'parametros': "{'model__n_estimators': 150}"},
 'SC': {'Modelo': 'Lasso', 'parametros': "{'model__alpha': 1.0}"},
 'VA': {'Modelo': 'LinearRegression', 'parametros': '{}'}}

In [23]:
# Spot-check: look up the model name and parameters recorded for Florida.
modelos_dict['FL']

{'Modelo': 'RandomForest', 'parametros': "{'model__n_estimators': 100}"}

In [2]:
def modeling(data, models, version, iteracion, cv=5):
    """Grid-search every candidate model per state and return each state's best.

    For each distinct value of ``data['State']`` the rows of that state are
    split into features (every column except ``MedianPrice`` and ``State``)
    and target (``MedianPrice``). Each candidate is wrapped in a one-step
    ``Pipeline`` whose step is named ``'model'`` (so grid keys such as
    ``'model__alpha'`` resolve) and tuned with ``GridSearchCV`` scored by r2.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``State`` and ``MedianPrice``; all remaining
        columns are used as features.
    models : iterable of (name, estimator, param_grid)
        Candidate models. ``param_grid`` keys must use the ``model__`` prefix.
    version : str
        Label stored in the ``Version_data`` column.
    iteracion : str
        Label stored in the ``Version_iteracion`` column.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per state (the candidate with the highest cross-validated r2),
        with columns ``Version_data``, ``Version_iteracion``, ``State``,
        ``Modelo``, ``Mejores Parámetros``, ``Mejor Puntuación (r2)`` and
        ``Tiempo de Ejecución`` (seconds spent fitting the grid search).
        The frame is empty, but has these columns, when nothing was evaluated.
    """
    result_columns = [
        'Version_data',
        'Version_iteracion',
        'State',
        'Modelo',
        'Mejores Parámetros',
        'Mejor Puntuación (r2)',
        'Tiempo de Ejecución',
    ]
    records = []

    for state in data['State'].unique():
        data_state = data[data['State'] == state]
        X = data_state.drop(['MedianPrice', 'State'], axis=1)
        y = data_state['MedianPrice']

        for name, model, param_grid in models:
            # GridSearchCV fits clones, so the estimator objects in `models`
            # stay unfitted and can be reused for the next state.
            pipeline = Pipeline([('model', model)])
            grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='r2')

            start_time = time.time()
            grid_search.fit(X, y)
            elapsed_time = time.time() - start_time

            records.append({
                'Version_data': version,
                'Version_iteracion': iteracion,
                'State': state,
                'Modelo': name,
                'Mejores Parámetros': grid_search.best_params_,
                'Mejor Puntuación (r2)': grid_search.best_score_,
                'Tiempo de Ejecución': elapsed_time,
            })

    results_df = pd.DataFrame(records, columns=result_columns)
    if results_df.empty:
        # No state or no candidate model: nothing to rank.
        return results_df

    best_rows = results_df.groupby('State')['Mejor Puntuación (r2)'].idxmax()
    best_models_df = results_df.loc[best_rows]

    return best_models_df

In [3]:
# Fixed seed for the stochastic (tree-ensemble) regressors, so that repeated
# runs give the same scores and therefore the same "best model" per state.
RANDOM_STATE = 42

# Candidate regressors as (name, estimator, param_grid) triples.
# The 'model__' prefix on the grid keys presumably targets a Pipeline step
# named 'model' -- keep it in sync with the function that consumes this list.
models = [
    ('Lasso', Lasso(), {'model__alpha': [0.1, 0.5, 1.0]}),
    ('LinearRegression', LinearRegression(), {}),
    ('KNN', KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    ('RandomForest', RandomForestRegressor(random_state=RANDOM_STATE), {'model__n_estimators': [50, 100, 150]}),
    ('GradientBoosting', GradientBoostingRegressor(random_state=RANDOM_STATE), {'model__n_estimators': [50, 100, 150]}),
    ('AdaBoost', AdaBoostRegressor(random_state=RANDOM_STATE), {'model__n_estimators': [50, 100, 150]}),
    ('XGBoost', XGBRegressor(random_state=RANDOM_STATE), {'model__n_estimators': [50, 100, 150]})
]

# 1. Models all pca

In [4]:
# Input table for section 1 (labelled 'all_pca' in the modeling call that follows).
data_pca = pd.read_csv('../data/process_data/cov_all_pca.csv')

In [5]:
# Best model per state on the PCA table.
models_pca = modeling(
    data=data_pca,
    models=models,
    version='all_pca',
    iteracion='iter2-cov',
)
models_pca

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
3,all_pca,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.384206,1.354549
10,all_pca,iter2-cov,GA,RandomForest,{'model__n_estimators': 50},0.779118,1.464776
14,all_pca,iter2-cov,NC,Lasso,{'model__alpha': 1.0},0.549884,0.049882
21,all_pca,iter2-cov,NJ,Lasso,{'model__alpha': 1.0},0.595469,0.048842
32,all_pca,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.807306,0.318017
41,all_pca,iter2-cov,SC,XGBoost,{'model__n_estimators': 100},0.701486,0.571107
42,all_pca,iter2-cov,VA,Lasso,{'model__alpha': 1.0},0.830697,0.049115


# 2. Models all log

In [6]:
# Input table for section 2 (labelled 'all_log' in the modeling call that follows).
data_log = pd.read_csv('../data/process_data/cov_all_log.csv')

In [7]:
# Best model per state on the 'all_log' table.
models_log = modeling(
    data=data_log,
    models=models,
    version='all_log',
    iteracion='iter2-cov',
)
models_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.797439,0.655855
8,all_log,iter2-cov,GA,LinearRegression,{},0.825071,0.073858
15,all_log,iter2-cov,NC,LinearRegression,{},0.871666,0.024883
21,all_log,iter2-cov,NJ,Lasso,{'model__alpha': 0.1},0.419303,0.093221
32,all_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.881164,0.545972
38,all_log,iter2-cov,SC,RandomForest,{'model__n_estimators': 150},0.804975,1.618101
46,all_log,iter2-cov,VA,GradientBoosting,{'model__n_estimators': 100},0.873708,0.700798


# 3. Models new values

In [8]:
# Input table for section 3 (labelled 'new_values' in the modeling call that follows).
data_new_values = pd.read_csv('../data/process_data/cov_new_values.csv')

In [9]:
# Best model per state on the 'new_values' table.
models_new_values = modeling(
    data=data_new_values,
    models=models,
    version='new_values',
    iteracion='iter2-cov',
)
models_new_values

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
4,new_values,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.779604,0.483782
7,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851,0.108074
15,new_values,iter2-cov,NC,LinearRegression,{},0.911957,0.022385
24,new_values,iter2-cov,NJ,RandomForest,{'model__n_estimators': 100},0.320077,1.398338
32,new_values,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.892287,0.453113
35,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835556,0.096264
43,new_values,iter2-cov,VA,LinearRegression,{},0.921119,0.022802


# 4. Models new values log

In [10]:
# Input table for section 4 (labelled 'new_values_log' in the modeling call that follows).
data_new_values_log = pd.read_csv('../data/process_data/cov_new_values_log.csv')

In [11]:
# Best model per state on the 'new_values_log' table.
models_new_values_log = modeling(
    data=data_new_values_log,
    models=models,
    version='new_values_log',
    iteracion='iter2-cov',
)
models_new_values_log

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
3,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826589,1.463412
7,new_values_log,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.809128,0.096859
14,new_values_log,iter2-cov,NC,Lasso,{'model__alpha': 1.0},0.844879,0.061027
26,new_values_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.427998,1.224141
32,new_values_log,iter2-cov,NY,GradientBoosting,{'model__n_estimators': 50},0.898185,0.421525
38,new_values_log,iter2-cov,SC,RandomForest,{'model__n_estimators': 150},0.823417,1.457818
45,new_values_log,iter2-cov,VA,RandomForest,{'model__n_estimators': 150},0.761259,1.61233


# 5. Get results

In [12]:
# Stack the best-per-state tables of the four dataset versions into one frame.
frames = [
    models_pca,
    models_log,
    models_new_values,
    models_new_values_log,
]
models_result = pd.concat(frames)

In [13]:
# History of results already stored on disk by earlier runs.
previous_models_result = pd.read_excel('../data/final_data/bests_models_results.xlsx')

# Merge this run with the history. The rows of this run come first, so when a
# (dataset version, iteration, state) combination is already present in the
# file -- or this cell is executed twice -- drop_duplicates keeps the fresh row
# instead of letting copies pile up. The de-duplication key deliberately leaves
# out 'Mejores Parámetros', which holds dicts for fresh rows and is not hashable.
models_result = (
    pd.concat([models_result, previous_models_result], ignore_index=True)
    .drop_duplicates(subset=['Version_data', 'Version_iteracion', 'State'], keep='first')
    .reset_index(drop=True)
    .sort_values(by=['State', 'Mejor Puntuación (r2)'], ascending=[True, False])
)

models_result

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,Mejores Parámetros,Mejor Puntuación (r2),Tiempo de Ejecución
21,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.826589,1.463412
7,all_log,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 150},0.797439,0.655855
28,all_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.787347,0.637175
29,new_values_log,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.785308,0.471472
14,new_values,iter2-cov,FL,GradientBoosting,{'model__n_estimators': 50},0.779604,0.483782
30,new_values,iter1-normal,FL,GradientBoosting,{'model__n_estimators': 150},0.773678,0.542457
31,all_pca,iter1-normal,FL,RandomForest,{'model__n_estimators': 150},0.384487,1.400077
0,all_pca,iter2-cov,FL,RandomForest,{'model__n_estimators': 100},0.384206,1.354549
15,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851,0.108074
32,new_values,iter1-normal,GA,Lasso,{'model__alpha': 1.0},0.878851,0.11295


In [14]:
# Persist the merged ranking, overwriting the same workbook that was read as
# the history; index=False keeps the DataFrame index out of the sheet.
models_result.to_excel('../data/final_data/bests_models_results.xlsx', index = False)