In [1]:
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import time
import ast
from collections import defaultdict
import numpy as np

# Modeling: Transfer learning en ML

In [2]:
models = {
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [3]:
bests_models = pd.read_excel('../data/final_data/bests_models_results.xlsx')
bests_models = bests_models.drop('Tiempo de Ejecución', axis = 1)
bests_models

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.837673
1,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851
2,new_values,iter2-cov,NC,LinearRegression,{},0.911957
3,all_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.614304
4,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.906065
5,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835556
6,new_values,iter2-cov,VA,LinearRegression,{},0.921119


In [4]:
mejores_modelos = bests_models[['State','Modelo','parametros','Version_data']].to_dict(orient='records')

modelos_dict = {}
for diccionario in mejores_modelos:
    modelos_dict[diccionario.get('State')] = {'Modelo': diccionario.get('Modelo'), 'parametros': diccionario.get('parametros'),
    'Version_data': diccionario.get('Version_data')}
modelos_dict

{'FL': {'Modelo': 'RandomForest',
  'parametros': "{'model__n_estimators': 50}",
  'Version_data': 'new_values_log'},
 'GA': {'Modelo': 'Lasso',
  'parametros': "{'model__alpha': 1.0}",
  'Version_data': 'new_values'},
 'NC': {'Modelo': 'LinearRegression',
  'parametros': '{}',
  'Version_data': 'new_values'},
 'NJ': {'Modelo': 'AdaBoost',
  'parametros': "{'model__n_estimators': 50}",
  'Version_data': 'all_log'},
 'NY': {'Modelo': 'GradientBoosting',
  'parametros': "{'model__n_estimators': 150}",
  'Version_data': 'new_values'},
 'SC': {'Modelo': 'Lasso',
  'parametros': "{'model__alpha': 1.0}",
  'Version_data': 'new_values'},
 'VA': {'Modelo': 'LinearRegression',
  'parametros': '{}',
  'Version_data': 'new_values'}}

# Aumented data OverSampling

In [5]:
results_dict = {
    'Version_data': [],
    'Version_iteracion': 'iter3-OverSampling',
    'State': [],
    'Modelo': [],
    'parametros': [],
    'r2': []
}
States = bests_models.State.unique()
directory = '../data/final_data/'

for state in States:
    results_dict['Version_data'].append(modelos_dict[state].get('Version_data'))
    results_dict['Modelo'].append(modelos_dict[state].get('Modelo'))
    results_dict['parametros'].append(modelos_dict[state].get('parametros'))
    results_dict['State'].append(state)

    filename = f"data_{state}.csv"
    filepath = os.path.join(directory, filename)
    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
        X = data.drop(['MedianPrice', 'State'], axis=1)
        y = data['MedianPrice'] 

        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

        
        # Sobremuestreo utilizando Random Oversampling
        ros = RandomOverSampler()
        X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

        model_name = modelos_dict[state].get('Modelo')
        model = models.get(model_name)

        if model is not None:
            parametros = modelos_dict[state].get('parametros')

            if isinstance(model, type):
                model.set_params(**parametros)

            # Entrenar el modelo y hacer predicciones
            model.fit(X_train_ros, y_train_ros)
            y_pred = model.predict(X_test)

            # Calcular R2
            r2 = r2_score(y_test, y_pred)

            results_dict['r2'].append(r2)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [6]:
results_df = pd.DataFrame(results_dict)
results_df

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2
0,new_values_log,iter3-OverSampling,FL,RandomForest,{'model__n_estimators': 50},0.835886
1,new_values,iter3-OverSampling,GA,Lasso,{'model__alpha': 1.0},0.900148
2,new_values,iter3-OverSampling,NC,LinearRegression,{},0.917646
3,all_log,iter3-OverSampling,NJ,AdaBoost,{'model__n_estimators': 50},0.901798
4,new_values,iter3-OverSampling,NY,GradientBoosting,{'model__n_estimators': 150},0.940309
5,new_values,iter3-OverSampling,SC,Lasso,{'model__alpha': 1.0},0.845874
6,new_values,iter3-OverSampling,VA,LinearRegression,{},0.940935


In [7]:
models_df =  pd.concat([bests_models, results_df], axis=0).sort_values(by=['State','r2'], ascending = [True,False] )
models_df

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.837673
0,new_values_log,iter3-OverSampling,FL,RandomForest,{'model__n_estimators': 50},0.835886
1,new_values,iter3-OverSampling,GA,Lasso,{'model__alpha': 1.0},0.900148
1,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851
2,new_values,iter3-OverSampling,NC,LinearRegression,{},0.917646
2,new_values,iter2-cov,NC,LinearRegression,{},0.911957
3,all_log,iter3-OverSampling,NJ,AdaBoost,{'model__n_estimators': 50},0.901798
3,all_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.614304
4,new_values,iter3-OverSampling,NY,GradientBoosting,{'model__n_estimators': 150},0.940309
4,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.906065


# Aumentando data distribucion

In [8]:
results_dict = {
    'Version_data': [],
    'Version_iteracion': 'iter3-UnderSampling',
    'State': [],
    'Modelo': [],
    'parametros': [],
    'r2': []
}
States = bests_models.State.unique()
directory = '../data/final_data/'

for state in States:
    results_dict['Version_data'].append(modelos_dict[state].get('Version_data'))
    results_dict['Modelo'].append(modelos_dict[state].get('Modelo'))
    results_dict['parametros'].append(modelos_dict[state].get('parametros'))
    results_dict['State'].append(state)

    filename = f"data_{state}.csv"
    filepath = os.path.join(directory, filename)
    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
        X = data.drop(['MedianPrice', 'State'], axis=1)
        y = data['MedianPrice'] 

        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

        
        # Sobremuestreo utilizando Random Oversampling
        rus = RandomUnderSampler()
        X_train_rus, y_train_rus = ros.fit_resample(X_train, y_train)

        model_name = modelos_dict[state].get('Modelo')
        model = models.get(model_name)

        if model is not None:
            parametros = modelos_dict[state].get('parametros')

            if isinstance(model, type):
                model.set_params(**parametros)

            # Entrenar el modelo y hacer predicciones
            model.fit(X_train_rus, y_train_rus)
            y_pred = model.predict(X_test)

            # Calcular R2
            r2 = r2_score(y_test, y_pred)

            results_dict['r2'].append(r2)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [9]:
results_df = pd.DataFrame(results_dict)
results_df

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2
0,new_values_log,iter3-UnderSampling,FL,RandomForest,{'model__n_estimators': 50},0.846943
1,new_values,iter3-UnderSampling,GA,Lasso,{'model__alpha': 1.0},0.900126
2,new_values,iter3-UnderSampling,NC,LinearRegression,{},0.917646
3,all_log,iter3-UnderSampling,NJ,AdaBoost,{'model__n_estimators': 50},0.799709
4,new_values,iter3-UnderSampling,NY,GradientBoosting,{'model__n_estimators': 150},0.952982
5,new_values,iter3-UnderSampling,SC,Lasso,{'model__alpha': 1.0},0.845874
6,new_values,iter3-UnderSampling,VA,LinearRegression,{},0.940935


In [10]:
models_df =  pd.concat([models_df, results_df], axis=0).sort_values(by=['State','r2'], ascending = [True,False])
models_df

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2
0,new_values_log,iter3-UnderSampling,FL,RandomForest,{'model__n_estimators': 50},0.846943
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.837673
0,new_values_log,iter3-OverSampling,FL,RandomForest,{'model__n_estimators': 50},0.835886
1,new_values,iter3-OverSampling,GA,Lasso,{'model__alpha': 1.0},0.900148
1,new_values,iter3-UnderSampling,GA,Lasso,{'model__alpha': 1.0},0.900126
1,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851
2,new_values,iter3-OverSampling,NC,LinearRegression,{},0.917646
2,new_values,iter3-UnderSampling,NC,LinearRegression,{},0.917646
2,new_values,iter2-cov,NC,LinearRegression,{},0.911957
3,all_log,iter3-OverSampling,NJ,AdaBoost,{'model__n_estimators': 50},0.901798


# Intentando TL 

No se puede, ya que los modelos de ML no son modelos que usan pesos de entrenamiento como las NL. Ademas puede no ser óptimo, ya que los datos de un estado pueden tener características únicas que el modelo haya aprendido a capturar durante el entrenamiento. Por lo tanto, es posible que el rendimiento de las predicciones en un estado diferente no sea tan bueno como en el estado para el cual se entrenó el modelo