In [1]:
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from imblearn.over_sampling import SMOTE

import time
import ast
from collections import defaultdict

# Modeling: Transfer learning en ML

In [6]:
models = {
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [3]:
bests_models = pd.read_excel('../data/final_data/bests_models_results.xlsx')
bests_models

Unnamed: 0,Version_data,Version_iteracion,State,Modelo,parametros,r2,Tiempo de Ejecución
0,new_values_log,iter2-cov,FL,RandomForest,{'model__n_estimators': 50},0.837673,2.591476
1,new_values,iter2-cov,GA,Lasso,{'model__alpha': 1.0},0.878851,0.133511
2,new_values,iter2-cov,NC,LinearRegression,{},0.911957,0.029054
3,all_log,iter2-cov,NJ,AdaBoost,{'model__n_estimators': 50},0.614304,1.878231
4,new_values,iter1-normal,NY,GradientBoosting,{'model__n_estimators': 150},0.906065,0.490726
5,new_values,iter2-cov,SC,Lasso,{'model__alpha': 1.0},0.835556,0.104108
6,new_values,iter2-cov,VA,LinearRegression,{},0.921119,0.027562


In [4]:
mejores_modelos = bests_models[['State','Modelo','parametros','Version_data']].to_dict(orient='records')

modelos_dict = {}
for diccionario in mejores_modelos:
    modelos_dict[diccionario.get('State')] = {'Modelo': diccionario.get('Modelo'), 'parametros': diccionario.get('parametros'),
    'Version_data': diccionario.get('Version_data')}
modelos_dict

{'FL': {'Modelo': 'RandomForest',
  'parametros': "{'model__n_estimators': 50}",
  'Version_data': 'new_values_log'},
 'GA': {'Modelo': 'Lasso',
  'parametros': "{'model__alpha': 1.0}",
  'Version_data': 'new_values'},
 'NC': {'Modelo': 'LinearRegression',
  'parametros': '{}',
  'Version_data': 'new_values'},
 'NJ': {'Modelo': 'AdaBoost',
  'parametros': "{'model__n_estimators': 50}",
  'Version_data': 'all_log'},
 'NY': {'Modelo': 'GradientBoosting',
  'parametros': "{'model__n_estimators': 150}",
  'Version_data': 'new_values'},
 'SC': {'Modelo': 'Lasso',
  'parametros': "{'model__alpha': 1.0}",
  'Version_data': 'new_values'},
 'VA': {'Modelo': 'LinearRegression',
  'parametros': '{}',
  'Version_data': 'new_values'}}

# Aumented data Smote

In [9]:
results_dict = {
    'Version_data': [],
    'Version_iteracion': 'iter3-smote',
    'State': [],
    'Modelo': [],
    'parametros': [],
    'r2': []
}
States = bests_models.State.unique()
directory = '../data/final_data/'

for state in States:
    results_dict['Version_data'].append(modelos_dict[state].get('Version_data'))
    results_dict['Modelo'].append(modelos_dict[state].get('Modelo'))
    results_dict['parametros'].append(modelos_dict[state].get('parametros'))

    filename = f"data_{state}.csv"
    filepath = os.path.join(directory, filename)
    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
        X = data.drop(['MedianPrice', 'State'], axis=1)
        y = data['MedianPrice'] 

        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        
        smote = SMOTE()
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        model_name = modelos_dict[state].get('Modelo')
        model = models.get(model_name)

        if model is not None:
            parametros = modelos_dict[state].get('parametros')

            if isinstance(model, type):
                model.set_params(**parametros)

            if len(X_train_smote) < 2:  # Si el conjunto de datos de entrenamiento es muy pequeño
                r2 = 0  # Establece R2 en 0
            else:
                # Entrenar el modelo y hacer predicciones
                model.fit(X_train_smote, y_train_smote)
                y_pred = model.predict(X_test)

                # Calcular R2
                r2 = r2_score(y_test, y_pred)

            results_dict['r2'].append(r2)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6

In [10]:
len(X_train)

125

In [12]:
results_dict

{'Version_data': ['new_values_log', 'new_values'],
 'Version_iteracion': 'iter3-smote',
 'State': [],
 'Modelo': ['RandomForest', 'Lasso'],
 'parametros': ["{'model__n_estimators': 50}", "{'model__alpha': 1.0}"],
 'r2': [0.8644978822092702]}