In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import plotly.express as px
from sklearn.covariance import LedoitWolf, ShrunkCovariance, OAS, EmpiricalCovariance
from scipy.spatial.distance import mahalanobis

In [2]:
# Make "notebook" the default Plotly renderer for figures shown in this notebook.
import plotly.io as pio
pio.renderers.default = "notebook"

# Funciones

In [3]:
# Covariance estimators, keyed by the label stored in the `Version_cov` column.
# Every callable receives the feature matrix X with observations in rows and
# features in columns, and returns an (n_features, n_features) matrix.
covarianzas = {
    # Plain sample covariance. np.cov defaults to rowvar=True (rows are
    # variables), which would return an (n_samples, n_samples) matrix here;
    # rowvar=False makes it treat columns as the variables.
    'cov_h': lambda X: np.cov(X, rowvar=False),
    # Sample covariance with 10 added to every diagonal entry.
    'cov_i': lambda X: np.cov(X.T) + 10*np.eye(X.shape[1]),
    # scikit-learn Ledoit-Wolf estimator.
    'cov_CLW': lambda X: LedoitWolf().fit(X).covariance_,
    # scikit-learn ShrunkCovariance with its default settings.
    'cov_shr': lambda X: ShrunkCovariance().fit(X).covariance_,
    # scikit-learn OAS estimator.
    'cov_oas': lambda X: OAS().fit(X).covariance_,
    # scikit-learn EmpiricalCovariance estimator.
    'cov_epm': lambda X: EmpiricalCovariance().fit(X).covariance_
}

In [4]:
def apply_covariance_outliers(data, covarianzas):
    results_dict = {
        'Version_cov': [],
        'State': [],
        'num_condicion': [],
        'determinante': [],
        'score_modelo_test': [],
        'score_modelo_train': []
    }

    # Entrenamiento modelo con out_liers
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data.drop(['MedianPrice', 'State'], axis=1)
        y = data['MedianPrice'] 

        x_train, x_test, y_train, y_test = train_test_split(X, y)
        model = GradientBoostingRegressor()
        model.fit(x_train, y_train)
        train_score = model.score(x_train, y_train)
        test_score = model.score(x_test, y_test)

        results_dict['State'].append(state)
        results_dict['Version_cov'].append('no_cov')
        results_dict['num_condicion'].append(0)
        results_dict['determinante'].append(0)
        results_dict['score_modelo_test'].append(test_score)
        results_dict['score_modelo_train'].append(train_score)

        for cov_name, cov_func in covarianzas.items():
            try:
                cov = cov_func(X)  # Calcular la covarianza dentro de la función
                # Calcular numero de condicion y determinante
                cond = np.linalg.cond(cov)
                det = np.linalg.det(cov)
                # Calcular distancia de mahalanobis 
                #mahalanobis_dis = mahalanobis(x=X, data=data_state.iloc[:,2:], cov=cov)
                inv_cov = np.linalg.inv(cov) 
                mahalanobis_dis = [mahalanobis(x, np.mean(X, axis=0), inv_cov) for x in X.values]  # Calcular la distancia de Mahalanobis para cada punto

                # Calcular el Rango Intercuartílico (IQR)
                q3 = np.percentile(mahalanobis_dis, 75)
                q1 = np.percentile(mahalanobis_dis, 25)
                iqr = q3 - q1
                limite_superior = q3 + 3 * iqr

                # Entrenar modelo con datos filtrados sin outliers
                x_filtered = X[mahalanobis_dis < limite_superior]
                y_filtered = y[mahalanobis_dis < limite_superior]

                x_train_filtered, x_test_filtered, y_train_filtered, y_test_filtered = train_test_split(x_filtered, y_filtered)
                model = GradientBoostingRegressor()
                model.fit(x_train_filtered, y_train_filtered)
                train_score_filtered = model.score(x_train_filtered, y_train_filtered)
                test_score_filtered = model.score(x_test_filtered, y_test_filtered)

                results_dict['State'].append(state)
                results_dict['Version_cov'].append(cov_name)
                results_dict['num_condicion'].append(cond)
                results_dict['determinante'].append(det)
                results_dict['score_modelo_test'].append(test_score_filtered)
                results_dict['score_modelo_train'].append(train_score_filtered)
            except ValueError:
                results_dict['State'].append(state)
                results_dict['Version_cov'].append(cov_name)
                results_dict['num_condicion'].append(0)  
                results_dict['determinante'].append(0)   
                results_dict['score_modelo_test'].append(0)
                results_dict['score_modelo_train'].append(0)

    return pd.DataFrame(results_dict)

In [5]:

def covariance_filter(data_state, best_cov_version, covarianzas):
    """Drop Mahalanobis outliers from one state's rows.

    Parameters
    ----------
    data_state : pandas.DataFrame
        Rows to filter. Must contain ``'MedianPrice'`` and ``'State'``; every
        other column is treated as a feature.
    best_cov_version : str
        Key into ``covarianzas``, or ``'no_cov'`` to skip filtering.
    covarianzas : dict
        Maps a version label to a callable that takes the feature frame and
        returns its covariance matrix.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_state`` for ``'no_cov'``; otherwise the rows whose
        Mahalanobis distance to the feature mean is below ``Q3 + 3 * IQR``.
    """
    import warnings

    X = data_state.drop(['MedianPrice', 'State'], axis=1)

    if best_cov_version == 'no_cov':
        return data_state.copy()

    cov = covarianzas[best_cov_version](X)

    # Mahalanobis distance of every row to the feature mean.
    inv_cov = np.linalg.inv(cov)
    mean_vector = X.mean(axis=0).to_numpy()
    distances = np.array(
        [mahalanobis(row, mean_vector, inv_cov) for row in X.values]
    )
    if np.isnan(distances).any():
        # NaN distances make the percentiles NaN, so the comparison below is
        # False for every row. Surface that instead of silently returning an
        # empty frame.
        warnings.warn(
            f"Mahalanobis distances contain NaN for version "
            f"'{best_cov_version}'; no rows will be kept."
        )

    # Upper fence from the interquartile range.
    q1, q3 = np.percentile(distances, [25, 75])
    iqr = q3 - q1
    limite_superior = q3 + 3 * iqr

    # Explicit boolean ndarray mask, aligned positionally with data_state.
    return data_state[distances < limite_superior]

In [6]:
def apply_best_covariance_method(data, best_cov, covarianzas):
    """Filter every state with its selected covariance version and stack the results.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; must contain a ``'State'`` column.
    best_cov : pandas.DataFrame
        One row per state with columns ``'State'`` and ``'Version_cov'``.
    covarianzas : dict
        Version label -> covariance callable, passed through to
        ``covariance_filter``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of all states, concatenated in the order the states
        appear in ``best_cov``. Empty if ``best_cov`` has no rows.
    """
    # Collect the per-state frames and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    filtered_parts = [
        covariance_filter(data[data['State'] == state], best_cov_version, covarianzas)
        for state, best_cov_version in zip(best_cov['State'], best_cov['Version_cov'])
    ]

    if not filtered_parts:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(filtered_parts)

# 1. Data all_pca

In [7]:
# Load the all_pca dataset from the processed-data folder.
data_pca = pd.read_csv('../data/process_data/all_pca.csv')

In [8]:
# Non-null counts per column for each state, before any outlier filtering.
data_pca.groupby('State').count()

Unnamed: 0_level_0,MedianPrice,pca1,pca2,pca3
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FL,62,62,62,62
GA,157,157,157,157
NC,99,99,99,99
NJ,20,20,20,20
NY,56,56,56,56
SC,45,45,45,45
VA,89,89,89,89


In [9]:
# Seed NumPy's global RNG. With random_state=None, scikit-learn's
# train_test_split and GradientBoostingRegressor draw from it, so the scores
# (and therefore the "best" version per state) are reproducible on Run All.
np.random.seed(42)
cov_pca = apply_covariance_outliers(data_pca,covarianzas)
# Keep, for each state, the row with the highest test score.
best_pca = cov_pca.loc[cov_pca.groupby('State')['score_modelo_test'].idxmax()]
best_pca

Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
0,no_cov,FL,0.0,0.0,0.809198,0.918112
47,cov_oas,GA,3.975721,0.003635,0.781703,0.921094
24,cov_CLW,NC,3.884405,0.003693,0.764368,0.919859
10,cov_CLW,NJ,3.884405,0.003693,0.768869,0.922184
20,cov_epm,NY,4.131239,0.00354,0.761877,0.917173
28,no_cov,SC,0.0,0.0,0.794307,0.92386
37,cov_i,VA,1.0252,1055.544673,0.772459,0.924667


In [10]:
# Filter each state with the version selected in best_pca, then show the
# per-state counts that remain.
cov_data_pca = apply_best_covariance_method(data_pca,best_pca,covarianzas)
cov_data_pca.groupby('State').count()

Unnamed: 0_level_0,MedianPrice,pca1,pca2,pca3
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FL,62,62,62,62
GA,156,156,156,156
NC,97,97,97,97
NJ,20,20,20,20
NY,55,55,55,55
SC,45,45,45,45
VA,87,87,87,87


In [11]:
# Save the filtered PCA data; index=False leaves the row index out of the CSV.
cov_data_pca.to_csv('../data/process_data/cov_all_pca.csv', index = False)

# 2. Data all_log

In [13]:
# Load the all_log dataset from the processed-data folder.
data_log = pd.read_csv('../data/process_data/all_log.csv')

In [14]:
# Seed NumPy's global RNG. With random_state=None, scikit-learn's
# train_test_split and GradientBoostingRegressor draw from it, so the scores
# (and therefore the "best" version per state) are reproducible on Run All.
np.random.seed(42)
cov_log = apply_covariance_outliers(data_log,covarianzas)
# Keep, for each state, the row with the highest test score.
best_log = cov_log.loc[cov_log.groupby('State')['score_modelo_test'].idxmax()]
best_log


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt



Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
3,cov_CLW,FL,4816.24902,8.115290000000001e-54,0.923722,0.988156
44,cov_i,GA,2.472094,3.6196880000000003e+31,0.937103,0.987506
24,cov_CLW,NC,4816.24902,8.115290000000001e-54,0.926606,0.986754
11,cov_shr,NJ,218.760963,2.155466e-30,0.907863,0.985913
18,cov_shr,NY,218.760963,2.155466e-30,0.924819,0.984563
31,cov_CLW,SC,4816.24902,8.115290000000001e-54,0.931738,0.985923
35,no_cov,VA,0.0,0.0,0.902921,0.986324


In [17]:
# Filter each state with the version selected in best_log, then show the
# per-state counts that remain.
cov_data_log = apply_best_covariance_method(data_log,best_log,covarianzas)
cov_data_log.groupby('State').count()

Unnamed: 0_level_0,HousingUnits,MedianRooms,population,MedianAge,VacantHousingUnits,OwnerOccupied,RenterOccupied,Households,MedianHouseholdIncome,MedianFamilyIncome,...,Humidity,LowTemp,PrecipDays,Precipitation,Pressure,UVIndex,WindSpeed,Desempleo,Ti,MedianPrice
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FL,62,62,62,62,62,62,62,62,62,62,...,62,62,62,62,62,62,62,62,62,62
GA,157,157,157,157,157,157,157,157,157,157,...,157,157,157,157,157,157,157,157,157,157
NC,99,99,99,99,99,99,99,99,99,99,...,99,99,99,99,99,99,99,99,99,99
NJ,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
NY,55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
SC,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
VA,88,88,88,88,88,88,88,88,88,88,...,88,88,88,88,88,88,88,88,88,88


In [18]:
# Save the filtered log data; index=False leaves the row index out of the CSV.
cov_data_log.to_csv('../data/process_data/cov_all_log.csv', index = False)

# 3. Data new values

In [19]:
# Load the new_values dataset from the processed-data folder.
data_new_values = pd.read_csv('../data/process_data/new_values.csv')

In [20]:
# Seed NumPy's global RNG. With random_state=None, scikit-learn's
# train_test_split and GradientBoostingRegressor draw from it, so the scores
# (and therefore the "best" version per state) are reproducible on Run All.
np.random.seed(42)
cov_new_values = apply_covariance_outliers(data_new_values,covarianzas)
# Keep, for each state, the row with the highest test score.
best_new_values = cov_new_values.loc[cov_new_values.groupby('State')['score_modelo_test'].idxmax()]
best_new_values

Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
6,cov_epm,FL,1.0779e+22,2.945267e+37,0.929012,0.982301
42,no_cov,GA,0.0,0.0,0.930105,0.982424
27,cov_epm,NC,1.0779e+22,2.945267e+37,0.911763,0.984441
7,no_cov,NJ,0.0,0.0,0.914241,0.9842
16,cov_i,NY,84183030000000.0,5.03812e+59,0.883506,0.985491
31,cov_CLW,SC,1.0,6.269377e+232,0.917856,0.979926
41,cov_epm,VA,1.0779e+22,2.945267e+37,0.923825,0.983979


In [21]:
# Filter each state with the version selected in best_new_values, then show
# the per-state counts that remain.
cov_data_new_values = apply_best_covariance_method(data_new_values,best_new_values,covarianzas)
cov_data_new_values.groupby('State').count()

Unnamed: 0_level_0,MedianRooms,MedianAge,Households,PerCapitaIncome,MedianNonfamilyIncome,LandAreakm2,Latitude,Longitude,Areakm2,WaterAreakm2,CloudCover,HoursofSun,Precipitation,Desempleo,Ti,pca1,pca2,MedianPrice
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
FL,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62
GA,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157
NC,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
NJ,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
NY,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55
SC,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45
VA,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87


In [22]:
# Save the filtered new_values data; index=False leaves the row index out of the CSV.
cov_data_new_values.to_csv('../data/process_data/cov_new_values.csv', index = False)