In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import plotly.express as px
from sklearn.covariance import LedoitWolf, ShrunkCovariance, OAS, EmpiricalCovariance
from scipy.spatial.distance import mahalanobis



In [2]:
import plotly.io as pio
pio.renderers.default = "notebook"

# Funciones

In [3]:
covarianzas = {
    'cov_h': np.cov,
    'cov_i': lambda X: np.cov(X.T) + 10*np.eye(X.shape[1]),
    'cov_CLW': lambda X: LedoitWolf().fit(X).covariance_,
    'cov_shr': lambda X: ShrunkCovariance().fit(X).covariance_,
    'cov_oas': lambda X: OAS().fit(X).covariance_,
    'cov_epm': lambda X: EmpiricalCovariance().fit(X).covariance_
}

In [4]:
def apply_covariance_outliers(data, covarianzas):
    results_dict = {
        'Version_cov': [],
        'State': [],
        'num_condicion': [],
        'determinante': [],
        'score_modelo_test': [],
        'score_modelo_train': []
    }

    # Entrenamiento modelo con out_liers
    states = data['State'].unique()

    for state in states:
        data_state = data[data['State'] == state]
        X = data.drop(['MedianPrice', 'State'], axis=1)
        y = data['MedianPrice'] 

        x_train, x_test, y_train, y_test = train_test_split(X, y)
        model = GradientBoostingRegressor()
        model.fit(x_train, y_train)
        train_score = model.score(x_train, y_train)
        test_score = model.score(x_test, y_test)

        results_dict['State'].append(state)
        results_dict['Version_cov'].append('no_cov')
        results_dict['num_condicion'].append(0)
        results_dict['determinante'].append(0)
        results_dict['score_modelo_test'].append(test_score)
        results_dict['score_modelo_train'].append(train_score)

        for cov_name, cov_func in covarianzas.items():
            try:
                cov = cov_func(X)  # Calcular la covarianza dentro de la función
                # Calcular numero de condicion y determinante
                cond = np.linalg.cond(cov)
                det = np.linalg.det(cov)
                # Calcular distancia de mahalanobis 
                #mahalanobis_dis = mahalanobis(x=X, data=data_state.iloc[:,2:], cov=cov)
                inv_cov = np.linalg.inv(cov) 
                mahalanobis_dis = [mahalanobis(x, np.mean(X, axis=0), inv_cov) for x in X.values]  # Calcular la distancia de Mahalanobis para cada punto

                # Calcular el Rango Intercuartílico (IQR)
                q3 = np.percentile(mahalanobis_dis, 75)
                q1 = np.percentile(mahalanobis_dis, 25)
                iqr = q3 - q1
                limite_superior = q3 + 3 * iqr

                # Entrenar modelo con datos filtrados sin outliers
                x_filtered = X[mahalanobis_dis < limite_superior]
                y_filtered = y[mahalanobis_dis < limite_superior]

                x_train_filtered, x_test_filtered, y_train_filtered, y_test_filtered = train_test_split(x_filtered, y_filtered)
                model = GradientBoostingRegressor()
                model.fit(x_train_filtered, y_train_filtered)
                train_score_filtered = model.score(x_train_filtered, y_train_filtered)
                test_score_filtered = model.score(x_test_filtered, y_test_filtered)

                results_dict['State'].append(state)
                results_dict['Version_cov'].append(cov_name)
                results_dict['num_condicion'].append(cond)
                results_dict['determinante'].append(det)
                results_dict['score_modelo_test'].append(test_score_filtered)
                results_dict['score_modelo_train'].append(train_score_filtered)
            except ValueError:
                results_dict['State'].append(state)
                results_dict['Version_cov'].append(cov_name)
                results_dict['num_condicion'].append(0)  
                results_dict['determinante'].append(0)   
                results_dict['score_modelo_test'].append(0)
                results_dict['score_modelo_train'].append(0)

    return pd.DataFrame(results_dict)

In [5]:

def covariance_filter(data_state, best_cov_version, covarianzas):
    X = data_state.drop(['MedianPrice', 'State'], axis=1)
    
    if best_cov_version == 'no_cov':
        filtered_data = data_state.copy()
    else:
        cov = covarianzas[best_cov_version](X)
        cond = np.linalg.cond(cov)
        det = np.linalg.det(cov)

        # Calcular distancia de Mahalanobis
        inv_cov = np.linalg.inv(cov)
        mean_vector = np.mean(X, axis=0)
        mahalanobis_dis = [mahalanobis(x, mean_vector, inv_cov) for x in X.values]

        # Calcular el Rango Intercuartílico (IQR)
        q3 = np.percentile(mahalanobis_dis, 75)
        q1 = np.percentile(mahalanobis_dis, 25)
        iqr = q3 - q1
        limite_superior = q3 + 3 * iqr

        # Filtrar los datos basados en la distancia de Mahalanobis
        filtered_data = data_state[mahalanobis_dis < limite_superior]

    return filtered_data

In [6]:
def apply_best_covariance_method(data, best_cov, covarianzas):
    filtered_data = pd.DataFrame()

    for state, best_cov_version in best_cov[['State', 'Version_cov']].values:
        data_state = data[data['State'] == state]
        filtered_state_data = covariance_filter(data_state, best_cov_version, covarianzas)
        filtered_data = pd.concat([filtered_data, filtered_state_data])

    return filtered_data

# 1. Data all_pca

In [7]:
data_pca = pd.read_csv('../data/process_data/all_pca.csv')

In [8]:
data_pca.groupby('State').count()

Unnamed: 0_level_0,MedianPrice,pca1,pca2,pca3
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FL,62,62,62,62
GA,157,157,157,157
NC,99,99,99,99
NJ,20,20,20,20
NY,56,56,56,56
SC,45,45,45,45
VA,89,89,89,89


In [9]:
cov_pca = apply_covariance_outliers(data_pca,covarianzas)
best_pca = cov_pca.loc[cov_pca.groupby('State')['score_modelo_test'].idxmax()]
best_pca

Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
3,cov_CLW,FL,3.884405,0.003693,0.784461,0.911401
42,no_cov,GA,0.0,0.0,0.759433,0.924865
21,no_cov,NC,0.0,0.0,0.755696,0.921872
11,cov_shr,NJ,3.505354,0.00395,0.822359,0.915314
19,cov_oas,NY,3.975721,0.003635,0.799892,0.916956
30,cov_i,SC,1.0252,1055.544673,0.767136,0.920743
35,no_cov,VA,0.0,0.0,0.802613,0.918818


In [10]:
cov_data_pca = apply_best_covariance_method(data_pca,best_pca,covarianzas)
cov_data_pca.groupby('State').count()

Unnamed: 0_level_0,MedianPrice,pca1,pca2,pca3
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FL,61,61,61,61
GA,157,157,157,157
NC,99,99,99,99
NJ,20,20,20,20
NY,54,54,54,54
SC,45,45,45,45
VA,89,89,89,89


In [11]:
cov_data_pca.to_csv('../data/process_data/cov_all_pca.csv', index = False)

# 2. Data all_log

In [12]:
data_log = pd.read_csv('../data/process_data/all_log.csv')

In [13]:
cov_log = apply_covariance_outliers(data_log,covarianzas)
best_log = cov_log.loc[cov_log.groupby('State')['score_modelo_test'].idxmax()]
best_log


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt


invalid value encountered in sqrt



Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
5,cov_oas,FL,4664.438402,1.225498e-53,0.920784,0.985352
47,cov_oas,GA,4664.438402,1.225498e-53,0.908951,0.986813
26,cov_oas,NC,4664.438402,1.225498e-53,0.926387,0.98522
10,cov_CLW,NJ,4816.24902,8.115290000000001e-54,0.915757,0.98547
17,cov_CLW,NY,4816.24902,8.115290000000001e-54,0.906497,0.987103
33,cov_oas,SC,4664.438402,1.225498e-53,0.932161,0.984373
38,cov_CLW,VA,4816.24902,8.115290000000001e-54,0.927633,0.985338


In [14]:
cov_data_log = apply_best_covariance_method(data_log,best_log,covarianzas)
cov_data_log.groupby('State').count()

Unnamed: 0_level_0,HousingUnits,MedianRooms,population,MedianAge,VacantHousingUnits,OwnerOccupied,RenterOccupied,Households,MedianHouseholdIncome,MedianFamilyIncome,...,Humidity,LowTemp,PrecipDays,Precipitation,Pressure,UVIndex,WindSpeed,Desempleo,Ti,MedianPrice
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FL,62,62,62,62,62,62,62,62,62,62,...,62,62,62,62,62,62,62,62,62,62
GA,156,156,156,156,156,156,156,156,156,156,...,156,156,156,156,156,156,156,156,156,156
NC,99,99,99,99,99,99,99,99,99,99,...,99,99,99,99,99,99,99,99,99,99
NJ,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
NY,55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
SC,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
VA,87,87,87,87,87,87,87,87,87,87,...,87,87,87,87,87,87,87,87,87,87


In [15]:
cov_data_log.to_csv('../data/process_data/cov_all_log.csv', index = False)

# 3. Data new values

In [16]:
data_new_values = pd.read_csv('../data/process_data/new_values.csv')

In [17]:
cov_new_values = apply_covariance_outliers(data_new_values,covarianzas)
best_new_values = cov_new_values.loc[cov_new_values.groupby('State')['score_modelo_test'].idxmax()]
best_new_values

Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
0,no_cov,FL,0.0,0.0,0.940577,0.987654
42,no_cov,GA,0.0,0.0,0.931909,0.986792
23,cov_i,NC,84183030000000.0,2.023926e+74,0.93505,0.98676
9,cov_i,NJ,84183030000000.0,2.023926e+74,0.948935,0.986418
14,no_cov,NY,0.0,0.0,0.933387,0.987476
34,cov_epm,SC,1.067292e+22,5.5910329999999994e+51,0.943869,0.985362
37,cov_i,VA,84183030000000.0,2.023926e+74,0.924025,0.985884


In [18]:
cov_data_new_values = apply_best_covariance_method(data_new_values,best_new_values,covarianzas)
cov_data_new_values.groupby('State').count()

Unnamed: 0_level_0,MedianRooms,MedianAge,Households,MedianHouseholdIncome,MedianFamilyIncome,PerCapitaIncome,MedianNonfamilyIncome,LandAreakm2,Latitude,Areakm2,WaterAreakm2,CloudCover,HoursofSun,Precipitation,WindSpeed,Desempleo,Ti,pca1,pca2,MedianPrice
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
FL,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62,62
GA,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157
NC,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96,96
NJ,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
NY,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56
SC,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45
VA,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87,87


In [19]:
cov_data_new_values.to_csv('../data/process_data/cov_new_values.csv', index = False)

# 4. Cov new_values log

In [20]:
data_new_valueslog = pd.read_csv('../data/process_data/new_values_log.csv')

In [21]:
cov_new_values = apply_covariance_outliers(data_new_valueslog,covarianzas)
best_new_values = cov_new_values.loc[cov_new_values.groupby('State')['score_modelo_test'].idxmax()]
best_new_values

Unnamed: 0,Version_cov,State,num_condicion,determinante,score_modelo_test,score_modelo_train
5,cov_oas,FL,1120.242632,2.688069e-25,0.916855,0.984943
47,cov_oas,GA,1120.242632,2.688069e-25,0.928217,0.98576
26,cov_oas,NC,1120.242632,2.688069e-25,0.933983,0.984886
10,cov_CLW,NJ,1030.608646,3.9819620000000003e-25,0.931371,0.985314
14,no_cov,NY,0.0,0.0,0.917117,0.984779
28,no_cov,SC,0.0,0.0,0.926596,0.988003
41,cov_epm,VA,5857.659768,1.072227e-27,0.920582,0.985548


In [22]:
cov_data_new_values = apply_best_covariance_method(data_new_values,best_new_values,covarianzas)
cov_data_new_values.groupby('State').count()

Unnamed: 0_level_0,MedianRooms,MedianAge,Households,MedianHouseholdIncome,MedianFamilyIncome,PerCapitaIncome,MedianNonfamilyIncome,LandAreakm2,Latitude,Areakm2,WaterAreakm2,CloudCover,HoursofSun,Precipitation,WindSpeed,Desempleo,Ti,pca1,pca2,MedianPrice
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
FL,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61
GA,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154
NC,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97
NJ,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
NY,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56
SC,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45,45
VA,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88,88


In [23]:
cov_data_new_values.to_csv('../data/process_data/cov_new_values_log.csv', index = False)