In [1]:
# Connect to our dataset: mount Google Drive inside the Colab runtime at /content/drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Notebook-level colormaps: matplotlib's red/blue map and a two-colour (red, blue) discrete map.
# NOTE(review): neither is referenced in the cells visible here — confirm they are still needed.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

In [7]:
# Load testAttrX.csv from the mounted Drive (';' as field separator, '.' as decimal mark)
# and display its (rows, columns) shape.
df_test = pd.read_csv('./drive/MyDrive/deep-learning-rogelio/testAttrX.csv', sep=';', decimal='.')
df_test.shape

(250, 22)

In [8]:

# Collect in `outliers` the rows of df_test whose 'Monthly Price' is above 7000
outliers = df_test[df_test['Monthly Price'] > 7000]
print(outliers.index.to_list())
# Add the rows flagged through 'Cleaning Fee' (> 299) and 'Extra People' (> 99).
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original index labels are kept and a row
# flagged through several columns appears once per column that flagged it.
outliers = pd.concat([
    outliers,
    df_test[df_test['Cleaning Fee'] > 299],
    df_test[df_test['Extra People'] > 99],
])
outliers

[]


Unnamed: 0,Host Listings Count,Host Total Listings Count,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,...,Extra People,Availability 365,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Location,Calculated host listings count,Host Since in Days,Entire home/apt,Private room


In [9]:
# Alternative left disabled: drop the outlier rows altogether (there are not many of them)
# df_test = df_test.drop(outliers.index.to_list())

# Instead, overwrite 'Extra People' with the column mean on every row collected in `outliers`.
# NOTE(review): `outliers` also contains rows flagged through 'Monthly Price' and 'Cleaning Fee',
# yet only 'Extra People' is replaced, and the mean is computed with the flagged rows still
# included — confirm this is the intended treatment.
extra_people_mean = df_test['Extra People'].mean()
outlier_labels = outliers.index.to_list()
df_test.loc[outlier_labels, 'Extra People'] = extra_people_mean

In [10]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
# Process the columns from fewest to most missing values, so the data is recovered step by step.
# (Earlier idea, left disabled: order them by their correlation with 'Square Feet'.)
# lista_columnas = corr['Square Feet'].sort_values(ascending=False).index.tolist()
lista_columnas = df_test.isnull().sum().sort_values(ascending=True).index.tolist()
# Absolute pairwise correlations, used to pick the helper columns for each imputation
corr = np.abs(df_test.corr())
# Work on a copy so that df_test itself is not modified by the imputation
df_for_imputation = df_test.copy()
for columna in lista_columnas:
    # Columns whose absolute correlation with `columna` is above .29, strongest first.
    # `columna` itself is deliberately not filtered out: it is the column being filled in
    # and is read back from the imputed frame at the end of the iteration.
    correlaciones = corr[columna].sort_values(ascending=False)
    columnas_relacionadas = correlaciones[correlaciones > .29].index.tolist()
    print(f'Evaluando la columna {columna} con las variables relacionadas: {columnas_relacionadas}')
    # Sub-frame with the target column and its helper columns
    df_knn = df_for_imputation.filter(columnas_relacionadas, axis=1).copy()
    # Scale every column to [0, 1] before computing distances.
    # The scaler and the imputer return bare numpy arrays, so the row labels are re-attached
    # explicitly (index=...) each time a DataFrame is rebuilt. Without this the frames get a
    # fresh 0..n-1 index, and the final assignment, which aligns on index labels, would write
    # NaN / misplaced values whenever df_test does not have a default RangeIndex
    # (for example after dropping rows).
    df_knn = pd.DataFrame(
        scaler.fit_transform(df_knn), columns=df_knn.columns, index=df_knn.index
    )
    # KNN imputation: fill each missing value from the 5 nearest rows (NaN-aware euclidean distance)
    knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    df_knn_imputed = pd.DataFrame(
        knn_imputer.fit_transform(df_knn), columns=df_knn.columns, index=df_knn.index
    )
    # Undo the scaling so the values are back in their original units
    df_knn_imputed = pd.DataFrame(
        scaler.inverse_transform(df_knn_imputed),
        columns=df_knn_imputed.columns,
        index=df_knn_imputed.index,
    )
    # Store the completed column; later iterations use it as a helper for other columns
    df_for_imputation[columna] = df_knn_imputed[columna]

Evaluando la columna Host Listings Count con las variables relacionadas: ['Host Listings Count', 'Host Total Listings Count', 'Calculated host listings count', 'Cleaning Fee', 'Monthly Price', 'Bathrooms']
Evaluando la columna Host Since in Days con las variables relacionadas: ['Host Since in Days']
Evaluando la columna Calculated host listings count con las variables relacionadas: ['Calculated host listings count', 'Host Listings Count', 'Host Total Listings Count', 'Cleaning Fee', 'Monthly Price']
Evaluando la columna Availability 365 con las variables relacionadas: ['Availability 365']
Evaluando la columna Extra People con las variables relacionadas: ['Extra People', 'Guests Included', 'Accommodates']
Evaluando la columna Guests Included con las variables relacionadas: ['Guests Included', 'Accommodates', 'Beds', 'Bedrooms', 'Extra People', 'Price', 'Entire home/apt', 'Private room', 'Weekly Price']
Evaluando la columna Entire home/apt con las variables relacionadas: ['Entire home/ap

In [11]:
import pandas as pd

def detect_rows_with_all_zero_values(df):
    """Return the positions of the rows in which every value equals zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    numpy.ndarray
        Integer positions (0-based row numbers, not index labels) of the
        rows whose values are all equal to 0; empty when there are none.
    """
    # One boolean per row: True only when all cells of that row compare equal to 0
    all_zero_mask = df.eq(0).all(axis=1)
    # Positions of the True entries
    return np.flatnonzero(all_zero_mask.to_numpy())

# Row positions of the imputed frame whose values are all zero (displayed as the cell output)
indices = detect_rows_with_all_zero_values(df_for_imputation)
indices



array([], dtype=int64)

In [12]:
def detect_rows_with_all_missing_values(df):
    """Return the positions of the rows in which every value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    numpy.ndarray
        Integer positions (0-based row numbers, not index labels) of the
        rows that contain only missing values; empty when there are none.
    """
    # One boolean per row: True only when all cells of that row are null
    fully_missing_mask = df.isna().all(axis=1)
    # Positions of the True entries
    return np.nonzero(fully_missing_mask.to_numpy())[0]

# Row positions of the imputed frame that contain only missing values (displayed as the cell output)
indices = detect_rows_with_all_missing_values(df_for_imputation)
indices

array([], dtype=int64)

In [13]:
# Show the rows that still have at least one missing value after the imputation
df_for_imputation[df_for_imputation.isnull().any(axis=1)]

Unnamed: 0,Host Listings Count,Host Total Listings Count,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,...,Extra People,Availability 365,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Location,Calculated host listings count,Host Since in Days,Entire home/apt,Private room


In [14]:
# Display the (rows, columns) shape of the imputed frame
df_for_imputation.shape

(250, 22)

In [15]:
# Compared with the previous version some correlation is lost, but all the data is now complete
#train_data = df_for_imputation.dropna()
# Percentage of missing values per column
df_for_imputation.isna().mean()*100

Host Listings Count               0.0
Host Total Listings Count         0.0
Accommodates                      0.0
Bathrooms                         0.0
Bedrooms                          0.0
Beds                              0.0
Price                             0.0
Weekly Price                      0.0
Monthly Price                     0.0
Security Deposit                  0.0
Cleaning Fee                      0.0
Guests Included                   0.0
Extra People                      0.0
Availability 365                  0.0
Review Scores Rating              0.0
Review Scores Accuracy            0.0
Review Scores Cleanliness         0.0
Review Scores Location            0.0
Calculated host listings count    0.0
Host Since in Days                0.0
Entire home/apt                   0.0
Private room                      0.0
dtype: float64

In [16]:
# Save the imputed frame as ./test_data.csv (';' separator, '.' decimal mark, index not written),
# then copy that file to our Drive folder with a shell command.
df_for_imputation.to_csv('./test_data.csv', sep=';', decimal='.', index=False)
!cp test_data.csv /content/drive/My\ Drive/deep-learning-rogelio/test_data.csv