In [6]:
import pandas as pd
import numpy as np

# Load the cleaned dataset used throughout the notebook
df = pd.read_csv("df_cleaned.csv")

# Enforce the basic dtypes used below
df["price"] = df["price"].astype(float)
df["accommodates"] = df["accommodates"].astype(int)

# Price per guest as a relative-value indicator.
# NOTE(review): a row with accommodates == 0 would produce inf (or NaN for 0/0)
# here — confirm the cleaned data cannot contain such rows.
df["price_per_guest"] = df["price"] / df["accommodates"]


def _label_from_dummies(frame, dummy_cols):
    """Collapse one-hot columns named "<prefix>:<label>" into a single label Series.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame that contains every column listed in ``dummy_cols``.
    dummy_cols : list of str
        One-hot column names; the text after the first ":" is used as the label.

    Returns
    -------
    pd.Series
        The label of the flagged column for each row, or NaN when no column in
        ``dummy_cols`` is positive for that row.
    """
    dummies = frame[dummy_cols]
    # idxmax returns the first column on ties, so a row with every dummy at 0
    # would silently be labelled with the first entry of dummy_cols.
    labels = dummies.idxmax(axis=1).str.split(":", n=1).str[1]
    has_flag = dummies.gt(0).any(axis=1)
    return labels.where(has_flag)


# Rebuild the borough from the neighbourhood_group_cleansed dummies
borough_cols = [
    "neighbourhood_group_cleansed:Bronx",
    "neighbourhood_group_cleansed:Brooklyn",
    "neighbourhood_group_cleansed:Manhattan",
    "neighbourhood_group_cleansed:Queens",
    "neighbourhood_group_cleansed:Staten Island"
]
df["borough_seg"] = _label_from_dummies(df, borough_cols)

# Rebuild the room type from the room_type dummies
room_cols = [
    "room_type:Entire home/apt",
    "room_type:Hotel room",
    "room_type:Private room",
    "room_type:Shared room"
]
df["room_type_seg"] = _label_from_dummies(df, room_cols)

# Quick sanity check of the new columns
print(df[["price", "accommodates", "price_per_guest", "borough_seg", "room_type_seg"]].head())

# Persist this intermediate dataset for model 3
df.to_csv("df_modelo3_segmentos_base.csv", index=False)


   price  accommodates  price_per_guest borough_seg    room_type_seg
0   66.0             1        66.000000      Queens     Private room
1   76.0             1        76.000000   Manhattan     Private room
2   97.0             6        16.166667      Queens  Entire home/apt
3   60.0             1        60.000000    Brooklyn     Private room
4  425.0             6        70.833333    Brooklyn  Entire home/apt


In [7]:
# Load the intermediate dataset ("df_modelo3_segmentos_base.csv") with the
# borough_seg / room_type_seg / price_per_guest columns
df = pd.read_csv("df_modelo3_segmentos_base.csv")

# A comparable segment = same borough + same room type
segment_cols = ["borough_seg", "room_type_seg"]

# Per-segment reference statistics:
# - 25th and 75th percentile of price_per_guest (reasonable price-per-guest band)
# - median of amenities_count (reasonable minimum number of amenities)
# Series.quantile skips NaN just like "median" does; np.percentile would return
# NaN for a whole segment as soon as one price_per_guest value is missing,
# which would label every listing of that segment as not recommended.
segment_stats = (
    df.groupby(segment_cols)
      .agg(
          p25_ppg=("price_per_guest", lambda ppg: ppg.quantile(0.25)),
          p75_ppg=("price_per_guest", lambda ppg: ppg.quantile(0.75)),
          med_amenities=("amenities_count", "median"),
      )
      .reset_index()
)

# Attach the segment statistics to every listing
df = df.merge(segment_stats, on=segment_cols, how="left")

# Recommendation rule:
# - price_per_guest within [p25, p75] of its segment (bounds inclusive)
# - amenities_count >= segment median
# Comparisons against NaN are False, so listings with missing values or
# without a segment end up with recommended = 0.
cond_precio = (df["price_per_guest"] >= df["p25_ppg"]) & (df["price_per_guest"] <= df["p75_ppg"])
cond_amenities = df["amenities_count"] >= df["med_amenities"]

df["recommended"] = (cond_precio & cond_amenities).astype(int)

# NOTE(review): the label is a deterministic function of price_per_guest,
# amenities_count and the segment columns — confirm how these columns are
# meant to be used as features in the downstream classifier.

# Check that the label is not extremely imbalanced
print("Distribución de la etiqueta recommended:")
print(df["recommended"].value_counts(dropna=False))
print(df["recommended"].value_counts(normalize=True).round(3))

# Persist the final dataset for the classification model
df.to_csv("df_modelo3_clasificacion.csv", index=False)

Distribución de la etiqueta recommended:
recommended
0    15283
1     5544
Name: count, dtype: int64
recommended
0    0.734
1    0.266
Name: proportion, dtype: float64
