In [None]:
# ============================================================
# 1. Imports
# ============================================================
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report

# ============================================================
# 2. Data loading
# ============================================================
# Both inputs are gzip-compressed CSV files addressed by relative path,
# so they must be present in the current working directory.
calendar, listings = (
    pd.read_csv(csv_path, compression="gzip")
    for csv_path in ("calendar.csv.gz", "listings.csv.gz")
)

# ============================================================
# 3. LISTINGS cleanup
# ============================================================
# Keep only the listings that have a price, then parse it as a float.
if 'price' in listings.columns:
    listings = listings.dropna(subset=["price"])

    # Strip the currency symbol and thousands separators before the
    # float conversion; the literal string 'nan' becomes a real NaN.
    price_as_text = listings["price"].astype(str)
    price_digits = price_as_text.str.replace(r'[\$,]', '', regex=True)
    listings["price"] = price_digits.replace('nan', np.nan).astype(float)

# Columns considered irrelevant and removed from listings.
cols_to_drop = [
    'host_name', 'listing_url', 'picture_url',
    'host_url', 'host_thumbnail_url', 'host_picture_url',
    'scrape_id', 'host_id', 'last_scraped',
    'source', 'host_neighbourhood', 'neighbourhood',
    'bathrooms_text', 'calendar_updated', 'calendar_last_scraped',
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
    'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'host_location',
]

# Only the columns actually present are passed to drop().
present_drop_cols = [name for name in cols_to_drop if name in listings.columns]
listings.drop(columns=present_drop_cols, inplace=True, errors="ignore")

# ============================================================
# 4. TF-IDF over the listings' free-text columns
#    (description + neighborhood_overview + host_about)
# ============================================================
text_cols = ['description', 'neighborhood_overview', 'host_about']
text_cols_existing = [col for col in text_cols if col in listings.columns]

# One document per listing: missing texts become '' and the available
# text columns are joined with single spaces.
text_frame = listings[text_cols_existing].fillna('')
combined_text = text_frame.agg(' '.join, axis=1)

# The vocabulary is capped at 100 terms.
tfidf = TfidfVectorizer(max_features=100)
X_tfidf = tfidf.fit_transform(combined_text)

# The raw text columns are no longer needed once vectorized.
listings.drop(columns=text_cols_existing, inplace=True, errors="ignore")

# Expose the TF-IDF matrix as dense numeric columns tfidf_0, tfidf_1, ...
# It reuses the listings index so the concat below aligns row by row.
n_terms = X_tfidf.shape[1]
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=[f"tfidf_{term_idx}" for term_idx in range(n_terms)],
    index=listings.index,
)

listings_merged = pd.concat([listings, tfidf_df], axis=1)

# ============================================================
# 5. CALENDAR cleanup and creation of the "occupied" target
# ============================================================
# available: 't' (free), 'f' (occupied). Any value other than 'f'
# yields occupied == 0.
is_unavailable = calendar['available'] == 'f'
calendar['occupied'] = is_unavailable.astype(int)

# Parse the date strings; values that do not match YYYY-MM-DD become NaT.
calendar['date'] = pd.to_datetime(
    calendar['date'], errors='coerce', format='%Y-%m-%d'
)

# (Optional) price cleanup in calendar, when the column exists.
if 'price' in calendar.columns:
    raw_price = calendar["price"].astype(str)
    stripped_price = raw_price.str.replace(r'[\$,]', '', regex=True)
    calendar["price"] = stripped_price.replace('nan', np.nan).astype(float)

# ============================================================
# 6. JOIN calendar + listings (key: listing_id <-> id)
# ============================================================
# Left join: every calendar row is kept, and listing features are NaN
# where no listing matches. The suffixes disambiguate column names
# present on both sides.
df = pd.merge(
    calendar,
    listings_merged,
    left_on='listing_id',
    right_on='id',
    how='left',
    suffixes=('_cal', '_list'),
)

# 'id' duplicates 'listing_id' after the join; drop it when present.
df.drop(columns=['id'], inplace=True, errors='ignore')

# ============================================================
# 7. One-hot encoding of neighbourhood_group_cleansed
#    and removal of neighbourhood_cleansed
# ============================================================
if 'neighbourhood_group_cleansed' in df.columns:
    # Missing groups get their own explicit 'Unknown' category.
    df['neighbourhood_group_cleansed'] = df['neighbourhood_group_cleansed'].fillna('Unknown')
    df = pd.get_dummies(
        df,
        columns=['neighbourhood_group_cleansed'],
        prefix='neigh_group',
        drop_first=False,
        # pandas >= 2.0 emits bool dummies by default, and bool columns are
        # not matched by the select_dtypes(include=[np.number]) filter used
        # when building X. Forcing an integer dtype keeps the indicator
        # columns in the feature matrix on every pandas version.
        dtype=int,
    )

# Drop neighbourhood_cleansed when it exists.
if 'neighbourhood_cleansed' in df.columns:
    df.drop(columns=['neighbourhood_cleansed'], inplace=True)

# ============================================================
# 8. Date features (dayofweek, month, is_weekend)
# ============================================================
# pop() removes the raw date column from df and returns it, so the
# model never sees the datetime itself, only the derived features.
date_values = df.pop('date')

df['dayofweek'] = date_values.dt.dayofweek
df['month'] = date_values.dt.month
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# ============================================================
# 9. Build X, y for the model
# ============================================================
# Target
y = df['occupied']

# Columns that must NOT be used as features: the target itself, the raw
# flag it was derived from, and the listing identifier.
cols_to_exclude = [
    'occupied',
    'available',
    'listing_id'
]

# Target leakage guard: listing-level availability_* summaries
# (availability_30/60/90/365 in the Inside Airbnb schema) are counts
# computed from the same calendar that defines `occupied`, so keeping
# them lets the model read the answer off the features.
# NOTE(review): assumes the Inside Airbnb column naming; confirm against
# the actual listings file.
leaky_cols = [c for c in df.columns if str(c).startswith('availability_')]

excluded = set(cols_to_exclude) | set(leaky_cols)
X = df.drop(columns=[c for c in df.columns if c in excluded])

# Keep numeric columns only; anything select_dtypes does not classify as
# np.number (for example raw strings) is discarded here.
X = X.select_dtypes(include=[np.number])

print("Dimensiones finales de X, y:", X.shape, y.shape)

# ============================================================
# 10. Train/test split
# ============================================================
# Hold out 20% of the rows with a fixed seed; stratifying on y is
# recommended when the classes are imbalanced.
splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

# ============================================================
# 11. Pipeline: imputer + DecisionTreeClassifier
# ============================================================
# Missing feature values are replaced by the column median before the
# tree sees them.
median_imputer = SimpleImputer(strategy='median')

# max_depth=None leaves the tree depth unbounded; it can be tuned
# (e.g. 5, 10, ...).
decision_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

tree_clf = Pipeline(steps=[
    ('imputer', median_imputer),
    ('tree', decision_tree),
])

tree_clf.fit(X_train, y_train)

# ============================================================
# 12. Model evaluation
# ============================================================
y_pred = tree_clf.predict(X_test)

# Confusion matrix and per-class report on the held-out rows.
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification report:")
print(classification_report(y_test, y_pred))

# Accuracy on both partitions, train first, to expose any gap between them.
for label, features, target in (
    ("Train accuracy:", X_train, y_train),
    ("Test accuracy:", X_test, y_test),
):
    print(label, tree_clf.score(features, target))


FileNotFoundError: [Errno 2] No such file or directory: 'calendar.csv.gz'