In [2]:
# 01_preprocessing.ipynb

import ast
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [8]:
# Identifier, URL, free-text and host-profile columns that are removed
# immediately after loading (irrelevant or high-cardinality).
columns_to_drop = [
    'id',
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'name',
    'description',
    'neighborhood_overview',
    'picture_url',
    'host_url',
    'host_name',
    'host_about',
    'host_thumbnail_url',
    'host_picture_url',
    'host_verifications',
    'license',
]

# Read the combined listings file and discard the columns above in one step.
# errors='ignore' skips any listed name that is absent from the CSV.
df = (
    pd.read_csv("../data/listings_combined.csv", low_memory=False)
    .drop(columns=columns_to_drop, errors='ignore')
)

In [10]:
# Handle price column: remove "$" and "," so the text can be cast to float
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Feature: amenities_count
# Each non-null cell holds a list literal stored as text. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval(),
# which would execute arbitrary code found in the CSV. Missing cells count as 0.
df['amenities_count'] = df['amenities'].apply(
    lambda x: len(ast.literal_eval(x)) if pd.notnull(x) else 0
)

# Feature: host_experience_years
# Unparseable dates become NaT (errors='coerce') and therefore NaN years.
df['host_since'] = pd.to_datetime(df['host_since'], errors='coerce')
# NOTE: this is a calendar-year difference against the year in which the cell
# is executed, so the values change when the notebook is re-run in a later year.
df['host_experience_years'] = datetime.now().year - df['host_since'].dt.year

# Select features of interest
selected_features = [
    'accommodates', 'bedrooms', 'beds', 'amenities_count', 'host_experience_years',
    'latitude', 'longitude', 'property_type', 'room_type', 'price', 'city'
]
# Explicit copy: later cells assign into this frame, and doing that on a plain
# column selection can raise SettingWithCopyWarning on some pandas versions.
df = df[selected_features].copy()

In [12]:
# Drop rows with nulls.
# Reassign instead of inplace=True, and take an explicit copy so the column
# assignments below write into an independent frame.
df = df.dropna().copy()

# Encode categorical variables as integer codes.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back to the original categories with inverse_transform.
label_cols = ['property_type', 'room_type', 'city']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [14]:
# Numeric columns to standardise. price and the label-encoded categorical
# columns are not in this list and are left unscaled.
features_to_scale = [
    'accommodates',
    'bedrooms',
    'beds',
    'amenities_count',
    'host_experience_years',
    'latitude',
    'longitude',
]

# Fit the scaler on the listed columns and write the transformed values back.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values

# Write the processed table to disk without the index column.
df.to_csv("../data/listings_cleaned.csv", index=False)
print("Preprocessing complete. Cleaned data saved.")

Preprocessing complete. Cleaned data saved.
