In [12]:
from google.colab import drive
import pandas as pd
import os
import glob

# Mount Google Drive at /content/drive. BASE_DIR in the configuration cell is a
# path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# ==============================================
# 1. Configuration & Path Setup
# ==============================================
BASE_DIR = "/content/drive/MyDrive/data"  # Data folder path (one sub-folder per city)

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of the saved CSV files) reproducible.
CITIES = sorted(
    f for f in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, f))
    and not f.startswith('.')  # Exclude hidden folders
)

# Maps the period token taken from a calendar file name (the text after the
# last "_" and before the first ".") to the inclusive (start, end) date window
# that process_calendar keeps from that file.
DATE_RANGES = {
    "2023-12": ("2024-01-01", "2024-03-31"),
    "2024-03": ("2024-04-01", "2024-06-30"),
    "2024-06": ("2024-07-01", "2024-09-30"),
    "2024-09": ("2024-10-01", "2024-12-31")
}

In [17]:
# ==============================================
# 2. Calendar Data Processing
# ==============================================
def process_calendar(city_folder: str, date_ranges: "dict | None" = None) -> pd.DataFrame:
    """Load a city's calendar files and keep listings present in every period.

    Every ``calendar*.csv`` file in ``city_folder`` is expected to carry a
    period token in its name (the text after the last ``_`` and before the
    first ``.``). There must be exactly one file per key of ``date_ranges``.

    Args:
        city_folder: Directory holding the city's ``calendar*.csv`` files.
        date_ranges: Mapping ``period -> (start, end)`` of inclusive date
            windows. Defaults to the module-level ``DATE_RANGES``.

    Returns:
        The concatenated ``listing_id`` / ``date`` / ``available`` rows of all
        periods, restricted to listings that appear in every file and to each
        file's date window. An empty DataFrame is returned (after printing a
        diagnostic) when the file set is invalid, a file cannot be read, or no
        listing is common to all periods.
    """
    if date_ranges is None:
        date_ranges = DATE_RANGES
    expected_count = len(date_ranges)

    calendar_files = sorted(glob.glob(os.path.join(city_folder, "calendar*.csv")))

    # 1. Validate that exactly one file per period exists
    if len(calendar_files) != expected_count:
        print(f"⚠️ Invalid file count in {city_folder}: {len(calendar_files)}/{expected_count} files found")
        return pd.DataFrame()

    # Parse the period from the file name only, so underscores or dots in the
    # directory part of the path cannot leak into the token.
    periods = [os.path.basename(f).split("_")[-1].split(".")[0] for f in calendar_files]
    if sorted(periods) != sorted(date_ranges):
        # Unknown or duplicated periods would otherwise surface as a KeyError
        # (aborting every remaining city) or as double-counted rows.
        print(f"⚠️ Unexpected calendar periods in {city_folder}: "
              f"found {periods}, expected {sorted(date_ranges)}")
        return pd.DataFrame()

    # 2. Two-pass processing for strict validation
    # First pass: collect the listing IDs of each period (one column only)
    period_ids = []
    for file in calendar_files:
        try:
            df = pd.read_csv(file, usecols=['listing_id'])
            period_ids.append(set(df['listing_id'].unique()))
        except Exception as e:
            print(f"Error reading {os.path.basename(file)}: {str(e)}")
            return pd.DataFrame()

    # Find IDs common to ALL periods
    common_ids = set.intersection(*period_ids)
    if not common_ids:
        print(f"❌ No listings common to all {expected_count} periods in {city_folder}")
        return pd.DataFrame()

    # Second pass: load the data for the validated IDs
    final_dfs = []
    for file, period in zip(calendar_files, periods):
        try:
            df = pd.read_csv(file, usecols=['listing_id', 'date', 'available'])
            df = df[df['listing_id'].isin(common_ids)]
            # assign() builds a new frame instead of writing into the filtered one
            df = df.assign(date=pd.to_datetime(df['date']))
        except Exception as e:
            # Same policy as the first pass: report and skip the city
            print(f"Error reading {os.path.basename(file)}: {str(e)}")
            return pd.DataFrame()

        # Keep only the rows inside this period's inclusive date window
        start, end = date_ranges[period]
        in_window = df['date'].between(pd.Timestamp(start), pd.Timestamp(end))
        final_dfs.append(df[in_window])

    return pd.concat(final_dfs, ignore_index=True)

In [18]:
# ==============================================
# 3. Listing Data Processing
# ==============================================
def process_listings(city_folder: str, valid_ids: list) -> pd.DataFrame:
    """Read a city's ``listings.csv`` and keep the rows whose ``id`` is valid.

    Only a fixed subset of columns is loaded from the file.

    Args:
        city_folder: Directory that contains ``listings.csv``.
        valid_ids: Listing IDs to keep (matched against the ``id`` column).

    Returns:
        The matching rows with the selected columns. If reading or filtering
        fails for any reason, the error is printed and an empty DataFrame is
        returned instead.
    """
    # Columns loaded from listings.csv; everything else in the file is skipped.
    wanted_columns = [
        'id',
        'neighbourhood_cleansed', 'latitude', 'longitude',
        'property_type', 'room_type',
        'accommodates', 'bedrooms', 'beds',
        'price', 'review_scores_rating', 'instant_bookable',
        'amenities', 'description',
        'host_response_time', 'host_is_superhost',
        'bathrooms', 'bathrooms_text',
    ]
    csv_path = os.path.join(city_folder, "listings.csv")

    try:
        listings = pd.read_csv(csv_path, usecols=wanted_columns)
        keep_row = listings['id'].isin(valid_ids)
        result = listings[keep_row]
    except Exception as e:
        print(f"Error processing listings: {str(e)}")
        result = pd.DataFrame()

    return result


In [24]:

# ==============================================
# 4. Main Execution Flow
# ==============================================
def main():
    """Build and save the combined listings and calendar datasets.

    For every folder name in ``CITIES`` the calendar data is loaded with
    ``process_calendar`` and the listings with ``process_listings``. Cities
    for which either step yields no rows are skipped. The per-city frames get
    a ``city`` column and are concatenated into ``all_listings.csv`` and
    ``all_calendars.csv`` in the current working directory.
    """
    all_listings = []
    all_calendars = []

    for city in CITIES:
        city_path = os.path.join(BASE_DIR, city)
        print(f"\nProcessing: {city.upper()}")

        # Process calendar data
        calendar_data = process_calendar(city_path)
        if calendar_data.empty:
            print(f"Skipped {city} - no valid calendar data")
            continue

        # Process listings
        valid_ids = calendar_data['listing_id'].unique()
        listing_data = process_listings(city_path, valid_ids)
        if listing_data.empty:
            print(f"Skipped {city} - no valid listings")
            continue

        # A listing can appear in the calendar files but be missing from
        # listings.csv. Drop its calendar rows so both outputs describe the
        # same set of listing IDs.
        calendar_data = calendar_data[calendar_data['listing_id'].isin(listing_data['id'])]

        # Add city identifiers on new frames rather than mutating the
        # objects returned by the helpers, then store the results.
        all_calendars.append(calendar_data.assign(city=city))
        all_listings.append(listing_data.assign(city=city))
        print(f"Added {len(listing_data)} listings")

    # Both lists are always appended together, so one check covers both.
    if not all_listings:
        print("\nNo city produced any data - nothing was saved")
        return

    # Combine and save all data
    final_listings = pd.concat(all_listings, ignore_index=True)
    final_listings.to_csv("all_listings.csv", index=False)
    print("\nSaved listings data: all_listings.csv")

    final_calendars = pd.concat(all_calendars, ignore_index=True)
    final_calendars.to_csv("all_calendars.csv", index=False)
    print("Saved calendar data: all_calendars.csv")

In [25]:

# ==============================================
# 5. Run the Pipeline
# ==============================================
# Entry-point guard: main() is called only when this code runs as the
# top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Processing: QUEBEC_CITY
Added 1619 listings

Processing: VICTORIA
Added 3036 listings

Processing: MONTREAL
Added 5358 listings

Processing: TORONTO
Added 14609 listings

Processing: OTTAWA
Added 1900 listings

Processing: VANCOUVER
Added 4164 listings

Processing: WINNIPEG
Added 1079 listings

Processing: NEW_BRUNSWICK
Added 2969 listings

Saved listings data: all_listings.csv
Saved calendar data: all_calendars.csv


In [11]:
# ==============================================
# 6. Sanity check: all_listings.csv and all_calendars.csv must cover the same listing IDs
# ==============================================
# Unique listing_id values in all_calendars.csv (only that column is loaded)
calendar_ids = set(
    pd.read_csv("all_calendars.csv", usecols=["listing_id"])["listing_id"].dropna().unique()
)
print(len(calendar_ids))

# Unique id values in all_listings.csv
df = pd.read_csv("all_listings.csv")
listing_ids = set(df["id"].dropna().unique())
print(len(listing_ids))

# Equal counts alone could hide two different ID sets, so compare the sets
# themselves and fail loudly instead of relying on reading the printed numbers.
assert calendar_ids == listing_ids, (
    f"ID mismatch: {len(calendar_ids - listing_ids)} calendar-only and "
    f"{len(listing_ids - calendar_ids)} listing-only IDs"
)




34734
34734
