<a href="https://colab.research.google.com/github/qorah/vic-edu-housing-insights/blob/develop/Rental_gis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title Cell 1: Import Dependencies and Configure Logging
import urllib.request
import json
import pandas as pd
import re
from datetime import datetime
import logging
import sys

# For Colab file upload
from google.colab import files
import io

# Root logger setup: INFO and above, each line prefixed with a timestamp
# and the severity level so pipeline steps are easy to trace.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [4]:
#@title Cell 2: Define Functions for the Data Pipeline

def fetch_school_data(api_url: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Fetches school data from the specified API URL and returns a DataFrame.

    The response body is parsed as JSON and the list found under
    ``result -> records`` becomes the rows of the DataFrame. A missing or
    null ``result`` / ``records`` entry yields an empty DataFrame.

    Parameters:
        api_url (str): URL for the school API.
        timeout (float): Seconds to wait on the connection before giving up
            (default is 30). Without a limit, a stalled server would block
            the notebook cell indefinitely.

    Returns:
        pd.DataFrame: DataFrame containing school records.

    Raises:
        SystemExit: With exit code 1 if the request or the JSON parsing
            fails (the error is logged first).
    """
    try:
        with urllib.request.urlopen(api_url, timeout=timeout) as response:
            data = response.read()
        school_json = json.loads(data)
        # Extract records; adjust this based on your API's structure.
        # `or` is used because .get() defaults do not cover explicit nulls.
        result = school_json.get('result') or {}
        records = result.get('records') or []
        school_df = pd.DataFrame(records)
        if school_df.empty:
            logging.warning("School API response contained no records.")
        logging.info(f"Fetched {len(school_df)} school records from the API.")
        return school_df
    except Exception as e:
        logging.error(f"Error fetching school data: {e}")
        sys.exit(1)

# --- Rent data loading ---

def read_data(file_path: str, sheet_name: str, header: int = 1) -> pd.DataFrame:
    """
    Loads a tabular data file into a DataFrame, picking the pandas reader
    from the file extension (matched case-insensitively).

    Parameters:
        file_path (str): Path to the data file (.xls, .xlsx or .csv).
        sheet_name (str): Sheet to read; only used for Excel files.
        header (int): Row number to use as the header (default is 1).

    Returns:
        pd.DataFrame: The loaded DataFrame.

    Raises:
        SystemExit: With exit code 1 if the extension is unsupported or the
            file cannot be read (the error is logged first).
    """
    try:
        lowered_path = file_path.lower()
        is_excel = lowered_path.endswith(('.xls', '.xlsx'))
        is_csv = lowered_path.endswith('.csv')

        # Reject anything that is neither Excel nor CSV before reading.
        if not (is_excel or is_csv):
            raise ValueError(f"Unsupported file format: {file_path}")

        if is_excel:
            loaded = pd.read_excel(file_path, sheet_name=sheet_name, header=header)
        else:
            loaded = pd.read_csv(file_path, header=header)

        logging.info(f"Data successfully read from {file_path}")
        logging.info(f"Initial data shape: {loaded.shape}")
        return loaded
    except Exception as read_error:
        logging.error(f"Error reading data from {file_path}: {read_error}")
        sys.exit(1)

# --- Rent data transformation, cleaning, validation and export ---

def extract_date_columns(df: pd.DataFrame) -> list:
    """
    Collects the columns whose label contains a quarter-end month
    abbreviation ('Mar', 'Jun', 'Sep' or 'Dec').

    The check is a case-sensitive substring test on the stringified label,
    and the original column order is preserved.

    Parameters:
        df (pd.DataFrame): The DataFrame to search for date-based columns.

    Returns:
        list: A list of column names containing date information.
    """
    quarter_markers = ('Mar', 'Jun', 'Sep', 'Dec')
    matching_columns = [
        column
        for column in df.columns
        if any(marker in str(column) for marker in quarter_markers)
    ]
    logging.info(f"Found {len(matching_columns)} date columns.")
    return matching_columns

def _group_date_columns(date_cols: list) -> dict:
    """Maps each base date (e.g. "Mar 2024") to its count and median columns."""
    date_groups = {}
    for date_col in date_cols:
        date_str = str(date_col)
        match = re.search(r'(Mar|Jun|Sep|Dec)\s+(\d{4})', date_str)
        if not match:
            continue
        month, year = match.groups()
        group = date_groups.setdefault(
            f"{month} {year}", {'count_col': None, 'median_col': None}
        )
        # A label ending in "<digits>.<digits>" (e.g. "Mar 2024.1") is treated
        # as the median column; presumably this is the suffix pandas adds to
        # a repeated header -- confirm against the source spreadsheet.
        if re.search(r'\d+\.\d+$', date_str):
            group['median_col'] = date_col
        else:
            group['count_col'] = date_col
    return date_groups


def transform_data(df: pd.DataFrame, date_cols: list) -> pd.DataFrame:
    """
    Transforms the raw DataFrame into a structured format with the following columns:
    'Area', 'Suburb', 'Date', 'Count', 'Median', and 'Transformation_Timestamp'.

    The first column of ``df`` is read as the area and the second as the
    suburb. A blank area inherits the last non-blank area above it; a blank
    suburb becomes "Unknown". One output row is produced per input row and
    base date, unless both values are missing or are the literal header
    labels "Count" / "Median".

    Parameters:
        df (pd.DataFrame): The raw input DataFrame.
        date_cols (list): List of columns that contain date-based values.

    Returns:
        pd.DataFrame: A transformed DataFrame in a long/analyst-friendly format.
            The six output columns are always present, even when no rows
            qualify, so downstream steps can rely on the schema.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    output_columns = ['Area', 'Suburb', 'Date', 'Count', 'Median',
                      'Transformation_Timestamp']

    # The grouping depends only on the column labels, so build it once
    # instead of re-running the regexes for every row.
    date_groups = _group_date_columns(date_cols)

    # One naive UTC timestamp for the whole run (same representation that
    # datetime.utcnow() produced, without the deprecated call).
    transformation_timestamp = datetime.now(timezone.utc).replace(tzinfo=None)

    result_data = []
    last_known_area = None  # Track the last valid area

    # Iterate over each row without skipping initial rows
    for position, (idx, row) in enumerate(df.iterrows()):
        # Update area only when a new one is encountered
        if pd.notna(row.iloc[0]):
            last_known_area = row.iloc[0]

        # Use the last known area instead of "Unknown"
        area = last_known_area if last_known_area is not None else "Unknown"
        suburb = row.iloc[1] if pd.notna(row.iloc[1]) else "Unknown"

        # For initial debugging (optional). Uses the row position rather than
        # the index label so non-numeric indexes do not raise.
        if position < 5:
            logging.debug(f"Row {idx}: Area = {area}, Suburb = {suburb}")

        # Process each date group and extract values
        for base_date, cols in date_groups.items():
            count_value = row[cols['count_col']] if cols['count_col'] is not None else None
            median_value = row[cols['median_col']] if cols['median_col'] is not None else None

            # Exclude rows where the value is simply the header label "Count" or "Median"
            has_count = pd.notna(count_value) and not (
                isinstance(count_value, str) and count_value.strip() == "Count")
            has_median = pd.notna(median_value) and not (
                isinstance(median_value, str) and median_value.strip() == "Median")
            if has_count or has_median:
                result_data.append({
                    'Area': area,
                    'Suburb': suburb,
                    'Date': base_date,  # Will be converted to datetime later
                    'Count': count_value,
                    'Median': median_value,
                    # Metadata column with the transformation timestamp
                    'Transformation_Timestamp': transformation_timestamp,
                })

    logging.info(f"Transformed data into {len(result_data)} rows.")
    return pd.DataFrame(result_data, columns=output_columns)

def clean_data(result_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the transformed DataFrame by converting the 'Date' column to datetime,
    formatting the 'Median' column as currency, and sorting the data.

    The input frame is left untouched: all changes are made on a copy, so
    re-running the step on the same transformed frame (common in a notebook)
    does not prefix medians with a second "$" or re-parse converted dates.

    Parameters:
        result_df (pd.DataFrame): The transformed DataFrame.

    Returns:
        pd.DataFrame: The cleaned DataFrame, sorted by 'Area', 'Suburb' and
            'Date'. Dates that do not match the "Mon YYYY" format become NaT;
            missing medians become None.

    Raises:
        SystemExit: With exit code 1 if any cleaning step fails (the error
            is logged first).
    """
    try:
        cleaned = result_df.copy()
        cleaned['Date'] = pd.to_datetime(cleaned['Date'], format='%b %Y', errors='coerce')
        cleaned['Median'] = cleaned['Median'].apply(lambda x: f"${x}" if pd.notna(x) else None)
        cleaned = cleaned.sort_values(['Area', 'Suburb', 'Date'])
        logging.info("Data cleaning completed successfully.")
        return cleaned
    except Exception as e:
        logging.error(f"Error during data cleaning: {e}")
        sys.exit(1)

def validate_data(result_df: pd.DataFrame) -> pd.DataFrame:
    """
    Runs data-quality checks on the cleaned frame.

    Currently the only check is for missing values in the 'Date' column,
    which is reported as a warning; the frame itself is not modified.

    Parameters:
        result_df (pd.DataFrame): The cleaned DataFrame.

    Returns:
        pd.DataFrame: The validated DataFrame (the same object passed in).
    """
    has_invalid_dates = result_df['Date'].isna().any()
    if has_invalid_dates:
        logging.warning("Some rows have invalid Date values after conversion.")

    logging.info("Data validation completed.")
    return result_df

def save_data(df: pd.DataFrame, output_path: str) -> None:
    """
    Writes the DataFrame to disk as CSV, without the index column.

    Parameters:
        df (pd.DataFrame): The DataFrame to save.
        output_path (str): The file path where the CSV should be saved.

    Raises:
        SystemExit: With exit code 1 if the file cannot be written (the
            error is logged first).
    """
    try:
        df.to_csv(output_path, index=False)
    except Exception as write_error:
        logging.error(f"Error saving data to {output_path}: {write_error}")
        sys.exit(1)
    else:
        logging.info(f"Data saved successfully to {output_path}. Total rows: {df.shape[0]}")

def merge_rent_and_school_data(rent_df: pd.DataFrame, school_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merges the cleaned rent data with school data on a common key.
    Adjust the join key based on the actual column names in the school data.

    The rent 'Suburb' column is left-joined against the school 'locality'
    column, or 'address' when 'locality' is absent. Every rent row is kept;
    rows without a matching school get missing values in the school columns.
    Values are compared exactly as stored (no case or whitespace
    normalisation).

    Parameters:
        rent_df (pd.DataFrame): Cleaned rent data (assumed to have a 'Suburb' column).
        school_df (pd.DataFrame): School data from the API.

    Returns:
        pd.DataFrame: Merged DataFrame.

    Raises:
        SystemExit: With exit code 1 if the school data has neither a
            'locality' nor an 'address' column (the error is logged first).
    """
    # Log school_df columns to help determine available keys. Labels are
    # stringified first because str.join() rejects non-string labels.
    logging.info("School DataFrame columns: " + ", ".join(map(str, school_df.columns)))

    # Determine which column to use as the join key for school data.
    # Change 'locality' to the appropriate column if different.
    if 'locality' in school_df.columns:
        join_key = 'locality'
    elif 'address' in school_df.columns:
        join_key = 'address'
    else:
        logging.error("No suitable join key found in school data. Please check the column names.")
        sys.exit(1)

    merged_df = pd.merge(rent_df, school_df, left_on='Suburb', right_on=join_key, how='left')
    logging.info(f"Merged data shape: {merged_df.shape}")
    return merged_df


In [7]:
#@title Cell 3: Run the Pipeline in Colab

# Step 1: Upload your rent data Excel file using the file uploader.
uploaded = files.upload()

# An empty result (e.g. the upload dialog was dismissed) would otherwise
# surface as a bare IndexError below; stop with a clear message instead.
if not uploaded:
    logging.error("No file was uploaded; cannot run the rent data pipeline.")
    sys.exit(1)

# Assuming the uploaded file is the rent data Excel file.
# If multiple files are uploaded, only the first one is processed.
if len(uploaded) > 1:
    logging.warning(f"{len(uploaded)} files uploaded; only the first one will be processed.")
uploaded_file_name = next(iter(uploaded))
logging.info(f"Uploaded file: {uploaded_file_name}")

# Step 2: Run the pipeline for rent data processing:
# read -> find date columns -> reshape to long format -> clean -> validate -> save.
sheet_name = 'All properties'
rent_df_raw = read_data(uploaded_file_name, sheet_name, header=1)
date_cols = extract_date_columns(rent_df_raw)
transformed_df = transform_data(rent_df_raw, date_cols)
cleaned_df = clean_data(transformed_df)
validated_df = validate_data(cleaned_df)
save_data(validated_df, 'cleaned_rent_data.csv')


Saving 2.Moving-annual-rent-by-suburb-September-quarter-2024.xlsx to 2.Moving-annual-rent-by-suburb-September-quarter-2024 (2).xlsx
