<a href="https://colab.research.google.com/github/qorah/vic-edu-housing-insights/blob/develop/Rental_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Upload the rental-data Excel workbook from your local machine.
from google.colab import files
uploaded = files.upload()

# `uploaded` is keyed by file name. Echo every received file with its size so
# the name can be checked before the pipeline cell runs; that cell picks up the
# first uploaded file name on its own.
for filename, content in uploaded.items():
    print(f'User uploaded file "{filename}" with length {len(content)} bytes')


Saving Moving-annual-rent-by-suburb-September-quarter-2024.xlsx to Moving-annual-rent-by-suburb-September-quarter-2024.xlsx
User uploaded file "Moving-annual-rent-by-suburb-September-quarter-2024.xlsx" with length 1113826 bytes


In [None]:
# Cell 2: Define the modular data processing pipeline.
import pandas as pd
import re
from datetime import datetime
import logging
import sys

# Root-logger setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

def read_data(file_path: str, sheet_name: str, header: int = 1) -> pd.DataFrame:
    """
    Load one worksheet of an Excel workbook into a DataFrame.

    Parameters:
        file_path (str): Location of the Excel workbook.
        sheet_name (str): Worksheet to load.
        header (int): Zero-based row used for the column labels (default is 1).

    Returns:
        pd.DataFrame: The worksheet contents.

    Raises:
        SystemExit: With exit code 1 when the workbook cannot be read; the
            underlying error is logged first.
    """
    try:
        frame = pd.read_excel(file_path, sheet_name=sheet_name, header=header)
    except Exception as e:
        # Any read failure is fatal for the pipeline: log it and stop.
        logging.error(f"Error reading data from {file_path}: {e}")
        sys.exit(1)

    logging.info(f"Data successfully read from {file_path}")
    logging.info(f"Initial data shape: {frame.shape}")
    return frame

def extract_date_columns(df: pd.DataFrame) -> list:
    """
    Collect the columns whose label mentions a quarter-end month abbreviation.

    A column qualifies when the string form of its label contains 'Mar',
    'Jun', 'Sep' or 'Dec' anywhere. Labels are returned unchanged and in
    their original column order.

    Parameters:
        df (pd.DataFrame): The DataFrame whose column labels are inspected.

    Returns:
        list: Labels of the matching columns.
    """
    quarter_months = ('Mar', 'Jun', 'Sep', 'Dec')
    matched = [
        label
        for label in df.columns
        if any(abbr in str(label) for abbr in quarter_months)
    ]
    logging.info(f"Found {len(matched)} date columns.")
    return matched

def transform_data(df: pd.DataFrame, date_cols: list) -> pd.DataFrame:
    """
    Transforms the raw DataFrame into a structured long format with the columns
    'Area', 'Suburb', 'Date', 'Count', 'Median', and 'Transformation_Timestamp'.

    Each date column label is matched against "<Mar|Jun|Sep|Dec> <yyyy>". Columns
    sharing the same month/year are paired: a label ending in "<digits>.<digits>"
    (e.g. "Mar 2000.1") is treated as the median column, any other as the count
    column. One output record is produced per data row and month/year pair, as
    long as at least one of the two values is present and is not just the
    header label "Count" / "Median".

    Rows whose index label is below 2 are skipped as non-data header rows, so
    the frame is expected to carry integer index labels (such as the default
    index).

    Parameters:
        df (pd.DataFrame): The raw input DataFrame. Column 0 is read as the
            area and column 1 as the suburb; missing values become "Unknown".
        date_cols (list): List of columns that contain date-based values.

    Returns:
        pd.DataFrame: A transformed DataFrame in a long/analyst-friendly format.
            The six output columns are always present, even when no record
            was produced.
    """
    # Function-scope import: the module only imports the ``datetime`` class.
    from datetime import timezone

    output_columns = ['Area', 'Suburb', 'Date', 'Count', 'Median',
                      'Transformation_Timestamp']

    # Pair the count/median columns per base date (e.g. "Mar 2024") once, up
    # front: the grouping depends only on the column labels, not on the rows.
    date_groups = {}
    for date_col in date_cols:
        date_str = str(date_col)
        match = re.search(r'(Mar|Jun|Sep|Dec)\s+(\d{4})', date_str)
        if not match:
            continue
        month, year = match.groups()
        group = date_groups.setdefault(
            f"{month} {year}", {'count_col': None, 'median_col': None})
        # Determine if the column is for median (using a numeric pattern) or count
        if re.search(r'\d+\.\d+$', date_str):
            group['median_col'] = date_col
        else:
            group['count_col'] = date_col

    # A single naive-UTC timestamp stamps every record of this run.
    # datetime.utcnow() is deprecated, and calling it per record gave each
    # row of the same transformation a slightly different value.
    run_timestamp = datetime.now(timezone.utc).replace(tzinfo=None)

    result_data = []

    # Iterate over each row; assume the first two rows are non-data header rows.
    for idx, row in df.iterrows():
        if idx < 2:
            continue  # Skip non-data header rows

        # Extract location information with defaults
        area = row.iloc[0] if pd.notna(row.iloc[0]) else "Unknown"
        suburb = row.iloc[1] if pd.notna(row.iloc[1]) else "Unknown"

        # For initial debugging (optional)
        if idx < 5:
            logging.debug(f"Row {idx}: Area = {area}, Suburb = {suburb}")

        # Process each date group and extract values
        for base_date, cols in date_groups.items():
            count_value = row[cols['count_col']] if cols['count_col'] is not None else None
            median_value = row[cols['median_col']] if cols['median_col'] is not None else None

            # Exclude rows where the value is simply the header label "Count" or "Median"
            has_count = pd.notna(count_value) and not (
                isinstance(count_value, str) and count_value.strip() == "Count")
            has_median = pd.notna(median_value) and not (
                isinstance(median_value, str) and median_value.strip() == "Median")
            if has_count or has_median:
                result_data.append({
                    'Area': area,
                    'Suburb': suburb,
                    'Date': base_date,  # Will be converted to datetime later
                    'Count': count_value,
                    'Median': median_value,
                    # Metadata column with the transformation timestamp
                    'Transformation_Timestamp': run_timestamp,
                })

    logging.info(f"Transformed data into {len(result_data)} rows.")
    # Passing the column list keeps the schema stable when nothing was
    # produced, so later steps can still select 'Date', 'Median', etc.
    return pd.DataFrame(result_data, columns=output_columns)

def clean_data(result_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the transformed DataFrame by converting the 'Date' column to datetime,
    formatting the 'Median' column as currency, and sorting the data.

    The input DataFrame is left untouched; all changes are made on a copy.

    Parameters:
        result_df (pd.DataFrame): The transformed DataFrame. Must contain the
            'Area', 'Suburb', 'Date' and 'Median' columns.

    Returns:
        pd.DataFrame: The cleaned DataFrame, sorted by 'Area', 'Suburb', 'Date'.
            'Date' holds datetimes (NaT where parsing failed) and 'Median'
            holds "$<value>" strings (None where the value was missing).

    Raises:
        SystemExit: With exit code 1 when any cleaning step fails; the
            underlying error is logged first.
    """
    try:
        # Work on a copy so the caller's DataFrame is not modified in place.
        cleaned = result_df.copy()
        # Convert the 'Date' column to datetime; non-convertible values become NaT
        cleaned['Date'] = pd.to_datetime(cleaned['Date'], format='%b %Y', errors='coerce')
        # Format the 'Median' column as currency where applicable
        cleaned['Median'] = cleaned['Median'].apply(lambda x: f"${x}" if pd.notna(x) else None)
        # Sort the DataFrame by 'Area', 'Suburb', and 'Date'
        cleaned = cleaned.sort_values(['Area', 'Suburb', 'Date'])
        logging.info("Data cleaning completed successfully.")
        return cleaned
    except Exception as e:
        logging.error(f"Error during data cleaning: {e}")
        sys.exit(1)

def validate_data(result_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run data-quality checks on the cleaned DataFrame.

    Currently a single check: a warning is logged when any 'Date' value is
    null. The frame itself is passed through unchanged. Further schema rules
    (for example via a framework such as Great Expectations) can be added here.

    Parameters:
        result_df (pd.DataFrame): The cleaned DataFrame; must have a 'Date' column.

    Returns:
        pd.DataFrame: The same DataFrame that was passed in.
    """
    # A null date means the value could not be turned into a datetime upstream.
    missing_dates = result_df['Date'].isnull()
    if missing_dates.any():
        logging.warning("Some rows have invalid Date values after conversion.")

    # Room for further checks, e.g. numeric type validation of other columns.
    logging.info("Data validation completed.")
    return result_df

def save_data(df: pd.DataFrame, output_path: str) -> None:
    """
    Write the DataFrame to a CSV file (without the index) and log the row count.

    Parameters:
        df (pd.DataFrame): The DataFrame to write.
        output_path (str): Destination path of the CSV file.

    Raises:
        SystemExit: With exit code 1 when the file cannot be written; the
            underlying error is logged first.
    """
    try:
        df.to_csv(output_path, index=False)
    except Exception as e:
        # A failed write is fatal for the pipeline: log it and stop.
        logging.error(f"Error saving data to {output_path}: {e}")
        sys.exit(1)

    row_total = df.shape[0]
    logging.info(f"Data saved successfully to {output_path}. Total rows: {row_total}")

def main(file_path: str, *, sheet_name: str = 'All properties',
         output_file: str = 'cleaned_data.csv') -> None:
    """
    Main pipeline function that executes all steps:
    reading, extracting, transforming, cleaning, validating, and saving data.

    Parameters:
        file_path (str): The path to the Excel file.
        sheet_name (str): Worksheet to process (keyword-only; default is
            'All properties').
        output_file (str): Path of the CSV file to write (keyword-only;
            default is 'cleaned_data.csv').

    Returns:
        None. The cleaned data is written to ``output_file`` and a preview of
        its first rows is printed.
    """
    # Step 1: Read Data (row 1 of the sheet holds the column labels)
    df = read_data(file_path, sheet_name, header=1)

    # Step 2: Extract Date-Based Columns
    date_cols = extract_date_columns(df)

    # Step 3: Transform Data into a Structured Format
    transformed_df = transform_data(df, date_cols)

    # Step 4: Clean the Transformed Data
    cleaned_df = clean_data(transformed_df)

    # Step 5: Validate Data Quality
    validated_df = validate_data(cleaned_df)

    # Step 6: Save the Cleaned Data
    save_data(validated_df, output_file)

    # Display the cleaned DataFrame for inspection
    print("Final cleaned DataFrame preview:")
    print(validated_df.head())

# For Colab, determine the uploaded file name from the upload cell's result.
# Stop with a clear message when nothing was uploaded (for example the upload
# dialog was cancelled) instead of failing with a bare IndexError.
if not uploaded:
    logging.error("No file was uploaded; re-run the upload cell before running the pipeline.")
    sys.exit(1)

# The first uploaded file is used; upload only the workbook to be processed.
uploaded_file_name = next(iter(uploaded))

# Run the pipeline with the uploaded file.
main(uploaded_file_name)


Final cleaned DataFrame preview:
           Area    Suburb       Date Count Median   Transformation_Timestamp
12474  Ballarat  Ballarat 2000-03-01   995   $135 2025-03-17 04:39:48.985251
12475  Ballarat  Ballarat 2000-06-01   981   $135 2025-03-17 04:39:48.985263
12476  Ballarat  Ballarat 2000-09-01   979   $138 2025-03-17 04:39:48.985273
12477  Ballarat  Ballarat 2000-12-01   963   $140 2025-03-17 04:39:48.985282
12478  Ballarat  Ballarat 2001-03-01   876   $140 2025-03-17 04:39:48.985292
