In [40]:
import json
import zipfile
from collections import defaultdict
from pathlib import Path

# Function to create directory if it doesn't exist
def create_directory(directory):
    """Ensure that ``directory`` exists, creating missing parents as needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Target directory, given as a ``pathlib.Path`` or any
            path-like value accepted by ``Path`` (e.g. a string).

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # A single mkdir(exist_ok=True) replaces the separate exists() check, so
    # there is no window in which another process can create the directory
    # between the check and the creation.
    Path(directory).mkdir(parents=True, exist_ok=True)

# Build a zipped JSON report with the average revenue per listing for every
# postcode found in the cleaned Airbnb and rentals datasets.
#
# Any failure is reported on stdout and swallowed so that the surrounding
# session keeps running (top-level boundary of the script).
try:
    # Locations of the cleaned input datasets.
    rentals_file_path = Path('src/data/data_cleaned/cleaned_rentals_data.json')
    airbnb_file_path = Path('src/data/data_cleaned/cleaned_airbnb_data.json')

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding (assumes the cleaned files are UTF-8, JSON's standard encoding).
    with open(rentals_file_path, 'r', encoding='utf-8') as rental_file:
        rental_data = json.load(rental_file)

    with open(airbnb_file_path, 'r', encoding='utf-8') as airbnb_file:
        airbnb_data = json.load(airbnb_file)

    # postcode -> list of listings / number of listings with that postcode.
    revenue_per_postcode = defaultdict(list)
    entry_count_per_postcode = defaultdict(int)

    # The two datasets store the postcode under different keys. Listings
    # without a (truthy) postcode are ignored.
    for dataset, postcode_key in ((airbnb_data, 'zipcode'), (rental_data, 'postalCode')):
        for entry in dataset:
            postcode = entry.get(postcode_key)
            if postcode:
                revenue_per_postcode[postcode].append(entry)
                entry_count_per_postcode[postcode] += 1

    # Average revenue per postcode, taken over listings that carry an '_id'.
    average_revenue_per_postcode = {}
    for postcode, entries in revenue_per_postcode.items():
        revenues = []
        for entry in entries:
            if '_id' not in entry:
                continue
            # Prefer 'rent' and fall back to 'price'.
            revenue = entry.get('rent')
            if revenue is None:
                revenue = entry.get('price')
            # A listing with neither value is skipped; a None in the list
            # would make sum() raise TypeError and abort the whole report.
            if revenue is not None:
                revenues.append(revenue)
        if revenues:
            average_revenue_per_postcode[postcode] = sum(revenues) / len(revenues)

    # Prepare the JSON output: one record per postcode that has an average.
    output_list = []
    for postcode, avg_revenue in average_revenue_per_postcode.items():
        ids = [entry['_id'] for entry in revenue_per_postcode[postcode] if '_id' in entry]
        # NOTE: "_id" is emitted as a single string that merely looks like a
        # list (e.g. '["a", "b"]'), not as a JSON array. Kept unchanged so
        # existing consumers of the report see the same schema.
        ids_str = ", ".join(f'"{entry_id}"' for entry_id in ids)

        output_list.append({
            "postalcode": postcode,
            "avg": f"€{avg_revenue:.1f}",
            "_id": f"[{ids_str}]",
            "entry_count": entry_count_per_postcode[postcode],
        })

    # Define output file paths
    output_directory = Path('src/data/calculations/')
    output_file_path = output_directory / 'avg_revenue_per_hse_per_postcode.json'
    zip_file_path = output_directory / 'avg_revenue_per_hse_per_postcode.zip'

    # Create directories if they don't exist
    create_directory(output_directory)

    try:
        # Write the intermediate JSON file that gets packed into the archive.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(output_list, output_file)

        # ZipFile defaults to ZIP_STORED (no compression); request DEFLATE so
        # the archive is actually smaller than the JSON it contains.
        with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zip_file:
            zip_file.write(output_file_path, arcname=output_file_path.name)
    finally:
        # Remove the intermediate JSON even when writing or zipping failed,
        # so no stray or partial file is left next to the archive.
        if output_file_path.exists():
            output_file_path.unlink()

    print("Zip file created successfully:", zip_file_path)

except Exception as e:
    print("An error occurred:", e)


Zip file created successfully: src/data/calculations/avg_revenue_per_hse_per_postcode.zip
