In [1]:
import pandas as pd
import os

In [2]:
processed_data_dir = "data/processed"
crimes_data_clean_path = os.path.join(processed_data_dir, "crimes_clean.csv")
victims_data_clean_path = os.path.join(processed_data_dir, "victims_clean.csv")

In [3]:
final_data_dir = "data/final"
merged_path = os.path.join(final_data_dir, "merged_victims_and_crimes.csv")

## ------ Main Integration Function ------

This is the main script execution that loads the clean data from `clean.ipynb`, merges the two datasets together, and saves the final integrated dataset into a single dataset called `merged_victims_and_crimes.csv`.

In [4]:
def integrate():
    print("--- Starting Data Integration ---")
    
    # 1. Create output directory
    os.makedirs(final_data_dir, exist_ok=True)
    
    # 2. Load Clean Datasets
    print(f"Loading clean datasets from '{processed_data_dir}'...")
    try:
        victims_df = pd.read_csv(victims_data_clean_path, low_memory=False)
        crimes_df = pd.read_csv(crimes_data_clean_path,low_memory=False)
        print("Clean datasets have been loaded.")
    except FileNotFoundError:
        print("ERROR: The processed data files are not found.")
        print("Run 'clean.ipynb' first.")
        return
    except Exception as e:
        print(f"ERROR loading the cleaned data: {e}")
        return

    # The Integration using a Left Merge
    
    print(f"Integrating 'Victims' ({len(victims_df)} rows) with 'Crimes' ({len(crimes_df)} rows)...")
    
    merged_df = pd.merge(
        left = victims_df,
        right = crimes_df,
        on = 'case_number',
        how = 'left',
        suffixes=('_victim', '_incident')
    )
    
    print(f"The integration is complete. The new dataset has {len(merged_df)} rows.")

    # 4. Check the data post-integration
    
    # Check for 'date' inconsistencies
    date_mismatches = merged_df[
        merged_df['date_victim'].notna() &
        merged_df['date_incident'].notna() &
        (merged_df['date_victim'] != merged_df['date_incident'])
    ]
    print(f"  Information: Found {len(date_mismatches)} rows where victim date != incident date.")
    
    # Check for 'iucr' inconsistencies
    iucr_mismatches = merged_df[
        merged_df['victimization_iucr_cd'].notna() &
        merged_df['iucr'].notna() &
        (merged_df['victimization_iucr_cd'] != merged_df['iucr'])
    ]
    print(f"  Information: Found {len(iucr_mismatches)} rows where victim IUCR != incident IUCR.")

    # 5. Save Final Dataset
    
    print(f"Saving final merged dataset to '{merged_path}'...")
    merged_df.to_csv(merged_path, index=False)
    
    print("The final merged dataset has been saved!")
    print("--- Data Integration Complete ---")

In [5]:
if __name__ == "__main__":
    integrate()

--- Starting Data Integration ---
Loading clean datasets from 'data/processed'...
Clean datasets have been loaded.
Integrating 'Victims' (62812 rows) with 'Crimes' (8443050 rows)...
The integration is complete. The new dataset has 64536 rows.
  Information: Found 13042 rows where victim date != incident date.
  Information: Found 2473 rows where victim IUCR != incident IUCR.
Saving final merged dataset to 'data/final/merged_victims_and_crimes.csv'...
The final merged dataset has been saved!
--- Data Integration Complete ---
