In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Folder that holds the untouched source CSV files.
raw_data_dir = "data/raw"

# Full path of each raw dataset inside that folder.
victims_data_raw_path = os.path.join(raw_data_dir, "violence_reduction_victims.csv")
crimes_data_raw_path = os.path.join(raw_data_dir, "crimes_2001_to_present.csv")

In [3]:
# Folder that receives the cleaned, privacy-safe CSV files.
processed_data_dir = "data/processed"

# Full path of each cleaned dataset inside that folder.
victims_data_processed_path = os.path.join(processed_data_dir, "victims_clean.csv")
crimes_data_processed_path = os.path.join(processed_data_dir, "crimes_clean.csv")

### --- Helper Functions ---

These are the three helper functions we use to clean both datasets and to enrich them with new columns that add analytical value:
- **standardize_column_names:** Converts all column names to lowercase and replaces spaces with underscores
- **clean_victims_data:** Applies all cleaning and enrichment steps to the Victims dataset
- **clean_crimes_data:** Applies all cleaning and enrichment steps to the Crimes dataset

In [4]:
def standardize_column_names(df):
    """Rename the columns of ``df`` to lowercase with spaces turned into underscores.

    The frame is modified in place and the same object is returned so the
    call can be used in an assignment.
    """
    lowercased = df.columns.str.lower()
    underscored = lowercased.str.replace(' ', '_')
    df.columns = underscored
    return df

In [5]:
def clean_victims_data(df):
    """Clean and enrich the 'Victims' dataset.

    Expects a frame whose column names are already standardized (lowercase,
    underscores) and that contains at least ``case_number``, ``age``, ``sex``,
    ``race`` and ``date``.

    Steps performed:
      * drop rows without a case number and strip whitespace from the rest;
      * convert ``age`` to numeric (non-numeric tokens such as 'UNKNOWN' -> NaN);
      * standardize ``sex`` to MALE / FEMALE / UNKNOWN/OTHER;
      * standardize ``race`` to BLACK / WHITE / ASIAN / OTHER/UNKNOWN;
      * parse ``date`` with the '%m/%d/%Y %I:%M:%S %p' format (failures -> NaT);
      * derive ``age_group`` from ``age``;
      * drop the quasi-identifiers ``age``, ``block``, ``latitude``, ``longitude``.

    Parameters
    ----------
    df : pandas.DataFrame
        Standardized victims data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    print("  Cleaning 'Victims' data...")

    # --- Cleaning Key Fields ---
    null_case_numbers = df['case_number'].isnull().sum()
    if null_case_numbers > 0:
        print(f"    - WARNING: Found {null_case_numbers} null case numbers. Dropping them.")
    # Always work on an independent copy so the caller's frame is never
    # mutated, whether or not any rows were dropped.
    df = df.dropna(subset=['case_number']).copy()
    df['case_number'] = df['case_number'].str.strip()

    # 1. Clean 'age'
    # errors='coerce' turns 'UNKNOWN' (and any other non-numeric token) into NaN.
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    print("    - 'age' cleaned: Replaced 'UNKNOWN' and converted to numeric.")

    # 2. Clean 'sex'
    # Values are stripped and upper-cased before the lookup, so the keys must
    # be upper-case too. Anything not listed falls back to UNKNOWN/OTHER.
    gender_map = {
        "M": "MALE",
        "F": "FEMALE",
        "X": "UNKNOWN/OTHER",
        "UNKNOWN": "UNKNOWN/OTHER",
    }
    normalized_sex = df['sex'].str.strip().str.upper()
    df['sex'] = normalized_sex.map(gender_map).fillna("UNKNOWN/OTHER")
    print("    - 'sex' cleaned: Standardized to MALE/FEMALE/UNKNOWN-OTHER.")

    # 3. Clean 'race'
    # Keys are upper-case because the values are upper-cased before the
    # lookup; mixed-case keys would never match and every row would end up
    # as OTHER/UNKNOWN.
    # NOTE(review): if the raw export uses short codes instead of full names,
    # extend this map accordingly - verify against the raw data.
    race_map = {
        "BLACK OR AFRICAN AMERICAN": "BLACK",
        "WHITE": "WHITE",
        "ASIAN/PACIFIC ISLANDER": "ASIAN",
    }
    normalized_race = df['race'].str.upper().str.strip()
    df['race'] = normalized_race.map(race_map).fillna("OTHER/UNKNOWN")
    print("    - 'race' cleaned: Standardized common race entries.")

    # --- Cleaning Date Fields ---
    date_format = '%m/%d/%Y %I:%M:%S %p'
    df['date'] = pd.to_datetime(df['date'], format=date_format, errors='coerce')
    print("    - 'date' converted to datetime object.")

    # --- Enrichment for Privacy/Ethics ---
    # Create a new 'age_group' column for analysis and privacy
    age_bins = [0, 17, 25, 34, 44, 54, 64, 120]
    age_labels = ['0-17 (Minor)', '18-25', '26-34', '35-44', '45-54', '55-64', '65+']

    # include_lowest=True keeps age 0 (infants) in the first bin; without it
    # the right-closed interval (0, 17] would leave them unlabelled.
    age_group = pd.cut(
        df['age'],
        bins=age_bins,
        labels=age_labels,
        right=True,
        include_lowest=True,
    )

    # Store plain string labels (not a categorical) so the column saves
    # cleanly to other formats such as Parquet; missing ages become pd.NA.
    df['age_group'] = age_group.astype(object).where(age_group.notna(), pd.NA)
    print("    - Enriched 'age' with new 'age_group' column.")

    cols_to_drop = ['age', 'block', 'latitude', 'longitude']
    df = df.drop(columns=cols_to_drop, errors='ignore')
    print("    - Ethics fixing by dropping high-risk QIs to create safe data.")

    return df

In [6]:
def clean_crimes_data(df):
    """Clean and enrich the 'Crimes' dataset.

    Expects a frame whose column names are already standardized (lowercase,
    underscores) and that contains at least ``case_number``, ``date``,
    ``updated_on`` and ``iucr``.

    Steps performed:
      * drop rows without a case number and strip whitespace from the rest;
      * parse ``date`` and ``updated_on`` with the '%m/%d/%Y %I:%M:%S %p'
        format (failures -> NaT);
      * add ``year``, ``month``, ``day_of_week`` and ``time_of_day`` derived
        from ``date``;
      * add ``crime_category`` by looking up ``iucr`` in a fixed code map
        (unlisted codes -> 'OTHER');
      * drop the location quasi-identifiers (block, latitude, longitude,
        location, x_coordinate, y_coordinate).

    ``time_of_day`` uses left-closed hour ranges:
    Night 00-05, Morning 06-11, Afternoon 12-17, Evening 18-23.

    Parameters
    ----------
    df : pandas.DataFrame
        Standardized crimes data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    print("  Cleaning 'Crimes' data...")

    # --- Cleaning Key Fields ---
    null_case_numbers = df['case_number'].isnull().sum()
    if null_case_numbers > 0:
        print(f"    - WARNING: Found {null_case_numbers} null case numbers. Dropping them.")
    # Always work on an independent copy so the caller's frame is never
    # mutated, whether or not any rows were dropped.
    df = df.dropna(subset=['case_number']).copy()
    df['case_number'] = df['case_number'].str.strip()

    # --- Cleaning Date Fields ---
    date_format = '%m/%d/%Y %I:%M:%S %p'
    df['date'] = pd.to_datetime(df['date'], format=date_format, errors='coerce')
    df['updated_on'] = pd.to_datetime(df['updated_on'], format=date_format, errors='coerce')
    print("    - 'date' and 'updated_on' converted to datetime objects.")

    # --- Enrichment for Time-Based Features ---
    print("    - Enriching with time-based features...")
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.day_name()

    # Left-closed bins ([0, 6), [6, 12), [12, 18), [18, 24)) so that noon
    # (hour 12) is 'Afternoon' and 6 PM (hour 18) is 'Evening'. Right-closed
    # bins would label 12:xx PM as 'Morning'.
    hour = df['date'].dt.hour
    time_of_day = pd.cut(
        hour,
        bins=[0, 6, 12, 18, 24],
        right=False,
        labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    )
    # Store plain string labels (not a categorical); rows with no valid
    # date get pd.NA.
    df['time_of_day'] = time_of_day.astype(object).where(time_of_day.notna(), pd.NA)

    # --- Enrichment for mapping 'iucr' to categories ---

    crime_map = {
        '0110': 'HOMICIDE', '0130': 'HOMICIDE',
        '0261': 'CRIMINAL SEXUAL ASSAULT', '0262': 'CRIMINAL SEXUAL ASSAULT',
        '031A': 'ROBBERY', '031B': 'ROBBERY',
        '041A': 'AGGRAVATED ASSAULT', '041B': 'AGGRAVATED ASSAULT',
        '0460': 'BATTERY', '0486': 'BATTERY',
        '0560': 'ASSAULT',
        '0610': 'BURGLARY', '0620': 'BURGLARY',
        '0810': 'THEFT', '0820': 'THEFT',
        '0910': 'MOTOR VEHICLE THEFT', '0920': 'MOTOR VEHICLE THEFT',
        '141A': 'WEAPONS VIOLATION', '141B': 'WEAPONS VIOLATION',
        '1811': 'NARCOTICS', '1812': 'NARCOTICS'
    }

    # Normalize the lookup key only (the 'iucr' column itself is left as-is):
    # the map keys are 4-character, upper-case, zero-padded strings, so a
    # column read as integers (leading zeros lost) or carrying stray
    # whitespace / lowercase letters would otherwise never match.
    iucr_key = df['iucr'].astype(str).str.strip().str.upper().str.zfill(4)
    df['crime_category'] = iucr_key.map(crime_map).fillna('OTHER')
    print("    - Enriched 'iucr' with high-level 'crime_category'.")

    cols_to_drop = ['block', 'latitude', 'longitude', 'location', 'x_coordinate', 'y_coordinate']
    df = df.drop(columns=cols_to_drop, errors='ignore')
    print("    - Ethics fixing by dropping high-risk QIs (block, lat/lon, coordinates).")

    return df

### ---- Main Cleaning Function -----

This function loads, cleans, enriches, and saves both datasets.

In [7]:
def main():
    """Run the end-to-end cleaning pipeline for both datasets.

    Reads the two raw CSV files, standardizes their column names, applies the
    dataset-specific cleaning/enrichment functions, and writes the results to
    the processed-data directory. If the raw files cannot be loaded, an error
    message is printed and the function returns without saving any files.
    """
    print("--- Starting Data Cleaning ---")

    # The output folder must exist before the CSVs are written at the end.
    os.makedirs(processed_data_dir, exist_ok=True)

    # Step 1 - load both raw files; any failure aborts the run with a message.
    print(f"Loading raw datasets from '{raw_data_dir}'...")
    try:
        victims_raw = pd.read_csv(victims_data_raw_path, low_memory=False)
        crimes_raw = pd.read_csv(crimes_data_raw_path, low_memory=False)
        print("Raw datasets loaded.")
    except FileNotFoundError:
        print("ERROR: Raw data files were not found. Run 'acquire.ipynb' first.")
        return
    except Exception as e:
        print(f"ERROR loading raw data: {e}")
        return

    # Step 2 - bring column names into lowercase_with_underscores form.
    victims_named = standardize_column_names(victims_raw)
    crimes_named = standardize_column_names(crimes_raw)
    print("Standardized all the column names")

    # Step 3 - dataset-specific cleaning and enrichment.
    victims_df_clean = clean_victims_data(victims_named)
    crimes_df_clean = clean_crimes_data(crimes_named)

    # Step 4 - report the final schema, then persist both frames.
    print(f"Saving processed files to '{processed_data_dir}'...")

    print(f"Columns in *clean victims* file: {victims_df_clean.columns.tolist()}")
    print(f"Columns in *clean crimes* file: {crimes_df_clean.columns.tolist()}")

    for cleaned_frame, destination in (
        (victims_df_clean, victims_data_processed_path),
        (crimes_df_clean, crimes_data_processed_path),
    ):
        cleaned_frame.to_csv(destination, index=False)

    print("Success: Cleaned and enriched files saved.")
    print("--- Data Cleaning Complete ---")


# Run the pipeline when this cell/file is executed as the main program.
if __name__ == "__main__":
    main()

--- Starting Data Cleaning ---
Loading raw datasets from 'data/raw'...
Raw datasets loaded.
Standardized all the column names
  Cleaning 'Victims' data...
    - 'age' cleaned: Replaced 'UNKNOWN' and converted to numeric.
    - 'sex' cleaned: Standardized to MALE/FEMALE/UNKNOWN-OTHER.
    - 'race' cleaned: Standardized common race entries.
    - 'date' converted to datetime object.
    - Enriched 'age' with new 'age_group' column.
    - Ethics fixing by dropping high-risk QIs to create safe data.
  Cleaning 'Crimes' data...
    - 'date' and 'updated_on' converted to datetime objects.
    - Enriching with time-based features...
    - Enriched 'iucr' with high-level 'crime_category'.
    - Ethics fixing by dropping high-risk QIs (block, lat/lon, coordinates).
Saving processed files to 'data/processed'...
Columns in *clean victims* file: ['case_number', 'date', 'victimization_primary', 'incident_primary', 'gunshot_injury_i', 'unique_id', 'zip_code', 'ward', 'community_area', 'street_outrea