In [7]:
import pandas as pd
import os

In [8]:
# Input: the merged Victims + Crimes table read by main().
final_data_dir = "data/final"
merged_path = os.path.join(final_data_dir, "merged_victims_and_crimes.csv")

# Output: one plain-text report per analysis function, all under results_dir.
results_dir = "results"
(
    lag_analysis_path,
    iucr_analysis_path,
    ethical_agg_analysis_path,
) = (
    os.path.join(results_dir, report_file)
    for report_file in (
        "lag_analysis.txt",
        "iucr_inconsistency_analysis.txt",
        "ethical_aggregate_analysis.txt",
    )
)

### --- Analysis Functions ---

These three functions each address a specific part of our research questions, helping us understand the datasets and their results:
- **analyze_time_lag:** Compares the 'updated' lag from the Victims dataset vs. the 'updated_on' lag from the Crimes dataset to answer research question 1 (How does the time delay between a crime occurring and the public data update differ between the broad “Crimes - 2001 to Present” dataset and the specialized “Violence Reduction - Victims of Homicides and Non-Fatal Shootings” dataset?).
- **analyze_iucr_inconsistencies:** Identifies inconsistencies between the IUCR variables in both datasets to answer research question 7 (Do a crime's final classification codes (i.e. IUCR codes) show inconsistencies between the “Crimes - 2001 to Present” dataset and the records in the “Violence Reduction - Victims of Homicides and Non-Fatal Shootings” dataset for the same event?).
- **analyze_generalized_data:** Performs an aggregate analysis on the generalized (safe) fields of the data to ensure it has been ethically handled.

In [15]:
def analyze_time_lag(df):
    """Compare the publication lag of the Victims and Crimes datasets.

    For every row the lag is the whole number of days between the event date
    and the corresponding last-update timestamp:

    * Victims lag = ``updated``    - ``date_victim``
    * Crimes lag  = ``updated_on`` - ``date_incident``

    Descriptive statistics (``Series.describe()``) of both lags are written
    to ``lag_analysis_path``.

    Args:
        df: Merged DataFrame holding the four date columns listed above.
            The frame is only read; it is never modified.

    Returns:
        None. If a required column is absent, an error is printed and no
        report is written (same convention as ``analyze_generalized_data``).
    """
    print("Running Time Lag Analysis...")

    date_cols = ['date_victim', 'updated', 'date_incident', 'updated_on']
    missing_cols = [col for col in date_cols if col not in df.columns]
    if missing_cols:
        print(f"ERROR: Missing required date columns: {missing_cols}")
        return

    # Parse into local Series instead of writing back into `df`, so the
    # caller's frame is left untouched. Unparseable values become NaT and
    # therefore drop out of the statistics below.
    parsed = {col: pd.to_datetime(df[col], errors='coerce') for col in date_cols}

    # Calculate lags in days. The Series are named so the "Name:" line that
    # describe() prints in the report identifies which lag it belongs to.
    victim_lag = (parsed['updated'] - parsed['date_victim']).dt.days.rename('victim_data_lag')
    incident_lag = (parsed['updated_on'] - parsed['date_incident']).dt.days.rename('incident_data_lag')

    # Get descriptive statistics for each lag
    victim_lag_stats = victim_lag.describe()
    incident_lag_stats = incident_lag.describe()

    # Make sure the report folder exists so the function also works when it
    # is run on its own rather than through main().
    report_dir = os.path.dirname(lag_analysis_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    # Save results to a text file
    with open(lag_analysis_path, 'w', encoding='utf-8') as f:
        f.write("--- Time Lag Analysis ---\n\n")

        f.write("Analysis of 'Violence Reduction - Victims' Dataset Lag:\n")
        f.write("(Lag = 'updated' date - 'date_victim')\n")
        f.write(str(victim_lag_stats))
        f.write("\n\n")

        f.write("Analysis of 'Crimes - 2001 to Present' Dataset Lag:\n")
        f.write("(Lag = 'updated_on' date - 'date_incident')\n")
        f.write(str(incident_lag_stats))

    print(f"Results saved to '{lag_analysis_path}'")

In [16]:
def _normalize_iucr(codes):
    """Return IUCR codes as upper-case strings zero-padded to four characters.

    Missing values stay missing. Normalizing first means a code that lost its
    leading zero (or gained a trailing ``.0``) when a CSV column was parsed as
    numeric still compares equal to its text form, e.g. ``110`` == ``"0110"``.
    """
    text = codes.astype("string").str.strip().str.upper()
    # A column parsed as float (because it also holds NaN) renders 110 as "110.0".
    text = text.str.replace(r"\.0$", "", regex=True)
    # A blank cell carries no code; treat it as missing instead of padding it to "0000".
    is_blank = text.str.len().eq(0).fillna(False).astype(bool)
    text = text.mask(is_blank)
    return text.str.zfill(4)


def analyze_iucr_inconsistencies(df):
    """Report how often the Victims and Crimes IUCR codes disagree.

    Compares ``victimization_iucr_cd`` (Victims) with ``iucr`` (Crimes) on
    every row where both are present, after normalizing both to zero-padded
    strings, and writes the totals, the mismatch percentage and the 20 most
    frequent mismatching pairs to ``iucr_analysis_path``.

    Args:
        df: Merged DataFrame containing both IUCR columns. It is only read.

    Returns:
        None. If either column is absent, an error is printed and no report
        is written (same convention as ``analyze_generalized_data``).
    """
    print("IUCR Inconsistency Analysis...")

    # Identify the two columns to compare
    victim_col = 'victimization_iucr_cd'
    incident_col = 'iucr'

    missing_cols = [col for col in (victim_col, incident_col) if col not in df.columns]
    if missing_cols:
        print(f"ERROR: Missing required IUCR columns: {missing_cols}")
        return

    codes = pd.DataFrame({
        victim_col: _normalize_iucr(df[victim_col]),
        incident_col: _normalize_iucr(df[incident_col]),
    })

    # Only rows where both datasets supply a code can be compared.
    comparable = codes.dropna()
    differs = (comparable[victim_col] != comparable[incident_col]).astype(bool)
    mismatches_df = comparable[differs]

    total_comparable = len(comparable)
    num_mismatches = len(mismatches_df)

    # Find the most common mismatch pairs. A stable sort keeps tied pairs in
    # code order, so the report is identical from run to run.
    mismatch_pairs = (
        mismatches_df.groupby([victim_col, incident_col])
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False, kind='stable')
    )

    # With nothing to compare, the percentage is undefined rather than a crash.
    if total_comparable:
        percentage_line = f"Percentage of mismatch: {(num_mismatches / total_comparable) * 100:.2f}%\n\n"
    else:
        percentage_line = "Percentage of mismatch: N/A (no comparable rows)\n\n"

    # Make sure the report folder exists so the function also works when it
    # is run on its own rather than through main().
    report_dir = os.path.dirname(iucr_analysis_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    # Save results to a text file
    with open(iucr_analysis_path, 'w', encoding='utf-8') as f:
        f.write("--- IUCR Code Inconsistency Analysis ---\n\n")
        f.write(f"Comparing: '{victim_col}' (from Victims) vs. '{incident_col}' (from Crimes)\n\n")

        f.write(f"Total rows with comparable IUCR codes: {total_comparable}\n")
        f.write(f"Total rows with mismatched IUCR codes: {num_mismatches}\n")
        f.write(percentage_line)

        f.write("--- Top 20 Most Common Mismatches ---\n")
        f.write(f"({'Victim IUCR':<15} | {'Incident IUCR':<15} | {'Count':<10})\n")
        f.write("-" * 45 + "\n")
        top_pairs = mismatch_pairs.head(20).itertuples(index=False, name=None)
        for victim_code, incident_code, pair_count in top_pairs:
            f.write(f"({victim_code:<15} | {incident_code:<15} | {pair_count:<10})\n")

    print(f"Results saved to '{iucr_analysis_path}'")

In [17]:
def analyze_generalized_data(df):
    """Write an aggregate victim-count report over the generalized columns.

    Groups ``df`` by ``community_area_victim``, ``age_group`` and ``sex``,
    counts the rows in each group, and writes the 20 largest groups to
    ``ethical_agg_analysis_path``.

    Args:
        df: Merged DataFrame expected to contain the three grouping columns.

    Returns:
        None. If any grouping column is absent, an error is printed and no
        report is written.
    """
    print("Ethical Aggregate Analysis...")

    # Generalized ("safe") fields prepared in clean.ipynb
    group_keys = ['community_area_victim', 'age_group', 'sex']

    # Bail out early when any grouping column is unavailable
    absent = []
    for key in group_keys:
        if key not in df.columns:
            absent.append(key)
    if absent:
        print(f"ERROR: Missing required generalized columns: {absent}")
        print("This analysis uses the 'safe' columns created in clean.ipynb.")
        return

    # Rows per (community area, age group, sex) combination, largest first
    group_sizes = (
        df.groupby(group_keys)
        .size()
        .reset_index(name='victim_count')
        .sort_values(by='victim_count', ascending=False)
    )

    # Write the header text followed by the 20 largest groups
    with open(ethical_agg_analysis_path, 'w') as report:
        report.writelines((
            "--- Ethical Aggregate Analysis Report ---\n\n",
            "This analysis demonstrates the use of generalized fields ('age_group', 'community_area')\n",
            "to perform analysis without using high-risk individual identifiers.\n\n",
            "--- Top 20 Most Frequent Victim Groups (Aggregated) ---\n",
            group_sizes.head(20).to_string(),
        ))

    print(f"The results were saved to '{ethical_agg_analysis_path}'")

## ------ Main Analysis Function ------

This is the main entry point: it loads the final merged data from `merged_victims_and_crimes.csv` and runs all of the analysis functions defined above.

In [18]:
def main():
    """Load the merged dataset and run every analysis on it.

    Creates ``results_dir`` if needed, reads ``merged_path`` into a DataFrame,
    and hands a separate copy of that frame to each analysis function in turn.
    If the file cannot be loaded, an error is printed and nothing is analyzed.

    Returns:
        None.
    """
    print("--- Starting Data Analysis ---")

    # The analysis functions write their reports into this folder.
    os.makedirs(results_dir, exist_ok=True)

    print(f"Loading merged dataset from '{merged_path}'...")
    try:
        combined = pd.read_csv(merged_path, low_memory=False)
        row_total = len(combined)
        print(f"The merged dataloaded ({row_total} rows).")
    except FileNotFoundError:
        print("ERROR: The merged data file was not found. Run 'integrate.ipynb' first.")
        return
    except Exception as load_error:
        print(f"ERROR loading the merged data: {load_error}")
        return

    # Each analysis receives its own copy, so none of them can affect the others.
    analyses = (
        analyze_time_lag,
        analyze_iucr_inconsistencies,
        analyze_generalized_data,
    )
    for run_analysis in analyses:
        run_analysis(combined.copy())

    print("\n--- Data Analysis Complete ---")
    print(f"All the results were saved in '{results_dir}' folder.")

In [19]:
# Entry point: executing this cell runs the full analysis pipeline via main().
if __name__ == "__main__":
    main()

--- Starting Data Analysis ---
Loading merged dataset from 'data/final/merged_victims_and_crimes.csv'...
The merged dataloaded (64536 rows).
Running Time Lag Analysis...
Results saved to 'results/lag_analysis.txt'
IUCR Inconsistency Analysis...
Results saved to 'results/iucr_inconsistency_analysis.txt'
Ethical Aggregate Analysis...
The results were saved to 'results/ethical_aggregate_analysis.txt'

--- Data Analysis Complete ---
All the results were saved in 'results' folder.
