In [None]:
import os
import pandas as pd
from tqdm import tqdm
def read_city_of_london_street_csvs(folder_path, file_pattern):
    """
    Recursively reads every CSV file under ``folder_path`` whose filename contains
    ``file_pattern`` and concatenates them into a single pandas DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible across runs and machines (``os.walk`` order is arbitrary).

    Args:
        folder_path (str): Root folder to search; sub-folders are searched too.
        file_pattern (str): Substring that a filename must contain to be read
            (e.g. "street", "outcomes", "stop-and-search").

    Returns:
        pd.DataFrame: All matching CSVs stacked with a fresh 0..n-1 index, or an
        empty DataFrame when no file matches.
    """
    # Collect the matching paths first: this gives the progress bar a known
    # total and lets us impose a deterministic read order.
    matching_paths = sorted(
        os.path.join(root, filename)
        for root, _, files in os.walk(folder_path)
        for filename in files
        if file_pattern in filename and filename.endswith(".csv")
    )

    dataframes = [
        pd.read_csv(file_path)
        for file_path in tqdm(matching_paths, desc="Reading CSV files")
    ]

    return (
        pd.concat(dataframes, ignore_index=True)
        if dataframes
        else pd.DataFrame()
    )

# Load every CSV under data/relevant_data whose filename contains "street" into one frame.
combined_df = read_city_of_london_street_csvs(os.path.join("data", "relevant_data"), "street")

In [None]:
# Inspect column dtypes, non-null counts and memory use of the freshly loaded frame.
combined_df.info()

In [None]:
# Keep burglaries only; .copy() detaches the subset so the column assignments below are safe.
combined_df = combined_df.query("`Crime type` == 'Burglary'").copy()

# Split "Month" on "-" into two parts and store them as numeric year / month columns.
year_month_parts = combined_df['Month'].str.split('-', expand=True)
combined_df[['year', 'month']] = year_month_parts
for date_part in ('year', 'month'):
    combined_df[date_part] = pd.to_numeric(combined_df[date_part], errors='coerce', downcast='integer')

# Coordinates: unparseable values become NaN and the floats are downcast.
for coordinate in ("Latitude", "Longitude"):
    combined_df[coordinate] = pd.to_numeric(combined_df[coordinate], errors='coerce', downcast='float')

# Text columns are stored as categoricals (Location, LSOA code and the outcome
# category are technically free strings).
for text_column in ("Reported by", "Falls within", "Location", "LSOA code", "Last outcome category"):
    combined_df[text_column] = combined_df[text_column].astype("category")

# LSOA code functionally determines the LSOA name: keep the distinct pairs in a
# lookup table before the name column is dropped from the main frame.
lookup = combined_df[["LSOA code", "LSOA name"]].drop_duplicates().reset_index(drop=True)
combined_df = combined_df.drop(columns=['Month', "LSOA name", "Context", "Crime type"], errors='ignore')

In [None]:
# Re-check dtypes and memory use after the type conversions.
combined_df.info()

In [None]:
# Percentage of missing values per column, sorted ascending.
combined_df.isna().sum().sort_values() / len(combined_df) * 100

In [None]:
# Persist the cleaned street-level frame. The output folder is created first
# so the write also succeeds on a fresh checkout where it does not exist yet.
street_parquet_path = os.path.join("data", "processed_data", "street.parquet")
os.makedirs(os.path.dirname(street_parquet_path), exist_ok=True)
combined_df.to_parquet(
    street_parquet_path,
    index=False,
    engine="pyarrow"
)

# Outcomes

In [None]:
# Load every CSV under data/relevant_data whose filename contains "outcomes" into one frame.
outcomes = read_city_of_london_street_csvs(os.path.join("data", "relevant_data"), "outcomes")


In [None]:
# Inspect column dtypes, non-null counts and memory use of the freshly loaded frame.
outcomes.info()

In [None]:
# Keep only the outcomes that belong to a crime present in combined_df.
# Missing IDs are removed from the key set first: isin() treats NaN as equal
# to NaN, so a single crime without an ID would otherwise pull in every
# outcome row whose Crime ID is missing.
burglary_crime_ids = combined_df["Crime ID"].dropna().unique()
outcomes = outcomes[outcomes["Crime ID"].isin(burglary_crime_ids)].copy()

# Split "Month" on "-" into two parts and store them as numeric year / month columns.
outcomes[['year', 'month']] = outcomes['Month'].str.split('-', expand=True)
for date_part in ('year', 'month'):
    outcomes[date_part] = pd.to_numeric(outcomes[date_part], errors='coerce', downcast='integer')

# Coordinates: unparseable values become NaN and the floats are downcast.
for coordinate in ("Latitude", "Longitude"):
    outcomes[coordinate] = pd.to_numeric(outcomes[coordinate], errors='coerce', downcast='float')

# Text columns are stored as categoricals (Location, LSOA code and the outcome
# type are technically free strings).
for text_column in ("Reported by", "Falls within", "Location", "LSOA code", "Outcome type"):
    outcomes[text_column] = outcomes[text_column].astype("category")

# LSOA code functionally determines the LSOA name, so the name column is redundant here.
outcomes = outcomes.drop(columns=['Month', "LSOA name"], errors='ignore')

In [None]:
# Re-check dtypes and memory use after the type conversions.
outcomes.info()

In [None]:
# Percentage of missing values per column, sorted ascending.
outcomes.isna().sum().sort_values() / len(outcomes) * 100

In [None]:
# Persist the cleaned outcomes frame. The output folder is created first so
# the write also succeeds on a fresh checkout where it does not exist yet.
outcomes_parquet_path = os.path.join("data", "processed_data", "outcomes.parquet")
os.makedirs(os.path.dirname(outcomes_parquet_path), exist_ok=True)
outcomes.to_parquet(
    outcomes_parquet_path,
    index=False,
    engine="pyarrow"
)

# Search

In [None]:
# Load every CSV under data/relevant_data whose filename contains "stop-and-search" into one frame.
search = read_city_of_london_street_csvs(os.path.join("data", "relevant_data"), "stop-and-search")

In [None]:
# Inspect column dtypes, non-null counts and memory use of the freshly loaded frame.
search.info()

In [None]:
search["Type"] = search["Type"].astype("category")
# Unparseable dates / coordinates become NaT / NaN instead of raising.
search["Date"] = pd.to_datetime(search["Date"], errors='coerce')
search["Latitude"] = pd.to_numeric(search["Latitude"], errors='coerce', downcast='float')
search["Longitude"] = pd.to_numeric(search["Longitude"], errors='coerce', downcast='float')
# Fixed: Gender was previously overwritten with the values of the "Type" column.
search["Gender"] = search["Gender"].astype("category")
# Ordered categorical so age bands sort and compare in their natural order;
# values outside the listed bands become NaN.
search["Age range"] = pd.Categorical(search["Age range"],
                                 categories=["under 10", "10-17", "18-24", "25-34", "over 34"],
                                 ordered=True)
search["Self-defined ethnicity"] = search["Self-defined ethnicity"].astype("category")
search["Officer-defined ethnicity"] = search["Officer-defined ethnicity"].astype("category")
search["Legislation"] = search["Legislation"].astype("category")
search["Object of search"] = search["Object of search"].astype("category")
search["Outcome"] = search["Outcome"].astype("category")
# Nullable "boolean" keeps missing values as <NA>; a plain "bool" cast would
# silently turn every NaN into True.
search["Outcome linked to object of search"] = search["Outcome linked to object of search"].astype("boolean")
# Derive two flags from the search type text before "Type" is dropped;
# a missing type counts as neither.
search["Person search"] = search["Type"].str.contains("Person", na=False).astype("bool")
search["Vehicle search"] = search["Type"].str.contains("Vehicle", na=False).astype("bool")
search = search.drop(columns=["Policing operation", "Type"], errors='ignore')

In [None]:
# Re-check dtypes and memory use after the type conversions.
search.info()

In [None]:
# Percentage of missing values per column, sorted ascending.
search.isna().sum().sort_values() / len(search) * 100

In [None]:
# Persist the cleaned stop-and-search frame. The output folder is created
# first so the write also succeeds on a fresh checkout where it does not exist yet.
search_parquet_path = os.path.join("data", "processed_data", "search.parquet")
os.makedirs(os.path.dirname(search_parquet_path), exist_ok=True)
search.to_parquet(
    search_parquet_path,
    index=False,
    engine="pyarrow"
)

# To database

In [None]:
import pandas as pd
import sqlite3

def _enum_column_sql(frame, column, sql_name, max_values=200):
    """Return the SQLite column definition for an enum-like DataFrame column.

    SQLite has no ENUM type, so the allowed values are enforced with a CHECK
    constraint on a TEXT column. NULLs still pass the constraint. The plain
    ``"<sql_name> TEXT"`` form is returned when the column is missing from
    ``frame``, has no values, or has more than ``max_values`` distinct values
    (free-text columns would otherwise produce an enormous constraint).
    """
    if column not in frame.columns:
        return f"{sql_name} TEXT"

    series = frame[column]
    if isinstance(series.dtype, pd.CategoricalDtype):
        # Keep declared-but-unused categories and their declared order.
        values = series.cat.categories.tolist()
    else:
        values = sorted(series.dropna().astype(str).unique().tolist())

    if not values or len(values) > max_values:
        return f"{sql_name} TEXT"

    # DDL cannot use bound parameters, so quote literals by doubling any
    # embedded apostrophe (e.g. "St Paul's").
    literals = ", ".join("'" + str(value).replace("'", "''") + "'" for value in values)
    return f"{sql_name} TEXT CHECK ({sql_name} IN ({literals}))"


def _create_table_sql(table_name, column_definitions):
    """Assemble a ``CREATE TABLE IF NOT EXISTS`` statement from column definitions."""
    columns = ",\n        ".join(column_definitions)
    return f"CREATE TABLE IF NOT EXISTS {table_name} (\n        {columns}\n    );"


def create_database_from_dataframes(crimes_df, outcomes_df, stop_search_df, db_path="police_data.db"):
    """
    Creates an SQLite database with three empty tables (crimes, outcomes,
    stop_search) whose enum-like columns are constrained to the values found
    in the given DataFrames. Each table gets an auto-incremented primary key.

    Only the schema is created; no rows are inserted. The DataFrame column
    names (e.g. "Crime ID") differ from the table column names (crime_id), so
    the frames need renaming before they can be appended with ``to_sql``.

    Schema notes:
        * SQLite has no ENUM type: enum-like columns are TEXT with a CHECK
          constraint listing the allowed values.
        * SQLite has no GEOGRAPHY type: the point location is stored as two
          REAL columns, ``latitude`` and ``longitude``.

    Args:
        crimes_df (pd.DataFrame): DataFrame containing crimes.
        outcomes_df (pd.DataFrame): DataFrame containing outcomes.
        stop_search_df (pd.DataFrame): DataFrame containing stop and search records.
        db_path (str): Path where the SQLite database will be created.
    """
    # Build all statements before touching the file so that a problem with the
    # input frames does not leave a half-initialised database behind.
    crimes_sql = _create_table_sql("crimes", [
        "id INTEGER PRIMARY KEY AUTOINCREMENT",
        "crime_id CHARACTER(60)",
        _enum_column_sql(crimes_df, "Reported by", "reported_by"),
        _enum_column_sql(crimes_df, "Falls within", "falls_within"),
        "latitude REAL",
        "longitude REAL",
        _enum_column_sql(crimes_df, "Location", "location"),
        "lsoa_code CHARACTER(9)",
        _enum_column_sql(crimes_df, "Last outcome category", "last_outcome_category"),
        "year SMALLINT",
        "month TINYINT",
    ])

    outcomes_sql = _create_table_sql("outcomes", [
        "id INTEGER PRIMARY KEY AUTOINCREMENT",
        "crime_id CHARACTER(60)",
        _enum_column_sql(outcomes_df, "Reported by", "reported_by"),
        _enum_column_sql(outcomes_df, "Falls within", "falls_within"),
        "latitude REAL",
        "longitude REAL",
        _enum_column_sql(outcomes_df, "Location", "location"),
        "lsoa_code CHAR(9)",
        _enum_column_sql(outcomes_df, "Outcome type", "outcome_type"),
        "year SMALLINT",
        "month TINYINT",
    ])

    stop_search_sql = _create_table_sql("stop_search", [
        "id INTEGER PRIMARY KEY AUTOINCREMENT",
        "date TIMESTAMP",
        "part_of_policing_operation BOOLEAN",
        "latitude REAL",
        "longitude REAL",
        "person_search BOOLEAN",
        "vehicle_search BOOLEAN",
        _enum_column_sql(stop_search_df, "Gender", "gender"),
        _enum_column_sql(stop_search_df, "Age range", "age_range"),
        _enum_column_sql(stop_search_df, "Self-defined ethnicity", "self_defined_ethnicity"),
        _enum_column_sql(stop_search_df, "Officer-defined ethnicity", "officer_defined_ethnicity"),
        _enum_column_sql(stop_search_df, "Legislation", "legislation"),
        _enum_column_sql(stop_search_df, "Object of search", "object_of_search"),
        _enum_column_sql(stop_search_df, "Outcome", "outcome"),
        "outcome_linked_to_object_of_search BOOLEAN",
        "removal_of_more_than_outer_clothing TEXT",
    ])

    # Connect to SQLite database (will create if it doesn't exist)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for statement in (crimes_sql, outcomes_sql, stop_search_sql):
            cursor.execute(statement)
        conn.commit()
    finally:
        # Always release the file handle, even when a statement fails.
        conn.close()

# Build the database at the default db_path ("police_data.db") from the three cleaned frames.
create_database_from_dataframes(combined_df, outcomes, search)

In [None]:
# Preview the quoted, comma-separated category list of one column.
# Fixed: "Falls Wi" is not a column name and raised a KeyError.
', '.join(f"'{cat}'" for cat in combined_df["Falls within"].cat.categories.tolist())

In [None]:
# Debug preview: render a crimes CREATE TABLE text with the category lists of
# combined_df interpolated, to inspect the generated string. Nothing is executed.
print(f"""
    CREATE TABLE IF NOT EXISTS crimes (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        crime_id CHARACTER(60),
        reported_by {', '.join(f"'{cat}'" for cat in combined_df["Reported by"].cat.categories.tolist())},
        falls_within {', '.join(f"'{cat}'" for cat in combined_df["Falls within"].cat.categories.tolist())},
        point_location GEOGRAPHY(POINT, 4326),
        location {', '.join(f"'{cat}'" for cat in combined_df["Location"].cat.categories.tolist())},
        lsoa_code CHARACTER(9),
        last_outcome_category {', '.join(f"'{cat}'" for cat in combined_df["Last outcome category"].cat.categories.tolist())},
        year SMALLINT,
        month TINYINT
    );
    """)