# Topic Modeling + Plots for Paper

In [24]:
import matplotlib.pyplot as plt
# Set Times New Roman as the default font family for all figure text.
plt.rcParams["font.family"] = "Times New Roman"

## Setup

In [44]:
import pandas as pd
from datetime import date
from pathlib import Path
import matplotlib.ticker as mtick
import numpy as np
from scipy.stats import entropy
import seaborn as sns
import opinionated  # noqa
import matplotlib.pyplot as plt
import os


import matplotlib.pyplot as plt



import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# City short code (the leading token of a meeting's short name) -> display name.
CITY_SHORTCODE_NAME_LUT = dict(
    AA="Ann Arbor, MI",
    RO="Royal Oak, MI",
    JS="Jackson, MI",
    LS="Lansing, MI",
    RCH="Richmond, VA",
    SEA="Seattle, WA",
    OAK="Oakland, CA",
)

# Short codes of the cities that are kept for analysis.
CITIES_OF_INTEREST = ["AA", "RO", "JS", "LS", "SEA", "OAK", "RCH"]

# Display names in the order used for plotting (ordered by population; the
# trailing comments carry the population figures).
CITY_ORDER = [
    CITY_SHORTCODE_NAME_LUT[short_code]
    for short_code in (
        "SEA",  # 737,015
        "OAK",  # 440,646
        "RCH",  # ...
        "AA",  # 123,851
        "LS",  # 112,644
        "RO",  # 58,211
        "JS",  # 31,309
    )
]

# Seed words for each approved topic, keyed by topic name.
#
# The keys are the approved topic names: rows whose `topic` value is not one of
# these keys are filtered out before plotting, so the keys must stay
# byte-identical (including the double space in "Education &  Services").
# NOTE(review): the double space presumably matches the labels stored in the
# topic CSV — verify before normalising it.
#
# Seed words are stored without surrounding whitespace ("accountability " and
# "group " previously carried a trailing space).
TOPIC_SEEDS = {
    "Housing": [
        "zoning",
        "construction",
        "redevelopment",
        "growth",
        "planning",
        "housing",
        "rent",
        "single family",
        "duplex",
        "apartment",
        "subdivision",
        "renting",
        "rental",
        "landlord",
        "tenant",
        "property",
    ],
    "Transportation": [
        "public transit",
        "traffic",
        "bus",
        "car",
        "bike lanes",
        "pedestrian",
        "parking",
        "crosswalk",
    ],
    "Law Enforcement": [
        "police",
        "crime",
        "emergency",
        "safety",
        "property",
        "theft",
        "violence",
        "gun",
        "PD",
    ],
    "Sustainability": [
        "climate",
        "green",
        "conservation",
        "energy",
        "solar",
        "carbon",
        "pollinators",
        "mow",
        "flood",
        "drought",
        "fire",
    ],
    "Homelessness": [
        "homeless",
        "eviction",
        "shelter",
        "outreach",
        "mental health",
        "substance abuse",
        "housing",
    ],
    "Recreation": [
        "parks",
        "outdoors",
        "community",
        "events",
        "greenspace",
        "tree",
        "playground",
    ],
    "Economic Development": [
        "business",
        "jobs",
        "tax",
        "revitalization",
        "store",
        "main street",
        "shops",
        "local",
        "cannabis",
    ],
    "Arts & Culture": [
        "events",
        "festivals",
        "museums",
        "performances",
        "sculpture",
        "public art",
        "mural",
        "art",
    ],
    "Education &  Services": [
        "schools",
        "libraries",
        "programs",
        "youth",
        "kids",
        "students",
        "teaching",
        "training",
        "games",
        "sports",
    ],
    "Civic Engagement": [
        "transparency",
        "public participation",
        "elections",
        "accountability",
        "mayor",
        "council",
    ],
    "Israel-Palestine": [
        "Israel",
        "Palestine",
        "genocide",
        "Hamas",
        "Jewish",
        "Muslim",
        "discrimination",
        "Gaza",
        "ceasefire",
    ],
    "Police Reform": [
        "accountability",
        "community oversight",
        "training",
        "defund",
        "reform",
        "police",
        "traffic stops",
        "cops",
        "law",
    ],
    "Utilities": [
        "water",
        "electricity",
        "sewage",
        "internet",
        "utilities",
        "services",
        "DTE",
        "waste",
        "outage",
        "disruption",
        "trees",
        "storm",
        "rates",
        "shutoffs",
        "recycling",
    ],
    "Community Organizing": [
        "community",
        "services",
        "access",
        "better",
        "organizing",
        "events",
        "accountability",
        "accountable",
        "help",
        "youth",
        "organization",
        "funding",
        "funds",
        "protect",
        "preserve",
        "group",
        "petition",
    ],
    "Urban Development": [
        "beautification",
        "historic projects",
        "district",
        "area",
        "history",
        "preservation",
        "development",
        "coliseum",
        "scenic",
        "holiday",
        "lights",
        "tourist",
    ],
}

# Directory whose "*.csv" annotation files are read in the data-cleaning cell.
ANNOTATIONS_DIR = Path("/home/shared/local_gov/assigned_back_data_new/").resolve()


In [45]:
# Absolute path (resolved against the current working directory) of the CSV
# holding the comments together with their assigned topics.
FULL_TOPIC_CLASSIFIED_COMMENTS_PATH = Path(
    "full-comment-data-with-topics.csv"
).resolve()

In [46]:
# Load the topic-classified comments through the shared path constant so the
# file location is defined in exactly one place.
topics = pd.read_csv(FULL_TOPIC_CLASSIFIED_COMMENTS_PATH)

In [51]:
from matplotlib import rc

In [53]:
# Three-colour palette: written as 0-255 RGB rows and rescaled to floats in
# the 0-1 range.
COLORBREWER_PALETTE = (
    np.array(
        [
            (27, 158, 119),  # green
            (217, 95, 2),  # orange
            (117, 112, 179),  # purple
        ]
    )
    / 255
)
# Register the palette with seaborn so later plots pick it up by default.
sns.set_palette(COLORBREWER_PALETTE)

## Data Cleaning and Subsetting

In [54]:
# Accumulator for the per-file DataFrames; the loading loop below appends one
# frame per annotation CSV and the frames are concatenated afterwards.
data_dfs = []


def split_short_name_to_city_and_date(short_name: str) -> tuple[str, date]:
    """Parse a meeting short name into its city short code and meeting date.

    The name is split on underscores. The first token is the city short
    code, the second and third tokens are the month and day, and the last
    token is a two-digit year that is prefixed with "20".

    Returns:
        A ``(short_code, event_date)`` tuple.
    """
    tokens = short_name.split("_")
    city_code = tokens[0]

    # Year is parsed first, then month, then day (all as integers)
    year_number = int("20" + tokens[-1])
    month_number = int(tokens[1])
    day_number = int(tokens[2])

    return city_code, date(year_number, month_number, day_number)


def _load_annotation_file(filepath: Path) -> pd.DataFrame:
    """Read one annotation CSV and attach city, date and dataset-portion columns.

    Column names are lowercased and spaces are replaced with underscores.
    The "name" column is parsed with ``split_short_name_to_city_and_date``
    into "city_short_code" and "date"; "city_name" is looked up from
    ``CITY_SHORTCODE_NAME_LUT``; "dataset_portion" is the last
    underscore-separated token of the file stem.
    """
    df = pd.read_csv(filepath)

    # Normalise column names: lowercase, "_" instead of spaces
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # Parse every short name once; building the two columns from the parsed
    # pairs also works for a file without rows (unpacking zip(*...) would not).
    parsed_names = [split_short_name_to_city_and_date(name) for name in df["name"]]
    df["city_short_code"] = [short_code for short_code, _ in parsed_names]
    df["date"] = [event_date for _, event_date in parsed_names]

    # Add the city name
    df["city_name"] = df["city_short_code"].map(CITY_SHORTCODE_NAME_LUT)

    # The filename's last token marks which portion of the dataset this is
    df["dataset_portion"] = filepath.stem.split("_")[-1]

    return df


# Read all data; sorting the paths makes the row order independent of the
# order in which the filesystem happens to list the files
for filepath in sorted(ANNOTATIONS_DIR.glob("*.csv")):
    data_dfs.append(_load_annotation_file(filepath))

# Concatenate every file's rows (all dataset portions) under a fresh,
# unique index
full_data = pd.concat(data_dfs, ignore_index=True)

# Replace dataset portion with standard names
full_data["dataset_portion"] = full_data["dataset_portion"].replace(
    {"truth": "Training", "pred": "Inferred", "val": "Validation"}
)

# Subset the data to only the columns we care about
full_data = full_data[
    [
        "city_short_code",
        "city_name",
        "date",
        "dataset_portion",
        "meeting_section",
        "speaker_role",
        "start",
        "end",
        "model_pred_public_comment",
    ]
]

# Filter to only the cities of interest
full_data = full_data[full_data["city_short_code"].isin(CITIES_OF_INTEREST)]

#full_data.sample(3)

In [55]:
full_data['dataset_portion'].unique()

array(['Training', 'Validation', 'Inferred'], dtype=object)

## Dataset Distribution Metrics

In [56]:
def get_frac(df):
    """Return the share of a city's "Inferred" rows that are public comments.

    Args:
        df: Rows for a single city. Must have the columns "city_short_code",
            "dataset_portion", "meeting_section" and "speaker_role".

    Returns:
        The number of "Inferred" rows whose meeting section is
        "Public Comment" and whose speaker role is "Commenter", divided by the
        number of "Inferred" rows. ``pd.NA`` is returned for the "RCH" city,
        for an empty frame, and when the city has no "Inferred" rows (instead
        of failing with a division by zero).
    """
    if df.empty:
        return pd.NA

    # "RCH" is excluded from this statistic
    if df["city_short_code"].iloc[0] == "RCH":
        return pd.NA

    inferred_rows = df[df["dataset_portion"] == "Inferred"]
    if inferred_rows.empty:
        return pd.NA

    is_public_comment = (inferred_rows["meeting_section"] == "Public Comment") & (
        inferred_rows["speaker_role"] == "Commenter"
    )
    return int(is_public_comment.sum()) / inferred_rows.shape[0]
# Apply get_frac to each city's rows and average the per-city results.
# NOTE(review): get_frac reads the grouping column "city_short_code" from each
# group; recent pandas releases deprecate passing grouping columns to
# GroupBy.apply — confirm this still works before upgrading pandas.
full_data.groupby('city_short_code').apply(get_frac).mean()

0.09624591978670455

In [57]:
# Compute metrics on a per-meeting basis (city-name + date tuple)

# Public-comment totals per city, computed once up front instead of once per
# meeting. A "public comment" is a row in the "Public Comment" meeting section
# spoken by a "Commenter".
is_public_comment = (full_data["meeting_section"] == "Public Comment") & (
    full_data["speaker_role"] == "Commenter"
)
city_public_comment_totals = full_data[is_public_comment].groupby("city_name").size()

metrics_list = []
for (city_name, meeting_date), group in full_data.groupby(["city_name", "date"]):
    # `group` already holds exactly this meeting's rows, so count within it
    # rather than re-filtering the whole dataset
    public_comments_count = int(
        (
            (group["meeting_section"] == "Public Comment")
            & (group["speaker_role"] == "Commenter")
        ).sum()
    )

    # Total public comments for this city (0 when the city has none)
    total_city_public_comments_count = int(
        city_public_comment_totals.get(city_name, 0)
    )

    # Share of the city's public comments made at this meeting; NaN when the
    # city has no public comments at all (avoids a division by zero)
    if total_city_public_comments_count:
        percent_of_public_comments = (
            public_comments_count / total_city_public_comments_count
        ) * 100
    else:
        percent_of_public_comments = float("nan")

    # Total utterances for the meeting (every group has at least one row)
    utterances_count = len(group)

    # Percent of the meeting's utterances that are public comment
    percent_of_utterances = (public_comments_count / utterances_count) * 100

    # Add to the list
    metrics_list.append(
        {
            "city_name": city_name,
            "Date": meeting_date,
            "Dataset Portion": group["dataset_portion"].iloc[0],
            "Public Comments": public_comments_count,
            "percent_of_total_comments": percent_of_public_comments,
            "Utterances": utterances_count,
            "percent_of_meeting_utterances": percent_of_utterances,
        }
    )

# Convert to dataframe, ordered chronologically
per_meeting_metrics_df = pd.DataFrame(metrics_list)
per_meeting_metrics_df = per_meeting_metrics_df.sort_values("Date", ascending=True)

## Topic Modeling

In [62]:
# Load the comments with their topics, keeping only rows whose topic is one of
# the approved topic names (the keys of TOPIC_SEEDS)
plotting_comments = pd.read_csv(FULL_TOPIC_CLASSIFIED_COMMENTS_PATH).loc[
    lambda comments: comments["topic"].isin(TOPIC_SEEDS.keys())
]

# Nine-step heatmap palette running from pale yellow to dark brown: written as
# 0-255 RGB rows, rescaled to 0-1 floats and stored as a plain nested list
heatmap_palette = (
    np.array(
        [
            (255, 255, 229),
            (255, 247, 188),
            (254, 227, 145),
            (254, 196, 79),
            (254, 153, 41),
            (236, 112, 20),
            (204, 76, 2),
            (153, 52, 4),
            (102, 37, 6),
        ]
    )
    / 255
).tolist()

### Topic Distribution of Comments Overall

In [1]:
# Use a serif font stack led by Times New Roman. Any existing entry is dropped
# first so that re-running this cell does not keep growing the list.
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = ["Times New Roman"] + [
    font for font in plt.rcParams["font.serif"] if font != "Times New Roman"
]

# Count comments per (city, topic) pair
topic_city_counts = (
    plotting_comments.groupby(["city_name", "topic"])
    .size()
    .reset_index(name="comment_count")
)

# Express each count as a percentage of that city's comments
topic_city_counts["comment_pct"] = (
    topic_city_counts["comment_count"]
    / topic_city_counts.groupby("city_name")["comment_count"].transform("sum")
) * 100

# Add a zero row for every (city, topic) pair that has no comments, so the
# heatmap shows the full grid. The missing pairs are collected first and
# appended with a single concat instead of one concat per pair.
observed_pairs = set(zip(topic_city_counts["city_name"], topic_city_counts["topic"]))
missing_rows = [
    {"city_name": city, "topic": topic, "comment_count": 0, "comment_pct": 0}
    for city in CITY_ORDER
    for topic in TOPIC_SEEDS
    if (city, topic) not in observed_pairs
]
if missing_rows:
    topic_city_counts = pd.concat(
        [topic_city_counts, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Always order data by population
topic_city_counts["city_name"] = pd.Categorical(
    topic_city_counts["city_name"], categories=CITY_ORDER, ordered=True
)
fig = plt.figure(figsize=(12.0, 8.0))
# Heatmap of topic percentage per city (topics as rows, cities as columns)
ax = sns.heatmap(
    data=topic_city_counts.pivot(
        index="topic", columns="city_name", values="comment_pct"
    ),
    annot=True,
    fmt=".1f",
    cmap=heatmap_palette,
    cbar_kws={"format": mtick.PercentFormatter(decimals=0)},
    annot_kws={"fontsize": 18},
)

# Remove x and y axis labels
ax.set_xlabel("")
ax.set_ylabel("")

_ = plt.xticks(rotation=45, ha="right")

plt.tight_layout()
#Uncomment to save
#plt.savefig("overall_topics.pdf", format="pdf", dpi=1000)

NameError: name 'plt' is not defined

### Topic Distribution of Inferred (True) Comments

In [3]:
# Use a serif font stack led by Times New Roman. Any existing entry is dropped
# first so that re-running this cell does not keep growing the list.
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = ["Times New Roman"] + [
    font for font in plt.rcParams["font.serif"] if font != "Times New Roman"
]

# Keep only the comments from the "Inferred" portion of the dataset
inferred_true_comments = plotting_comments[
    plotting_comments["dataset_portion"] == "Inferred"
]

# Count comments per (city, topic) pair
topic_city_counts = (
    inferred_true_comments.groupby(["city_name", "topic"])
    .size()
    .reset_index(name="comment_count")
)

# Express each count as a percentage of that city's comments
topic_city_counts["comment_pct"] = (
    topic_city_counts["comment_count"]
    / topic_city_counts.groupby("city_name")["comment_count"].transform("sum")
) * 100

# Add a zero row for every (city, topic) pair that has no comments, so the
# heatmap shows the full grid. The missing pairs are collected first and
# appended with a single concat instead of one concat per pair.
observed_pairs = set(zip(topic_city_counts["city_name"], topic_city_counts["topic"]))
missing_rows = [
    {"city_name": city, "topic": topic, "comment_count": 0, "comment_pct": 0}
    for city in CITY_ORDER
    for topic in TOPIC_SEEDS
    if (city, topic) not in observed_pairs
]
if missing_rows:
    topic_city_counts = pd.concat(
        [topic_city_counts, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Always order data by population
topic_city_counts["city_name"] = pd.Categorical(
    topic_city_counts["city_name"], categories=CITY_ORDER, ordered=True
)
fig = plt.figure(figsize=(12.0, 8.0))
# Heatmap of topic percentage per city (topics as rows, cities as columns)
ax = sns.heatmap(
    data=topic_city_counts.pivot(
        index="topic", columns="city_name", values="comment_pct"
    ),
    annot=True,
    fmt=".1f",
    cmap=heatmap_palette,
    cbar_kws={"format": mtick.PercentFormatter(decimals=0)},
    annot_kws={"fontsize": 18},
)

# Remove x and y axis labels
ax.set_xlabel("")
ax.set_ylabel("")

_ = plt.xticks(rotation=45, ha="right")

plt.tight_layout()
#Uncomment to save
#plt.savefig("topics_ground_truth.pdf", format="pdf", dpi=1000)

NameError: name 'plt' is not defined

### Topic Distribution of Inferred (Inferred) Comments

In [4]:
# Use a serif font stack led by Times New Roman. Any existing entry is dropped
# first so that re-running this cell does not keep growing the list.
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = ["Times New Roman"] + [
    font for font in plt.rcParams["font.serif"] if font != "Times New Roman"
]

# Join the "Inferred" topic rows back with the original data to pick up the
# model's public-comment prediction. The join is on the city name plus the
# start and end times.
# NOTE(review): the join keys omit the meeting date, so rows from different
# meetings of the same city that share identical start/end values would match
# each other and duplicate rows — confirm start/end are unique per city, or
# add the date to the keys.
inferred_inferred_comments = plotting_comments[
    plotting_comments["dataset_portion"] == "Inferred"
].merge(
    full_data[["city_name", "start", "end", "model_pred_public_comment"]],
    on=["city_name", "start", "end"],
)

# Select down to only the comments that were predicted to be public comments
inferred_inferred_comments = inferred_inferred_comments[
    inferred_inferred_comments["model_pred_public_comment"] == 1
]

# Remove non-approved topics
inferred_inferred_comments = inferred_inferred_comments[
    inferred_inferred_comments["topic"].isin(TOPIC_SEEDS.keys())
]

# Count comments per (city, topic) pair
topic_city_counts = (
    inferred_inferred_comments.groupby(["city_name", "topic"])
    .size()
    .reset_index(name="comment_count")
)

# Express each count as a percentage of that city's comments
topic_city_counts["comment_pct"] = (
    topic_city_counts["comment_count"]
    / topic_city_counts.groupby("city_name")["comment_count"].transform("sum")
) * 100

# Add a zero row for every (city, topic) pair that has no comments, so the
# heatmap shows the full grid. The missing pairs are collected first and
# appended with a single concat instead of one concat per pair.
observed_pairs = set(zip(topic_city_counts["city_name"], topic_city_counts["topic"]))
missing_rows = [
    {"city_name": city, "topic": topic, "comment_count": 0, "comment_pct": 0}
    for city in CITY_ORDER
    for topic in TOPIC_SEEDS
    if (city, topic) not in observed_pairs
]
if missing_rows:
    topic_city_counts = pd.concat(
        [topic_city_counts, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Always order data by population
topic_city_counts["city_name"] = pd.Categorical(
    topic_city_counts["city_name"], categories=CITY_ORDER, ordered=True
)
fig = plt.figure(figsize=(12.0, 8.0))
# Heatmap of topic percentage per city (topics as rows, cities as columns)
ax = sns.heatmap(
    data=topic_city_counts.pivot(
        index="topic", columns="city_name", values="comment_pct"
    ),
    annot=True,
    fmt=".1f",
    cmap=heatmap_palette,
    cbar_kws={"format": mtick.PercentFormatter(decimals=0)},
    annot_kws={"fontsize": 18},
)

# Remove x and y axis labels
ax.set_xlabel("")
ax.set_ylabel("")

_ = plt.xticks(rotation=45, ha="right")

plt.tight_layout()

#plt.savefig("topics_inferred.pdf", format="pdf", dpi=1000)

NameError: name 'plt' is not defined