# Dashboard

In [57]:
import pandas as pd

# Load the per-post table from sports_titles_with_topics.csv
df = pd.read_csv("sports_titles_with_topics.csv")

# Preview the first five rows
df.head(5)

Unnamed: 0,subreddit,upvotes,num_comments,created_utc,id,month,clean_title,dominant_topic
0,sports,142825,2372,2017-12-16 13:24:22,7k6zaz,2017-12,weightlifter promised wife win olympic gold me...,14
1,sports,110244,2772,2019-01-21 21:14:03,aif0h8,2019-01,ram assistant coach whose job make sure head c...,11
2,sports,104147,2092,2018-01-29 17:06:07,7ttqeu,2018-01,pelican fan snuck court warmups stretched put ...,13
3,sports,102686,1696,2020-11-10 18:53:30,jrqxz3,2020-11,jon rahm skip ball across pond,13
4,sports,102038,1569,2018-02-28 17:18:55,80xq68,2018-02,stay done tonight,12


In [58]:
# Load the per-subreddit topic importance table
df_topics = pd.read_csv("subreddit_topic_importance.csv")

# Preview the first five rows
df_topics.head(5)

Unnamed: 0,subreddit,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,CFB,0.046236,0.034767,0.061397,0.032813,0.038321,0.058231,0.038156,0.065722,0.048521,...,0.036519,0.063164,0.045773,0.062904,0.0629,0.083292,0.034039,0.035433,0.058218,0.059393
1,baseball,0.057579,0.035064,0.04844,0.040794,0.047658,0.072587,0.048682,0.059048,0.076136,...,0.033223,0.047246,0.047795,0.074065,0.044742,0.032853,0.038046,0.040083,0.065913,0.052814
2,formula1,0.067331,0.039944,0.050795,0.030731,0.061307,0.053969,0.060071,0.055326,0.041595,...,0.05863,0.052619,0.048923,0.055594,0.041844,0.030152,0.050066,0.042894,0.055885,0.060543
3,hockey,0.066856,0.03443,0.046387,0.037133,0.037892,0.070846,0.056118,0.059373,0.035195,...,0.036133,0.049185,0.044212,0.070839,0.055174,0.033455,0.040407,0.042832,0.071008,0.055077
4,mma,0.042971,0.043195,0.053467,0.037987,0.049,0.052911,0.049164,0.058757,0.033506,...,0.037982,0.045023,0.049766,0.05993,0.046902,0.029807,0.043225,0.045563,0.069962,0.110784


The next cell preprocesses the data that the dashboard uses later.

In [59]:
import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output
from wordcloud import WordCloud
import base64
from io import BytesIO
import datetime

# Read both input tables from disk
df = pd.read_csv("sports_titles_with_topics.csv")
df_topics = pd.read_csv("subreddit_topic_importance.csv")

# Parse the creation timestamp, then derive a plain calendar date from it
df = df.assign(created_utc=lambda frame: pd.to_datetime(frame["created_utc"]))
df = df.assign(date=lambda frame: frame["created_utc"].dt.date)

# Alphabetically ordered list of the distinct subreddits
subreddits = df["subreddit"].unique().tolist()
subreddits.sort()



# Athlete surnames tracked in the dashboard's mentions chart, three per group.
# The dict keys are only descriptive labels; insertion order fixes the order of `names`.
_names_by_group = {
    "NBA": ("lebron", "curry", "durant"),
    "Baseball": ("ohtani", "judge", "trout"),
    "Hockey": ("mcdavid", "crosby", "ovechkin"),
    "CFB (College Football)": ("sanders", "penix", "nix"),
    "Soccer": ("messi", "ronaldo", "neymar"),
    "NFL": ("mahomes", "burrow", "kelce"),
    "Iconic Sports Figures": ("jordan", "ali", "bolt"),
    "MMA": ("mcgregor", "khabib", "jones"),
    "Tennis": ("nadal", "federer", "djokovic"),
    "Formula 1": ("verstappen", "hamilton", "leclerc"),
}

# Flatten the groups into the single list the rest of the notebook consumes
names = []
for _group_members in _names_by_group.values():
    names.extend(_group_members)

# Create DataFrame with the evolution of mentions over time.
# Result columns: "month", "mentions", "name" (one row per month per tracked name).
#
# The per-name frames are collected in a list and concatenated ONCE at the end;
# calling pd.concat inside the loop re-copies the accumulated frame on every pass.
#
# NOTE: matching is a case-insensitive *substring* test, so a short name such as
# "ali" also counts titles that merely contain those letters inside another word.
monthly_frames = []

for name in names:
    mention_col = f"mention_{name}"
    # 1 if the cleaned title contains the name, else 0 (missing titles count as 0)
    df[mention_col] = df["clean_title"].str.contains(name, case=False, na=False).astype(int)
    monthly_frames.append(
        df.groupby("month")[mention_col]
        .sum()
        .reset_index(name="mentions")
        .assign(name=name)
    )

if monthly_frames:
    evolution = pd.concat(monthly_frames, ignore_index=True)
else:
    # No names to track: keep an empty frame with the columns the chart expects
    evolution = pd.DataFrame(columns=["month", "mentions", "name"])

Now we create a dictionary of important events for each sport, which is used later in the dashboard.

In [60]:
# Dictionary of important events by sport.
#
# Keys are compared against df["subreddit"] by the event word-cloud callback, so
# they must be spelled exactly like the subreddit values in the data (e.g. 'CFB').
# Each event is {'name': <dropdown label>, 'date': <ISO 'YYYY-MM-DD' string>};
# the date string is used as the dropdown value and parsed with pd.to_datetime.
important_events = {
    'nba': [
        {'name': 'Luka Doncic trade to Lakers', 'date': '2025-02-02'},
        {'name': 'NBA Finals 2019: Raptors Champions', 'date': '2019-06-13'},
        {'name': 'Kobe Bryant Death', 'date': '2020-01-26'},
        {'name': 'NBA Finals 2020: Lakers Champions in the Bubble', 'date': '2020-10-11'},
        {'name': 'James Harden Trade to the Nets', 'date': '2021-01-14'},
        {'name': 'NBA Finals 2021: Bucks Champions', 'date': '2021-07-20'},
        {'name': 'Damian Lillard Trade to the Bucks', 'date': '2023-09-27'}
    ],
    'baseball': [
        {'name': 'World Series 2019: Nationals Champions', 'date': '2019-10-30'},
        {'name': 'World Series 2020: Dodgers Champions', 'date': '2020-10-27'},
        {'name': 'Shohei Ohtani Debut as 2-Way Star', 'date': '2018-03-29'},
        {'name': 'Ichiro Suzuki Retirement', 'date': '2019-03-21'},
        {'name': 'Juan Soto Trade to Padres', 'date': '2022-08-02'},
        {'name': 'Aaron Judge Breaks HR Record in AL', 'date': '2022-10-04'},
        {'name': 'World Series 2021: Braves Champions', 'date': '2021-11-02'},
        {'name': 'World Series 2022: Astros Champions', 'date': '2022-11-05'},
        {'name': 'World Series 2023: Rangers Champions', 'date': '2023-11-01'},
        {'name': 'Shohei Ohtani Signs with Dodgers for $700M', 'date': '2023-12-11'}
    ],
    'hockey': [
        {'name': 'Stanley Cup Final 2019: Blues Champions', 'date': '2019-06-12'},
        {'name': 'Stanley Cup Final 2020: Lightning Champions', 'date': '2020-09-28'},
        {'name': 'Stanley Cup Final 2021: Lightning Repeat Title', 'date': '2021-07-07'},
        # NOTE(review): 2020-09-30 looks like the Rangers buyout date; the retirement
        # announcement is believed to be 2021-08-20 — confirm which event is intended.
        {'name': 'Henrik Lundqvist Retirement', 'date': '2020-09-30'},
        {'name': 'Stanley Cup 2022: Avalanche Champions', 'date': '2022-06-26'},
        {'name': 'Stanley Cup 2023: Golden Knights Champions', 'date': '2023-06-13'},
        {'name': 'Connor Bedard Draft', 'date': '2023-06-28'},
        {'name': 'Jack Eichel Traded to Vegas', 'date': '2021-11-04'},
        # Fixed: the 2024 Cup was clinched in Game 7 on 2024-06-24 (was '2024-06-10').
        {'name': 'Stanley Cup 2024: Florida Panthers Champions', 'date': '2024-06-24'},
        # NOTE(review): the 2024 playoff MVP (Conn Smythe) is believed to have gone to
        # Connor McDavid, not Barkov — verify this entry's label and date.
        {'name': 'Aleksander Barkov MVP of Playoffs', 'date': '2024-06-10'}
    ],
    'CFB': [
        {'name': 'National Championship 2019: Clemson Beats Alabama', 'date': '2019-01-07'},
        # Fixed: LSU's opponent in the January 2020 title game was Clemson, not Alabama.
        {'name': 'National Championship 2020: LSU Beats Clemson with Joe Burrow', 'date': '2020-01-13'},
        {'name': 'National Championship 2021: Alabama Beats Ohio State', 'date': '2021-01-11'},
        {'name': 'National Championship 2022: Georgia Beats Alabama', 'date': '2022-01-10'},
        {'name': 'National Championship 2023: Georgia Beats TCU', 'date': '2023-01-09'},
        {'name': 'National Championship 2024: Michigan Beats Washington', 'date': '2024-01-08'},
        {'name': 'Joe Burrow Wins the Heisman', 'date': '2019-12-14'},
        {'name': 'Nick Saban Retirement', 'date': '2024-01-10'},
        {'name': 'Caleb Williams Transfers to USC', 'date': '2022-01-03'},
        {'name': 'Lincoln Riley Leaves Oklahoma for USC', 'date': '2021-11-28'}
    ],
    'soccer': [
        {'name': 'Messi Leaves Barcelona', 'date': '2021-08-10'},
        {'name': 'Cristiano Ronaldo Returns to Manchester United', 'date': '2021-08-27'},
        {'name': '2022 World Cup: Argentina Champions', 'date': '2022-12-18'},
        {'name': 'Liverpool Wins Champions League 2019', 'date': '2019-06-01'},
        {'name': 'Bayern Wins Champions League 2020', 'date': '2020-08-23'},
        {'name': 'Chelsea Wins Champions League 2021', 'date': '2021-05-29'},
        {'name': 'European Super League Announced and Collapses', 'date': '2021-04-18'}
    ],
    'nfl': [
        {'name': 'Super Bowl 2019: Patriots Beat Rams', 'date': '2019-02-03'},
        {'name': 'Super Bowl 2020: Chiefs Beat 49ers', 'date': '2020-02-02'},
        {'name': 'Super Bowl 2021: Buccaneers Champions with Brady', 'date': '2021-02-07'},
        {'name': 'Super Bowl 2022: Rams Beat Bengals', 'date': '2022-02-13'},
        {'name': 'Super Bowl 2023: Chiefs Beat Eagles', 'date': '2023-02-12'},
        {'name': 'Super Bowl 2024: Chiefs Beat 49ers', 'date': '2024-02-11'},
        {'name': 'Tom Brady Retirement', 'date': '2023-02-01'},
        {'name': 'Joe Burrow Draft', 'date': '2020-04-23'},
        {'name': 'Trevor Lawrence Draft', 'date': '2021-04-29'},
        {'name': 'Aaron Rodgers Traded to Jets', 'date': '2023-04-24'}
    ],
    'sports': [
        {'name': 'Tokyo 2020 Olympic Games', 'date': '2021-07-23'},
        {'name': 'Simone Biles Temporarily Retires in Tokyo', 'date': '2021-07-27'},
        {'name': 'Novak Djokovic Almost Achieves Golden Slam', 'date': '2021-08-01'},
        {'name': 'Usain Bolt Retires', 'date': '2017-08-12'},
        {'name': 'Michael Phelps Retires', 'date': '2016-08-13'}
    ],
    'mma': [
        {'name': 'UFC 229: Khabib vs McGregor', 'date': '2018-10-06'},
        {'name': 'Khabib Retires Undefeated', 'date': '2020-10-24'},
        {'name': 'UFC 257: McGregor vs Poirier 2', 'date': '2021-01-23'},
        {'name': 'UFC 264: McGregor vs Poirier 3', 'date': '2021-07-10'},
        {'name': 'UFC 285: Jon Jones Returns and Wins', 'date': '2023-03-04'},
    ],
    'tennis': [
        {'name': 'Roland Garros Final 2020: Nadal Wins 13th Title', 'date': '2020-10-11'},
        {'name': 'Djokovic Wins Wimbledon 2021', 'date': '2021-07-11'},
        {'name': 'Carlos Alcaraz Wins US Open 2022', 'date': '2022-09-11'},
        {'name': 'Roger Federer Retirement', 'date': '2022-09-15'},
        {'name': 'Serena Williams Retirement', 'date': '2022-09-02'},
        {'name': 'Carlos Alcaraz Wins Wimbledon 2023', 'date': '2023-07-16'},
        {'name': 'Djokovic Wins Grand Slam 24', 'date': '2023-09-10'}
    ],
    'formula1': [
        {'name': 'Abu Dhabi GP 2021: Verstappen Wins Controversial Title', 'date': '2021-12-12'},
        {'name': 'Verstappen Title 2022', 'date': '2022-10-09'},
        {'name': 'Sebastian Vettel Retirement', 'date': '2022-11-20'},
        {'name': 'Lewis Hamilton Signs with Ferrari', 'date': '2024-02-01'},
        {'name': 'Inaugural Las Vegas GP', 'date': '2023-11-18'},
        {'name': 'Grosjean Accident in Bahrain', 'date': '2020-11-29'}
    ]
}


Finally, we initialize and launch the dashboard.

In [61]:
# Initialize app
app = Dash(__name__)


def _build_layout():
    """Assemble the four dashboard sections into the root ``html.Div``.

    Component ids defined here are the ones the callbacks below bind to.
    """
    subreddit_choices = [{"label": sub, "value": sub} for sub in subreddits]

    # Section 1: word cloud for a subreddit within a date range.
    # Its controls and image share one Div so they get a common bottom margin.
    date_filter_panel = html.Div(
        children=[
            html.Label("Select a sport:"),
            dcc.Dropdown(
                id="subreddit-dropdown",
                options=subreddit_choices,
                value=subreddits[0],
            ),
            html.Label("Select a date range:"),
            dcc.DatePickerRange(
                id="date-range",
                start_date=df["date"].min(),
                end_date=df["date"].max(),
                display_format="YYYY-MM-DD",
                style={"marginBottom": "30px"},
            ),
            html.H3("Word Cloud for selected date range:"),
            html.Img(id="wordcloud-filtered", style={"width": "70%"}),
        ],
        style={"marginBottom": "80px"},
    )

    # Section 2: word cloud around a highlighted event.
    # The event dropdown starts empty; a callback fills it from the chosen sport.
    event_section = [
        html.H1("Topics by Highlighted Event"),
        html.Label("Select a sport:"),
        dcc.Dropdown(
            id="sport-dropdown",
            options=[{"label": sport_key, "value": sport_key} for sport_key in important_events],
            value="nba",
        ),
        html.Label("Select an important event:"),
        dcc.Dropdown(id="event-dropdown"),
        html.H3("Word Cloud for selected event (from 1 week before to 3 weeks after):"),
        html.Img(id="wordcloud-event", style={"width": "70%"}),
    ]

    # Section 3: topic importance bar chart, per subreddit or over all of them.
    topic_section = [
        html.H1("Topic Distribution by Sport"),
        html.Label("Select a sport (or 'All') to view topic distribution:"),
        dcc.Dropdown(
            id="subreddit-topics-dropdown",
            options=subreddit_choices + [{"label": "All", "value": "All"}],
            value="All",
        ),
        dcc.Graph(id="topics-chart"),
        html.I("This chart shows which LDA topics appear as dominant in the selected sport's posts."),
        html.Br(),
        html.Br(),
    ]

    # Section 4: monthly mention counts for the selected athletes.
    athlete_section = [
        html.H1("Evolution of Popularity for Famous Athletes"),
        html.Label("Select names to visualize:"),
        dcc.Dropdown(
            id="dropdown-names",
            options=[{"label": athlete.title(), "value": athlete} for athlete in names],
            value=["messi", "lebron"],
            multi=True,
        ),
        dcc.Graph(id="evolution-chart"),
        html.Br(),
        html.Br(),
    ]

    return html.Div(
        children=[
            html.H1("Most Discussed Topics by Sport"),
            date_filter_panel,
            *event_section,
            *topic_section,
            *athlete_section,
        ]
    )


app.layout = _build_layout()

# Function to generate base64 image
def generate_wordcloud(text):
    """Render ``text`` as a word cloud and return it as a PNG data URI.

    Args:
        text: Whitespace-separated words. ``None``, an empty string, or a
            whitespace-only string is replaced by the placeholder "No data".

    Returns:
        A ``data:image/png;base64,...`` string usable as an ``html.Img`` ``src``.
    """
    placeholder = "No data"
    if not text or not text.strip():
        text = placeholder

    cloud = WordCloud(width=800, height=400, background_color="white")
    try:
        cloud.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no word is left to draw (e.g. the text
        # holds only stop words); show the placeholder instead of failing the callback.
        cloud.generate(placeholder)

    buffer = BytesIO()
    cloud.to_image().save(buffer, format="PNG")
    img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

# Callback for word clouds by subreddit and date range
@app.callback(
    Output("wordcloud-filtered", "src"),
    Input("subreddit-dropdown", "value"),
    Input("date-range", "start_date"),
    Input("date-range", "end_date")
)
def update_wordclouds(subreddit, start_date, end_date):
    """Build the word cloud for one subreddit restricted to a date range.

    Args:
        subreddit: Value selected in the "subreddit-dropdown" control.
        start_date: Lower bound (inclusive) from the date picker, or ``None``
            when the field is empty, in which case no lower bound is applied.
        end_date: Upper bound (inclusive) from the date picker, or ``None``
            when the field is empty, in which case no upper bound is applied.

    Returns:
        The PNG data URI produced by ``generate_wordcloud``.
    """
    mask = df["subreddit"] == subreddit
    # An empty date field arrives as None; skip that bound instead of failing
    # while converting it to a date.
    if start_date:
        mask = mask & (df["date"] >= pd.to_datetime(start_date).date())
    if end_date:
        mask = mask & (df["date"] <= pd.to_datetime(end_date).date())

    titles_filtered = df.loc[mask, "clean_title"].dropna()
    text_filtered = " ".join(titles_filtered.tolist())

    return generate_wordcloud(text_filtered)

# Callback to update events based on sport
@app.callback(
    Output("event-dropdown", "options"),
    Output("event-dropdown", "value"),
    Input("sport-dropdown", "value")
)
def update_event_options(sport):
    """Return the event dropdown options for ``sport`` and the default selection.

    Each option shows the event name and carries the event date as its value.
    The default is the first event's date, or ``None`` when the sport has no
    entry in ``important_events``.
    """
    options = []
    for event in important_events.get(sport, []):
        options.append({"label": event["name"], "value": event["date"]})

    if not options:
        return options, None
    return options, options[0]["value"]

# Callback for word cloud of the event
# Window around the event date, matching the dashboard heading
# "from 1 week before to 3 weeks after".
EVENT_WINDOW_BEFORE = datetime.timedelta(weeks=1)
EVENT_WINDOW_AFTER = datetime.timedelta(weeks=3)


@app.callback(
    Output("wordcloud-event", "src"),
    Input("sport-dropdown", "value"),
    Input("event-dropdown", "value")
)
def update_event_wordcloud(sport, event_date):
    """Build the word cloud of titles posted around a highlighted event.

    Args:
        sport: Value of "sport-dropdown"; compared against ``df["subreddit"]``.
        event_date: Event date string from "event-dropdown", or a falsy value
            when no event is selected.

    Returns:
        The PNG data URI produced by ``generate_wordcloud``; the placeholder
        image when no event is selected.
    """
    if not event_date:
        return generate_wordcloud("")

    event_day = pd.to_datetime(event_date).date()
    # Both bounds are inclusive.
    start_date = event_day - EVENT_WINDOW_BEFORE
    end_date = event_day + EVENT_WINDOW_AFTER

    mask = (
        (df["subreddit"] == sport) &
        (df["date"] >= start_date) &
        (df["date"] <= end_date)
    )
    text = " ".join(df.loc[mask, "clean_title"].dropna().tolist())
    return generate_wordcloud(text)

# Histogram of dominant topic distribution
@app.callback(
    Output("topics-chart", "figure"),
    Input("subreddit-topics-dropdown", "value")
)
def update_topic_distribution(selected_subreddit):
    """Return a bar chart of the mean value of every ``topic_*`` column.

    With "All" selected the mean is taken over every row of ``df_topics``;
    otherwise only over the rows whose subreddit equals the selection.
    """
    show_all = selected_subreddit == "All"
    if show_all:
        subset = df_topics.copy()
    else:
        subset = df_topics[df_topics["subreddit"] == selected_subreddit]
    chart_title = (
        "Topic distribution in all sports"
        if show_all
        else f"Topic distribution in r/{selected_subreddit}"
    )

    importance_columns = [column for column in subset.columns if column.startswith("topic_")]

    # One row per topic column: its name and its mean over the selected rows
    averages = subset[importance_columns].mean().reset_index()
    averages.columns = ["Topic", "Importance"]
    # Keep only the topic number as the x-axis label
    averages["Topic"] = averages["Topic"].str.replace("topic_", "")

    return px.bar(
        averages,
        x="Topic",
        y="Importance",
        title=chart_title,
        color="Topic",
        color_discrete_sequence=px.colors.qualitative.Plotly,
    )


@app.callback(
    Output("evolution-chart", "figure"),
    Input("dropdown-names", "value")
)
def update_name_graph(selected_names):
    """Return a line chart of monthly mentions for the selected names.

    When nothing is selected, an empty chart whose title prompts the user to
    pick a name is returned instead.
    """
    if selected_names:
        chosen_rows = evolution[evolution["name"].isin(selected_names)]
        chart = px.line(
            chosen_rows,
            x="month",
            y="mentions",
            color="name",
            title="Monthly Evolution of Mentions by Athlete",
            markers=True,
        )
        chart.update_layout(xaxis_title="Month", yaxis_title="Number of Mentions")
        return chart

    return px.line(title="Select at least one name")



# Execute the app: start the Dash server with debug=True, but only when this
# code runs as the main module rather than being imported.
if __name__ == "__main__":
    app.run(debug=True)


<IPython.core.display.Javascript object>