In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from jupyter_dash import JupyterDash
from dash import dcc, html, State
from dash.dependencies import Input, Output
import plotly.express as px  

from dash import dash_table

# Single Dash application instance for this notebook; the layout and every
# callback defined further down are registered on it.
app = JupyterDash(__name__)

In [2]:
# Load the subreddit/flair dataset. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
df = pd.read_csv('../Sentiment & Engagement Datasets/subreddit_flair_data.csv')

In [3]:
# Preview the first two rows as a quick sanity check of the load.
df.head(2)

Unnamed: 0.1,Unnamed: 0,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,...,category_FinancialCareers,category_ITCareerQuestions,category_LegalAdviceOffTopic,category_careeradvice,category_careerguidance,category_cscareerquestions,category_jobs,category_resumes,category_sales,category_teachers
0,0,Hi context year old guy Amsterdam currently em...,Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,...,,,,General Advice,,,,,,
1,1,Looking new role havenut much traction Recentl...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,...,,,,,,,,Resume Review by Region,,


In [4]:
# List the available columns; the category_* columns are the ones the flair
# lookups further down read from.
df.columns

Index(['Unnamed: 0', 'text', 'title', 'author', 'num_comments', 'post_id',
       'upvote_ratio', 'score', 'url', 'subreddit', 'link_flair_text',
       'link_flair_template_id', 'created_datetime', 'day_of_week',
       'hour_of_day', 'month', 'year', 'sentiment_score', 'log_num_comments',
       'log_score', 'outlier_num_comments', 'outlier_score', 'category_AskHR',
       'category_FinancialCareers', 'category_ITCareerQuestions',
       'category_LegalAdviceOffTopic', 'category_careeradvice',
       'category_careerguidance', 'category_cscareerquestions',
       'category_jobs', 'category_resumes', 'category_sales',
       'category_teachers'],
      dtype='object')

In [5]:
# Distinct subreddit names in order of first appearance; these populate the
# subreddit dropdown in the layout.
unique_subreddits = df['subreddit'].unique()

In [6]:
# Display the subreddit names found in the data.
unique_subreddits

array(['careeradvice', 'resumes', 'ITCareerQuestions', 'FinancialCareers',
       'LegalAdviceOffTopic', 'teachers', 'AskHR', 'sales',
       'careerguidance', 'jobs', 'cscareerquestions'], dtype=object)

In [7]:
# Maps each subreddit name to the DataFrame column that stores its flair
# category. Every column follows the ``category_<subreddit>`` pattern, so the
# mapping is generated from the list of names instead of being spelled out.
subreddit_flair_mapping = {
    name: f'category_{name}'
    for name in (
        'careeradvice',
        'resumes',
        'ITCareerQuestions',
        'FinancialCareers',
        'LegalAdviceOffTopic',
        'teachers',
        'AskHR',
        'sales',
        'careerguidance',
        'jobs',
        'cscareerquestions',
    )
}


In [8]:

# Function to apply consistent styling to all hists
def style_histogram(fig, title, xaxis_title, yaxis_title='Count'):
    """Apply the notebook's shared look to a histogram figure.

    Parameters
    ----------
    fig : figure object exposing ``update_traces`` and ``update_layout``
        The figure to restyle (the callers pass ``px.histogram`` results).
    title : str
        Text used as the figure title.
    xaxis_title : str
        Label for the x axis.
    yaxis_title : str, optional
        Label for the y axis; defaults to ``'Count'``.

    Returns
    -------
    The same ``fig`` object that was passed in, after the updates were applied.
    """

    def _axis(label):
        # Both axes share the same light-pink grid; only the label differs.
        return {
            'title': label,
            'showgrid': True,
            'gridwidth': 1,
            'gridcolor': 'LightPink',
        }

    # First colour of the Pastel1 qualitative palette fills the bars.
    bar_fill = px.colors.qualitative.Pastel1[0]

    fig.update_traces(
        marker_color=bar_fill,
        marker_line_color='blue',
        marker_line_width=1,
    )
    fig.update_layout(
        title_text=title,
        title_font_size=18,
        xaxis=_axis(xaxis_title),
        yaxis=_axis(yaxis_title),
        plot_bgcolor='rgba(0,0,0,0)',
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        hoverlabel={'bgcolor': "white", 'font_size': 12, 'font_family': "Rockwell"},
    )
    return fig


In [9]:
#Layouts
def _field_label(text):
    """Build the caption rendered above a dropdown."""
    return html.Label(text, style={'fontSize': '18px', 'marginTop': '10px'})


# Subreddit selector: one option per subreddit, first one pre-selected.
subreddit_picker = html.Div(
    [
        _field_label("Select a Subreddit:"),
        dcc.Dropdown(
            id='subreddit-dropdown',
            options=[{'label': name, 'value': name} for name in unique_subreddits],
            value=unique_subreddits[0],
            clearable=False,
            searchable=True,
            placeholder="Select a subreddit",
        ),
    ],
    style={'width': '30%', 'display': 'inline-block'},
)

# Chart-type selector: histogram (default) or violin plot.
visual_picker = html.Div(
    [
        _field_label("Choose a Visualization Type:"),
        dcc.Dropdown(
            id='visual-dropdown',
            options=[
                {'label': 'Histogram', 'value': 'hist'},
                {'label': 'Violin Plot', 'value': 'violin'},
            ],
            value='hist',
            clearable=False,
            searchable=False,
            placeholder="Select a visual type",
        ),
    ],
    style={'width': '30%', 'display': 'inline-block', 'marginLeft': '20px'},
)

# Flair selector: starts hidden and without options; the flair-options
# callback fills it and switches the container's style.
flair_picker = html.Div(
    [
        _field_label("Select a Flair:"),
        dcc.Dropdown(
            id='flair-dropdown',
            options=[],
            placeholder="Select a flair",
            searchable=True,
            clearable=True,
        ),
    ],
    style={'width': '30%', 'display': 'none', 'marginLeft': '20px'},
    id='flair-dropdown-container',
)

# Page structure, top to bottom: title, subreddit-level section (selectors and
# chart container), then flair-level section (selector and chart container).
app.layout = html.Div([
    html.H1(
        "Subreddit to Flair - Engagement Distribution Analysis",
        style={'textAlign': 'center', 'marginBottom': '1em'},
    ),
    html.H2("Subreddit-Level Engagement"),
    html.P("Select a subreddit from the dropdown to view the distribution of engagement metrics like the number of comments and score. Logarithmic transformations are applied for a better visual representation of the distribution, and outlier detection highlights posts with unusually high engagement."),
    subreddit_picker,
    visual_picker,
    html.Div(id='subreddit-visualization-container'),
    html.H2("Flair-Level Engagement"),
    html.P("After selecting a subreddit, choose a flair to drill down into the engagement metrics specific to different content categories within the subreddit. This can provide insights into what types of posts generate the most discussion or receive the highest scores."),
    flair_picker,
    html.Div(id='flair-visualization-container'),
])


@app.callback(
    [Output('flair-dropdown', 'options'),
     Output('flair-dropdown-container', 'style'),
     Output('flair-dropdown', 'value')],
    [Input('subreddit-dropdown', 'value')]
)
def update_flair_dropdown_options(subreddit):
    """Refresh the flair dropdown whenever the selected subreddit changes.

    Parameters
    ----------
    subreddit : str or None
        Current value of the subreddit dropdown.

    Returns
    -------
    tuple
        ``(options, container_style, value)`` for the flair dropdown:

        * ``options`` - one ``{'label', 'value'}`` entry per distinct non-null
          flair in the subreddit's flair column, or ``[]`` when no flair
          column can be resolved;
        * ``container_style`` - makes the container visible when options are
          available and hides it otherwise;
        * ``value`` - always ``None``, so a flair picked for the previously
          selected subreddit is not carried over to the new one.
    """
    hidden = ([], {'display': 'none'}, None)

    # One guard covers: nothing selected, a subreddit without a mapping entry,
    # and a mapped column that is absent from the loaded data.
    flair_column = subreddit_flair_mapping.get(subreddit)
    if not flair_column or flair_column not in df.columns:
        return hidden

    # Distinct flairs for this subreddit, ignoring rows without a flair.
    unique_flairs = df[flair_column].dropna().unique()
    flair_options = [{'label': flair, 'value': flair} for flair in unique_flairs]

    visible = {'display': 'block', 'width': '30%', 'marginLeft': '20px'}
    return flair_options, visible, None


    
#subreddit level callback setup    
@app.callback(
    Output('subreddit-visualization-container', 'children'),
    [Input('subreddit-dropdown', 'value'),
     Input('visual-dropdown', 'value')]
)
def update_subreddit_visuals(subreddit, visual_type):
    """Render the four subreddit-level engagement charts.

    Parameters
    ----------
    subreddit : str or None
        Current value of the subreddit dropdown.
    visual_type : str or None
        ``'violin'`` draws violin plots; any other value (including the
        default ``'hist'``) draws styled histograms, so the figures are always
        defined.

    Returns
    -------
    ``[]`` when no subreddit is selected, otherwise an ``html.Div`` holding
    four half-width graphs in this order: log comments, outlier comments,
    log score, outlier score.
    """
    if not subreddit:
        return []

    filtered_df = df[df['subreddit'] == subreddit]

    # The outlier flag columns are used directly as row masks; compute each
    # subset once and reuse it for whichever chart type is requested.
    outliers_comments = filtered_df[filtered_df['outlier_num_comments']]
    outliers_score = filtered_df[filtered_df['outlier_score']]

    # (data, column, title, axis label) listed in display order.
    panels = [
        (filtered_df, 'log_num_comments',
         'Log of Number of Comments', 'Logarithmic Number of Comments'),
        (outliers_comments, 'num_comments',
         'Outliers: Number of Comments', 'Number of Comments'),
        (filtered_df, 'log_score',
         'Log of Score', 'Logarithmic Score'),
        (outliers_score, 'score',
         'Outliers: Score', 'Score'),
    ]

    if visual_type == 'violin':
        figures = [
            px.violin(frame, y=column, box=True, points="all", title=title)
            for frame, column, title, _axis_label in panels
        ]
    else:
        figures = [
            style_histogram(px.histogram(frame, x=column), title, axis_label)
            for frame, column, title, axis_label in panels
        ]

    # Two charts per row: each graph sits in a half-width inline block.
    return html.Div([
        html.Div([dcc.Graph(figure=figure)],
                 style={'display': 'inline-block', 'width': '50%'})
        for figure in figures
    ])



#flair_level callback setup
@app.callback(
    Output('flair-visualization-container', 'children'),
    [Input('subreddit-dropdown', 'value'),
     Input('flair-dropdown', 'value')]
)
def update_flair_visuals(subreddit, flair):
    """Render the four flair-level engagement histograms.

    Parameters
    ----------
    subreddit : str or None
        Current value of the subreddit dropdown.
    flair : str or None
        Current value of the flair dropdown.

    Returns
    -------
    ``[]`` when the selection is incomplete, cannot be resolved to a flair
    column, or matches no rows (for example a flair left over from a
    previously selected subreddit). Otherwise an ``html.Div`` holding four
    half-width histograms in this order: log comments, outlier comments,
    log score, outlier score.
    """
    if not subreddit or not flair:
        return []

    # Use the mapping to get the flair column for the selected subreddit.
    flair_column = subreddit_flair_mapping.get(subreddit)
    if not flair_column or flair_column not in df.columns:
        return []

    # Restrict to the selected subreddit as well as the flair so the data
    # always matches the "... in {subreddit}" chart titles.
    filtered_df = df[(df['subreddit'] == subreddit) & (df[flair_column] == flair)]
    if filtered_df.empty:
        # Nothing to plot; avoid rendering four blank charts.
        return []

    outliers_comments = filtered_df[filtered_df['outlier_num_comments']]
    outliers_score = filtered_df[filtered_df['outlier_score']]

    # (data, column, title, axis label) listed in display order.
    panels = [
        (filtered_df, 'log_num_comments',
         f'Log of Number of Comments for {flair} in {subreddit}',
         'Logarithmic Number of Comments'),
        (outliers_comments, 'num_comments',
         f'Outliers: Number of Comments for {flair} in {subreddit}',
         'Number of Comments'),
        (filtered_df, 'log_score',
         f'Log of Score for {flair} in {subreddit}',
         'Logarithmic Score'),
        (outliers_score, 'score',
         f'Outliers: Score for {flair} in {subreddit}',
         'Score'),
    ]

    # Two charts per row: each graph sits in a half-width inline block.
    return html.Div([
        html.Div(
            [dcc.Graph(figure=style_histogram(px.histogram(frame, x=column), title, axis_label))],
            style={'display': 'inline-block', 'width': '50%'},
        )
        for frame, column, title, axis_label in panels
    ])

# Start the app only when this code runs as the main module (true for a
# notebook kernel). mode='inline' asks JupyterDash to display the app in the
# cell output rather than in a separate browser tab.
if __name__ == '__main__':
    app.run_server(mode='inline')