In [1]:
import pandas as pd

import matplotlib.pyplot as plt

from jupyter_dash import JupyterDash
from dash import dcc, html, State
from dash.dependencies import Input, Output
import plotly.express as px  

from dash import dash_table

# Dash application instance; the layout and the callback defined later in
# this notebook are attached to this object.
app = JupyterDash(__name__)

In [2]:
# Location of the scored dataset, relative to this notebook.
DATA_PATH = '../Sentiment & Engagement Datasets/ready_data_score.csv'

# Load the posts together with their sentiment_score column.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first two rows to check which columns were loaded.
df.head(n=2)

Unnamed: 0,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,sentiment_score
0,Hi context year old guy Amsterdam currently em...,Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-15 22:07:22,Friday,22,March,2024,0.7579
1,Looking new role havenut much traction Recentl...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21,March,2024,0.6369


In [5]:
# Time-derived columns whose distributions are inspected below.
time_features = ['day_of_week', 'hour_of_day', 'month', 'year']

# Print one frequency table per time column, separated by blank lines.
for feature in time_features:
    header = f"Value counts for {feature}:"
    print(header)
    frequency_table = df[feature].value_counts()
    print(frequency_table)
    print("\n")  # "\n" plus print's own newline leaves a two-line gap

Value counts for day_of_week:
day_of_week
Thursday     1390
Friday       1313
Wednesday    1154
Sunday       1116
Saturday     1100
Tuesday       919
Monday        768
Name: count, dtype: int64


Value counts for hour_of_day:
hour_of_day
21    489
18    484
16    478
19    474
17    471
15    433
20    421
22    399
23    371
1     367
14    365
0     351
2     343
13    306
3     289
4     276
5     231
12    228
11    204
6     187
10    165
8     158
7     141
9     129
Name: count, dtype: int64


Value counts for month:
month
March        6441
February      504
January       206
August         94
June           82
July           81
May            77
December       74
November       54
October        50
April          49
September      48
Name: count, dtype: int64


Value counts for year:
year
2024    7063
2023     367
2022     119
2021      89
2020      71
2019      26
2018      13
2017      10
2016       2
Name: count, dtype: int64




Features included in this component: day_of_week, hour_of_day, month, year, sentiment_score

Question: How do sentiment scores vary by time of day, day of the week, month, or year?

In [4]:
# Calendar orderings so checklist options appear chronologically rather than
# in whatever order the values occur in the data.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

months_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]


def _checklist_section(heading, component_id, options):
    """Build one filter section: an H3 heading above an unchecked checklist.

    heading      -- text shown in the H3 element
    component_id -- Dash id given to the checklist (referenced by the callback)
    options      -- list of {'label': ..., 'value': ...} dicts
    """
    return html.Div([
        html.H3(heading),
        dcc.Checklist(
            id=component_id,
            options=options,
            value=[],
            labelStyle={'display': 'block'}
        ),
    ])


# Checklist options: years in ascending order, only the months that occur in
# the data (in calendar order), and all seven weekdays.
year_options = [{'label': str(year), 'value': year} for year in sorted(df['year'].unique())]
months_present = set(df['month'].unique())
month_options = [{'label': month, 'value': month} for month in months_order if month in months_present]
day_options = [{'label': day, 'value': day} for day in order]

# Left-hand column holding the three filters.
filter_panel = html.Div(
    [
        html.H2("Filter by:"),
        _checklist_section("Year", 'year-checklist', year_options),
        _checklist_section("Month", 'month-checklist', month_options),
        _checklist_section("Day of the Week", 'day-checklist', day_options),
    ],
    style={'width': '20%', 'display': 'inline-block', 'verticalAlign': 'top'},
)

# Right-hand column holding the heading, description, and the graph itself.
plot_panel = html.Div(
    [
        html.H2("Sentiment Score Distribution by Time of Day", style={'textAlign': 'center'}),
        html.P("This plot displays the spread of sentiment scores across different hours of the day. Use the filters to customize the data view by year, month, and day of the week.", style={'textAlign': 'center'}),
        dcc.Graph(id='sentiment-distribution-plot'),
    ],
    style={'width': '80%', 'display': 'inline-block'},
)

# the layout
app.layout = html.Div([
    html.H1("Sentiment Score Distribution Analysis", style={'textAlign': 'center'}),
    filter_panel,
    plot_panel,
])

def _filter_posts(frame, selected_years, selected_months, selected_days):
    """Return the rows of ``frame`` matching the ticked checklist values.

    An empty (or None) selection for a column means "do not filter on that
    column". Every checklist starts unchecked, so without this rule the
    initial view would always be an empty plot.
    """
    selections = (
        ('year', selected_years),
        ('month', selected_months),
        ('day_of_week', selected_days),
    )
    filtered = frame
    for column, chosen in selections:
        if chosen:
            filtered = filtered[filtered[column].isin(chosen)]
    return filtered


# the callback: redraw the box plot whenever any of the three checklists changes
@app.callback(
    Output('sentiment-distribution-plot', 'figure'),
    [Input('year-checklist', 'value'),
     Input('month-checklist', 'value'),
     Input('day-checklist', 'value')]
)
def update_graph(selected_years, selected_months, selected_days):
    """Build the sentiment-by-hour box plot for the current filter selection.

    Parameters
    ----------
    selected_years, selected_months, selected_days : list
        Values currently ticked in the year, month, and day-of-week
        checklists. An empty list leaves that dimension unfiltered.

    Returns
    -------
    The Plotly Express box-plot figure of ``sentiment_score`` grouped by
    ``hour_of_day``, with every individual post drawn as a point.
    """
    filtered_df = _filter_posts(df, selected_years, selected_months, selected_days)

    fig = px.box(
        filtered_df,
        x='hour_of_day',
        y='sentiment_score',
        points="all",
        # Extra columns made available to the hover template below.
        custom_data=['created_datetime', 'day_of_week', 'year', 'month'],
    )

    # Hover text, point/outlier colours, and mean +/- standard deviation marker.
    fig.update_traces(
        hovertemplate="<br>".join([
            "Datetime: %{customdata[0]}",
            "Day: %{customdata[1]}",
            "Year: %{customdata[2]}",
            "Month: %{customdata[3]}",
            "Hour: %{x}",
            "Sentiment: %{y:.2f}"
        ]),
        marker=dict(
            color='LightSkyBlue',
            outliercolor='rgba(219, 64, 82, 0.6)',
            line=dict(outliercolor='rgba(219, 64, 82, 0.6)', outlierwidth=2),
        ),
        boxmean='sd',
    )

    fig.update_layout(
        xaxis=dict(title='Hour of Day', showgrid=True, gridwidth=1, gridcolor='LightPink'),
        yaxis=dict(title='Sentiment Score', showgrid=True, gridwidth=1, gridcolor='LightPink'),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        showlegend=False,
        width=800,
        height=600,
    )

    # Emphasise the neutral-sentiment line at y = 0.
    fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='LightPink')

    return fig

# Start the Dash server and render the app inline in the notebook output.
if __name__ == '__main__':
    server_options = dict(mode='inline', debug=True, port=8051)
    app.run_server(**server_options)
