In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../Sentiment & Engagement Datasets/ready_data_score.csv')

In [3]:
df.head(2)

Unnamed: 0,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,sentiment_score
0,Hi context year old guy Amsterdam currently em...,Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-15 22:07:22,Friday,22,March,2024,0.7579
1,Looking new role havenut much traction Recentl...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21,March,2024,0.6369


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7760 entries, 0 to 7759
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   text                    7760 non-null   object 
 1   title                   7760 non-null   object 
 2   author                  7760 non-null   object 
 3   num_comments            7760 non-null   int64  
 4   post_id                 7760 non-null   object 
 5   upvote_ratio            7760 non-null   float64
 6   score                   7760 non-null   int64  
 7   url                     7760 non-null   object 
 8   subreddit               7760 non-null   object 
 9   link_flair_text         7760 non-null   object 
 10  link_flair_template_id  7760 non-null   object 
 11  created_datetime        7760 non-null   object 
 12  day_of_week             7760 non-null   object 
 13  hour_of_day             7760 non-null   int64  
 14  month                   7760 non-null   

### Engagement Metrics by Sentiment Score Categories

In [5]:
df['sentiment_score'].describe()

count    7760.000000
mean        0.381291
std         0.612607
min        -0.999300
25%         0.000000
50%         0.624900
75%         0.900100
max         0.999900
Name: sentiment_score, dtype: float64

In [6]:
def categorize_sentiment(score):
    if score > 0.5:
        return 'Positive'
    elif score < -0.2:
        return 'Negative'
    else:
        return 'Neutral'

# apply this function to the df
df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)


df['sentiment_category'].value_counts()


sentiment_category
Positive    4367
Neutral     1835
Negative    1558
Name: count, dtype: int64

In [7]:

engagement_metrics_by_sentiment = df.groupby('sentiment_category').agg({
    'num_comments': 'mean',
    'upvote_ratio': 'mean',
    'score': 'mean'
}).reset_index()

engagement_metrics_by_sentiment


Unnamed: 0,sentiment_category,num_comments,upvote_ratio,score
0,Negative,100.143774,0.771919,540.503209
1,Neutral,53.135695,0.786954,231.724796
2,Positive,55.738264,0.814303,249.140829


This aggregation provides a clear basis for comparing engagement across sentiment categories, revealing interesting patterns about how the perceived sentiment of a post correlates with its engagement metrics.

In [8]:

engagement_metrics_by_day_and_sentiment = df.groupby(['day_of_week', 'sentiment_category']).agg({
    'num_comments': 'mean',
    'upvote_ratio': 'mean',
    'score': 'mean'
}).reset_index()


days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
engagement_metrics_by_day_and_sentiment['day_of_week'] = pd.Categorical(engagement_metrics_by_day_and_sentiment['day_of_week'], categories=days_order, ordered=True)
engagement_metrics_by_day_and_sentiment = engagement_metrics_by_day_and_sentiment.sort_values('day_of_week')


In [9]:
engagement_metrics_by_day_and_sentiment

Unnamed: 0,day_of_week,sentiment_category,num_comments,upvote_ratio,score
3,Monday,Negative,149.673203,0.785752,758.542484
4,Monday,Neutral,47.690355,0.809492,168.360406
5,Monday,Positive,71.430622,0.808612,281.339713
17,Tuesday,Positive,73.81203,0.816767,382.06391
16,Tuesday,Neutral,48.688679,0.773915,182.882075
15,Tuesday,Negative,119.554286,0.772057,587.702857
20,Wednesday,Positive,61.698485,0.815152,215.156061
18,Wednesday,Negative,91.793103,0.777026,572.637931
19,Wednesday,Neutral,67.984733,0.791336,286.923664
14,Thursday,Positive,59.70972,0.832224,212.121172


Expanding the analysis to consider the day of the week introduces an additional layer of depth, allowing the users to explore how engagement metrics fluctuate across different days for each sentiment category. 

### Top-Performing vs. Low-Performing Posts Analysis

In [10]:
df[['num_comments', 'score']].describe()

Unnamed: 0,num_comments,score
count,7760.0,7760.0
mean,64.038273,303.520232
std,264.504485,1594.499721
min,0.0,0.0
25%,1.0,1.0
50%,4.0,1.0
75%,15.0,7.0
max,8325.0,43206.0


In [11]:
# Categorizing posts based on engagement metrics
def categorize_post(row):
    if row['num_comments'] > 16 or row['score'] > 8:
        return 'Top-Performing'
    elif row['num_comments'] <= 1 and row['score'] <= 1:
        return 'Low-Performing'
    else:
        return 'Mid-Range'

# Applying the categorization function to each row in the DataFrame
df['performance_category'] = df.apply(categorize_post, axis=1)


In [12]:
df['performance_category'].value_counts()

performance_category
Mid-Range         3758
Top-Performing    2149
Low-Performing    1853
Name: count, dtype: int64

In [13]:
# Sample review for anomalies in key columns
df[['num_comments', 'score', 'sentiment_score', 'day_of_week', 'hour_of_day', 'performance_category']].sample(10)


Unnamed: 0,num_comments,score,sentiment_score,day_of_week,hour_of_day,performance_category
251,0,1,0.9001,Thursday,16,Low-Performing
3981,0,1,0.3182,Tuesday,10,Low-Performing
7249,2447,1905,0.6389,Tuesday,15,Top-Performing
1496,1,2,0.2895,Wednesday,17,Mid-Range
5086,6,3,-0.34,Thursday,3,Mid-Range
6693,0,1,0.5878,Saturday,14,Low-Performing
4680,13,0,0.9188,Sunday,19,Mid-Range
6460,7,4,0.988,Sunday,0,Mid-Range
2835,2,0,0.0,Wednesday,23,Mid-Range
6743,1,3,0.9966,Friday,23,Mid-Range


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7760 entries, 0 to 7759
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   text                    7760 non-null   object 
 1   title                   7760 non-null   object 
 2   author                  7760 non-null   object 
 3   num_comments            7760 non-null   int64  
 4   post_id                 7760 non-null   object 
 5   upvote_ratio            7760 non-null   float64
 6   score                   7760 non-null   int64  
 7   url                     7760 non-null   object 
 8   subreddit               7760 non-null   object 
 9   link_flair_text         7760 non-null   object 
 10  link_flair_template_id  7760 non-null   object 
 11  created_datetime        7760 non-null   object 
 12  day_of_week             7760 non-null   object 
 13  hour_of_day             7760 non-null   int64  
 14  month                   7760 non-null   

In [16]:
df = df[df['subreddit'] != 'EngineeringCareers']

In [21]:
## Aggregate Sentiment Scores by Performance Category within Each Subreddit

avg_sentiment_scores = df[df['performance_category'].isin(['Top-Performing', 'Low-Performing'])].groupby(['subreddit', 'performance_category'])['sentiment_score'].mean().unstack()



In [22]:
avg_sentiment_scores

performance_category,Low-Performing,Top-Performing
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1
AskHR,0.179278,0.111936
FinancialCareers,0.559093,0.5033
ITCareerQuestions,0.558012,0.524407
LegalAdviceOffTopic,-0.048032,-0.094733
careeradvice,0.621885,0.291863
careerguidance,0.567514,0.454349
cscareerquestions,0.656972,0.355896
jobs,0.452718,0.18611
resumes,0.516891,0.402797
sales,0.462591,0.43607


The above df  presents the average sentiment scores for posts categorized as "Low-Performing" and "Top-Performing" within various subreddits.

In [23]:

top_performing_posts = df[df['performance_category'] == 'Top-Performing']
low_performing_posts = df[df['performance_category'] == 'Low-Performing']


Explore the distribution of sentiment scores within top-performing and low-performing posts in each subreddit. we create two these seperate dfs.

In [24]:
top_performing_posts.head()

Unnamed: 0,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,sentiment_score,sentiment_category,performance_category
24,Hi Ium year old worked restaurant server barte...,30 yo trying to get out of the restaurant indu...,fuckdansnydeer,39,1b4987w,0.81,12,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-01 23:33:38,Friday,23,March,2024,0.8655,Positive,Top-Performing
32,Good evening member CareerAdvicennThis subredd...,State of the Subreddit - Announcement,michaelrulaz,0,14erg6q,0.9,42,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2023-06-20 23:50:29,Tuesday,23,June,2023,0.9789,Positive,Top-Performing
33,I w debilitating fear job interview This got w...,My massive fear of job interviews is ruining m...,Ok-Sherbert5713,41,1bh5yuk,0.97,21,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-17 19:19:45,Sunday,19,March,2024,0.9725,Positive,Top-Performing
34,Im currently exploring different career path I...,What career fields are less likely to be autom...,Cultural-Policy487,58,1bgz6ae,0.82,22,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-17 14:42:53,Sunday,14,March,2024,0.6808,Positive,Top-Performing
36,Background I person job work year Hired someon...,"How can the agency hire someone $20,000 more t...",Ok_Garage3035,18,1bgz1bn,0.8,9,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-17 14:36:57,Sunday,14,March,2024,0.4588,Neutral,Top-Performing


In [26]:
engagement_metrics_by_sentiment.to_csv('../Sentiment & Engagement Datasets/engagement_metrics_by_sentiment.csv', index=False)
engagement_metrics_by_day_and_sentiment.to_csv('../Sentiment & Engagement Datasets/engagement_metrics_by_day_and_sentiment.csv', index=False)
top_performing_posts.to_csv('../Sentiment & Engagement Datasets/top_performing_posts.csv', index=False)
low_performing_posts.to_csv('../Sentiment & Engagement Datasets/low_performing_posts.csv', index=False)
avg_sentiment_scores.to_csv('../Sentiment & Engagement Datasets/avg_sentiment_scores.csv', index=True) 
