In [26]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt


In [27]:
df = pd.read_csv('../Sentiment & Engagement Datasets/ready_data_score.csv')

In [21]:
df.head(2)

Unnamed: 0,id,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,sentiment_score
0,41034.0,Hi context year old guy Amsterdam currently em...,Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-15 22:07:22,Friday,22.0,March,2024,0.7579
1,43519.0,Looking new role havenut much traction Recentl...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21.0,March,2024,0.6369


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7760 entries, 0 to 7759
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   text                    7760 non-null   object 
 1   title                   7760 non-null   object 
 2   author                  7760 non-null   object 
 3   num_comments            7760 non-null   int64  
 4   post_id                 7760 non-null   object 
 5   upvote_ratio            7760 non-null   float64
 6   score                   7760 non-null   int64  
 7   url                     7760 non-null   object 
 8   subreddit               7760 non-null   object 
 9   link_flair_text         7760 non-null   object 
 10  link_flair_template_id  7760 non-null   object 
 11  created_datetime        7760 non-null   object 
 12  day_of_week             7760 non-null   object 
 13  hour_of_day             7760 non-null   int64  
 14  month                   7760 non-null   

### Engagement Metrics by Sentiment Score Categories

In [29]:
df['sentiment_score'].describe()

count    7760.000000
mean        0.381291
std         0.612607
min        -0.999300
25%         0.000000
50%         0.624900
75%         0.900100
max         0.999900
Name: sentiment_score, dtype: float64

In [30]:
def categorize_sentiment(score):
    if score > 0.5:
        return 'Positive'
    elif score < -0.2:
        return 'Negative'
    else:
        return 'Neutral'

# apply this function to the df
df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)


df['sentiment_category'].value_counts()


sentiment_category
Positive    4367
Neutral     1835
Negative    1558
Name: count, dtype: int64

In [31]:

engagement_metrics_by_sentiment = df.groupby('sentiment_category').agg({
    'num_comments': 'mean',
    'upvote_ratio': 'mean',
    'score': 'mean'
}).reset_index()

engagement_metrics_by_sentiment


Unnamed: 0,sentiment_category,num_comments,upvote_ratio,score
0,Negative,100.143774,0.771919,540.503209
1,Neutral,53.135695,0.786954,231.724796
2,Positive,55.738264,0.814303,249.140829


In [32]:

engagement_metrics_by_day_and_sentiment = df.groupby(['day_of_week', 'sentiment_category']).agg({
    'num_comments': 'mean',
    'upvote_ratio': 'mean',
    'score': 'mean'
}).reset_index()


days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
engagement_metrics_by_day_and_sentiment['day_of_week'] = pd.Categorical(engagement_metrics_by_day_and_sentiment['day_of_week'], categories=days_order, ordered=True)
engagement_metrics_by_day_and_sentiment = engagement_metrics_by_day_and_sentiment.sort_values('day_of_week')


In [33]:
engagement_metrics_by_day_and_sentiment

Unnamed: 0,day_of_week,sentiment_category,num_comments,upvote_ratio,score
3,Monday,Negative,149.673203,0.785752,758.542484
4,Monday,Neutral,47.690355,0.809492,168.360406
5,Monday,Positive,71.430622,0.808612,281.339713
17,Tuesday,Positive,73.81203,0.816767,382.06391
16,Tuesday,Neutral,48.688679,0.773915,182.882075
15,Tuesday,Negative,119.554286,0.772057,587.702857
20,Wednesday,Positive,61.698485,0.815152,215.156061
18,Wednesday,Negative,91.793103,0.777026,572.637931
19,Wednesday,Neutral,67.984733,0.791336,286.923664
14,Thursday,Positive,59.70972,0.832224,212.121172


### Top-Performing vs. Low-Performing Posts Analysis

In [34]:
df[['num_comments', 'score']].describe()

Unnamed: 0,num_comments,score
count,7760.0,7760.0
mean,64.038273,303.520232
std,264.504485,1594.499721
min,0.0,0.0
25%,1.0,1.0
50%,4.0,1.0
75%,15.0,7.0
max,8325.0,43206.0


In [35]:
# Categorizing posts based on engagement metrics
def categorize_post(row):
    if row['num_comments'] > 16 or row['score'] > 8:
        return 'Top-Performing'
    elif row['num_comments'] <= 1 and row['score'] <= 1:
        return 'Low-Performing'
    else:
        return 'Mid-Range'

# Applying the categorization function to each row in the DataFrame
df['performance_category'] = df.apply(categorize_post, axis=1)


In [36]:
df['performance_category'].value_counts()

performance_category
Mid-Range         3758
Top-Performing    2149
Low-Performing    1853
Name: count, dtype: int64

In [37]:
# Sample review for anomalies in key columns
df[['num_comments', 'score', 'sentiment_score', 'day_of_week', 'hour_of_day', 'performance_category']].sample(10)


Unnamed: 0,num_comments,score,sentiment_score,day_of_week,hour_of_day,performance_category
5053,5,1,-0.9022,Monday,17,Mid-Range
5824,9,0,0.875,Saturday,11,Mid-Range
2028,2,0,-0.7872,Friday,13,Mid-Range
2526,9,0,0.9732,Thursday,15,Mid-Range
1082,2,1,0.8225,Wednesday,3,Mid-Range
7669,102,504,0.128,Saturday,15,Top-Performing
179,5,2,0.9531,Friday,13,Mid-Range
5151,1,5,-0.3947,Tuesday,1,Mid-Range
6053,2,2,0.8074,Thursday,4,Mid-Range
6406,16,1,0.7334,Sunday,22,Mid-Range


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7760 entries, 0 to 7759
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   text                    7760 non-null   object 
 1   title                   7760 non-null   object 
 2   author                  7760 non-null   object 
 3   num_comments            7760 non-null   int64  
 4   post_id                 7760 non-null   object 
 5   upvote_ratio            7760 non-null   float64
 6   score                   7760 non-null   int64  
 7   url                     7760 non-null   object 
 8   subreddit               7760 non-null   object 
 9   link_flair_text         7760 non-null   object 
 10  link_flair_template_id  7760 non-null   object 
 11  created_datetime        7760 non-null   object 
 12  day_of_week             7760 non-null   object 
 13  hour_of_day             7760 non-null   int64  
 14  month                   7760 non-null   