In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt


In [5]:
# Load the scored posts dataset; the path is resolved relative to the notebook's working directory.
data_path = '../Sentiment & Engagement Datasets/ready_data_score.csv'
df = pd.read_csv(data_path)

In [6]:
# Print a structural overview of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7862 entries, 0 to 7861
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      7107 non-null   float64
 1   text                    7862 non-null   object 
 2   title                   7862 non-null   object 
 3   author                  7760 non-null   object 
 4   num_comments            7862 non-null   int64  
 5   post_id                 7862 non-null   object 
 6   upvote_ratio            7862 non-null   float64
 7   score                   7862 non-null   int64  
 8   url                     7862 non-null   object 
 9   subreddit               7862 non-null   object 
 10  link_flair_text         7508 non-null   object 
 11  link_flair_template_id  7508 non-null   object 
 12  created_datetime        7862 non-null   object 
 13  day_of_week             7107 non-null   object 
 14  hour_of_day             7107 non-null   

### Engagement Metrics by Sentiment Score Categories

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sentiment score column.
df['sentiment_score'].describe()

count    7862.000000
mean        0.379324
std         0.614225
min        -0.999300
25%         0.000000
50%         0.624900
75%         0.900100
max         0.999900
Name: sentiment_score, dtype: float64

In [9]:
def categorize_sentiment(score, positive_threshold=0.5, negative_threshold=-0.2):
    """Map a numeric sentiment score to a coarse sentiment label.

    Parameters
    ----------
    score : float
        Sentiment score of a single post.
    positive_threshold : float, default 0.5
        Scores strictly greater than this value are labelled 'Positive'.
    negative_threshold : float, default -0.2
        Scores strictly less than this value are labelled 'Negative'.

    Returns
    -------
    str
        'Positive', 'Negative' or 'Neutral'.

    Notes
    -----
    Both bounds are exclusive, so a score exactly equal to either threshold
    is 'Neutral'. The default cut-offs are asymmetric (0.5 vs. -0.2).
    A NaN score fails both comparisons and is therefore labelled 'Neutral'.
    """
    if score > positive_threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    # Everything in [negative_threshold, positive_threshold], plus NaN, lands here.
    return 'Neutral'

# Label every post by running its sentiment score through categorize_sentiment.
sentiment_labels = df['sentiment_score'].apply(categorize_sentiment)
df['sentiment_category'] = sentiment_labels

# Show how many posts ended up in each sentiment category.
df['sentiment_category'].value_counts()


sentiment_category
Positive    4416
Neutral     1856
Negative    1590
Name: count, dtype: int64

In [10]:

# Mean engagement (comment count, upvote ratio, score) for each sentiment category.
engagement_columns = ['num_comments', 'upvote_ratio', 'score']
posts_by_sentiment = df.groupby('sentiment_category')
engagement_metrics_by_sentiment = posts_by_sentiment[engagement_columns].mean().reset_index()

engagement_metrics_by_sentiment


Unnamed: 0,sentiment_category,num_comments,upvote_ratio,score
0,Negative,109.101258,0.774182,581.381761
1,Neutral,53.862608,0.78673,239.193427
2,Positive,59.257699,0.815025,267.385643


In [11]:

# Mean engagement for every (weekday, sentiment category) pair.
# NOTE(review): groupby drops rows whose day_of_week is missing by default, and the
# df.info() output shows fewer non-null day_of_week values than rows, so those posts
# are excluded from these per-day averages — confirm this is intended.
engagement_metrics_by_day_and_sentiment = df.groupby(['day_of_week', 'sentiment_category']).agg({
    'num_comments': 'mean',
    'upvote_ratio': 'mean',
    'score': 'mean'
}).reset_index()


# Make the weekday an ordered categorical so sorting follows the calendar, not the alphabet.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
engagement_metrics_by_day_and_sentiment['day_of_week'] = pd.Categorical(engagement_metrics_by_day_and_sentiment['day_of_week'], categories=days_order, ordered=True)
# Sort on both keys: sorting on day_of_week alone uses a non-stable sort, which left the
# sentiment rows in arbitrary order within each day.
engagement_metrics_by_day_and_sentiment = engagement_metrics_by_day_and_sentiment.sort_values(
    ['day_of_week', 'sentiment_category']
)


In [12]:
# Display the per-weekday, per-sentiment engagement averages, ordered Monday through Sunday.
engagement_metrics_by_day_and_sentiment

Unnamed: 0,day_of_week,sentiment_category,num_comments,upvote_ratio,score
3,Monday,Negative,36.354331,0.752362,76.377953
4,Monday,Neutral,16.094444,0.791778,28.444444
5,Monday,Positive,13.986072,0.784708,20.788301
17,Tuesday,Positive,14.66453,0.798034,19.598291
16,Tuesday,Neutral,12.964646,0.757727,13.919192
15,Tuesday,Negative,30.110345,0.734483,42.503448
20,Wednesday,Positive,15.384874,0.799933,18.729412
18,Wednesday,Negative,26.542289,0.749801,64.736318
19,Wednesday,Neutral,10.210526,0.780283,20.445344
14,Thursday,Positive,12.814493,0.820116,17.508696


### Top-Performing vs. Low-Performing Posts Analysis

In [14]:
# Distribution of the two raw engagement counts. The 25% and 75% quartiles shown here
# match the cut-offs used by categorize_post (1/1 and 16/8 respectively).
df[['num_comments', 'score']].describe()

Unnamed: 0,num_comments,score
count,7862.0,7862.0
mean,68.06436,324.232384
std,280.069188,1660.464623
min,0.0,0.0
25%,1.0,1.0
50%,4.0,1.0
75%,16.0,8.0
max,8325.0,43206.0


In [15]:
# Categorizing posts based on engagement metrics
def categorize_post(row, top_comments=16, top_score=8, low_comments=1, low_score=1):
    """Classify a post as top-, mid- or low-performing from its engagement counts.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['num_comments']`` and ``row['score']``,
        e.g. a DataFrame row passed by ``df.apply(..., axis=1)``.
    top_comments, top_score : numeric, default 16 and 8
        A post is 'Top-Performing' when it strictly exceeds EITHER value.
        The defaults equal the 75th percentiles reported by ``describe()``.
    low_comments, low_score : numeric, default 1 and 1
        A post is 'Low-Performing' when it is at or below BOTH values.
        The defaults equal the 25th percentiles reported by ``describe()``.

    Returns
    -------
    str
        'Top-Performing', 'Low-Performing' or 'Mid-Range'.

    Notes
    -----
    The two rules are deliberately asymmetric (``or`` for top, ``and`` for low),
    and the top rule is checked first, so it wins if custom thresholds overlap.
    """
    if row['num_comments'] > top_comments or row['score'] > top_score:
        return 'Top-Performing'
    if row['num_comments'] <= low_comments and row['score'] <= low_score:
        return 'Low-Performing'
    # Neither clearly high nor clearly low on the two engagement counts.
    return 'Mid-Range'

# Run categorize_post over every row (axis=1) and store the resulting label as a new column.
performance_labels = df.apply(categorize_post, axis=1)
df['performance_category'] = performance_labels
