In [2]:
!pip install nltk
!pip install better_profanity



In [6]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from better_profanity import profanity 
import matplotlib.pyplot as plt

In [4]:
# Download VADER lexicon if not already installed
# (the recorded cell output shows nltk reporting the package as already up-to-date)
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
# `sid` is the notebook-level analyzer instance; get_sentiment() scores text
# by calling sid.polarity_scores on it.
sid = SentimentIntensityAnalyzer()

# Optional: Load custom profanity words (if you want to extend the default list)
# profanity.load_censor_words_from_file('custom_bad_words.txt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/siwalt1/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# Read the posts/comments CSV pair for each of the three topics
posts_df_ai, comments_df_ai = (
    pd.read_csv('data/ai_posts.csv'),
    pd.read_csv('data/ai_comments.csv'),
)
posts_df_trump, comments_df_trump = (
    pd.read_csv('data/trump_posts.csv'),
    pd.read_csv('data/trump_comments.csv'),
)
posts_df_health, comments_df_health = (
    pd.read_csv('data/health_posts.csv'),
    pd.read_csv('data/health_comments.csv'),
)

# Sanity check: preview the first rows of the AI-topic frames
for heading, frame in (
    ("Posts DataFrame:", posts_df_ai),
    ("\nComments DataFrame:", comments_df_ai),
):
    print(heading)
    print(frame.head())

Posts DataFrame:
  platform       id                                              title  \
0   Reddit  1hukxa5  New California law prohibits using AI as basis...   
1   Reddit  1ew0zis  Donald Trump Falsely Claims Taylor Swift Endor...   
2   Reddit  1epqzea  Trump falsely claims Harris rally crowd ‘didn’...   
3   Reddit  1evuk2a  Trump Falls For AI-Generated Taylor Swift Fans...   
4   Reddit  1eb9fyu  AOC’s Deepfake AI Porn Bill Unanimously Passes...   

  content     timestamp  comments_count  upvotes          author topic board  
0     NaN  1.736120e+09             559  28591.0     AskRedditOG   NaN   NaN  
1     NaN  1.724072e+09            1488  22062.0   MiaValeWrites   NaN   NaN  
2     NaN  1.723399e+09            1144   8092.0      majorchamp   NaN   NaN  
3     NaN  1.724047e+09             496   6332.0  TheMoonMonstar   NaN   NaN  
4     NaN  1.721847e+09             400   9090.0    rollingstone   NaN   NaN  

Comments DataFrame:
  comment_id                               

In [10]:
# Function to get sentiment score https://www.nltk.org/api/nltk.sentiment.vader.html#nltk.sentiment.vader.SentimentIntensityAnalyzer.polarity_scores
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str or missing value
        Raw text to score (a post title or a comment body). Missing values
        (``None`` / ``NaN``) and blank strings are treated as neutral.
        Other non-string scalars are converted with ``str`` before scoring.

    Returns
    -------
    float
        The ``'compound'`` entry of ``sid.polarity_scores(text)``, or ``0.0``
        when there is nothing to score.
    """
    if pd.isna(text):  # Handle missing/empty text
        return 0.0
    # Cells that are not strings (e.g. a value parsed as a number) are
    # coerced so the analyzer always receives text.
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():  # blank text carries no sentiment; skip the analyzer
        return 0.0
    # polarity_scores returns a dict shaped like
    # {'neg': 0.0, 'neu': 0.278, 'pos': 0.722, 'compound': 0.3804}
    scores = sid.polarity_scores(text)
    return scores['compound']

In [15]:
# Function to get sentiment score
# NOTE(review): get_sentiment is also defined in the previous cell (In [10]);
# this redefinition shadows it. Keep a single definition when tidying up.
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str or missing value
        Raw text to score (a post title or a comment body). Missing values
        (``None`` / ``NaN``) and blank strings are treated as neutral.
        Other non-string scalars are converted with ``str`` before scoring.

    Returns
    -------
    float
        The ``'compound'`` entry of ``sid.polarity_scores(text)``, or ``0.0``
        when there is nothing to score.
    """
    if pd.isna(text):  # Handle missing/empty text
        return 0.0
    # Cells that are not strings (e.g. a value parsed as a number) are
    # coerced so the analyzer always receives text.
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():  # blank text carries no sentiment; skip the analyzer
        return 0.0
    scores = sid.polarity_scores(text)
    return scores['compound']

# Score every post title, one topic frame at a time
for topic_posts in (posts_df_ai, posts_df_trump, posts_df_health):
    topic_posts['title_sentiment'] = topic_posts['title'].apply(get_sentiment)

# Score every comment body the same way
for topic_comments in (comments_df_ai, comments_df_trump, comments_df_health):
    topic_comments['comment_sentiment'] = topic_comments['comment_text'].apply(get_sentiment)

# Descriptive statistics of the AI-topic scores
for heading, scores in (
    ("\nPost Title Sentiment Summary:", posts_df_ai['title_sentiment']),
    ("\nComment Sentiment Summary:", comments_df_ai['comment_sentiment']),
):
    print(heading)
    print(scores.describe())


Post Title Sentiment Summary:
count    341.000000
mean      -0.065956
std        0.341506
min       -0.897900
25%       -0.296000
50%        0.000000
75%        0.000000
max        0.794400
Name: title_sentiment, dtype: float64

Comment Sentiment Summary:
count    35949.00000
mean        -0.04230
std          0.49339
min         -0.99840
25%         -0.44040
50%          0.00000
75%          0.33820
max          0.99980
Name: comment_sentiment, dtype: float64


In [17]:
# Mean / standard deviation of sentiment per platform and per topic.
# The same computation is needed for three topics and two platforms, so it is
# expressed once in small helpers instead of being copy-pasted.
_PLATFORMS = ('Reddit', '4chan')

# (heading, lookup key) pairs in the order the per-topic report is printed
_REPORT_LAYOUT = (
    ("\nReddit Post Title Sentiment:", ('Reddit', 'posts')),
    ("\n4chan Post Title Sentiment:", ('4chan', 'posts')),
    ("\nReddit Comment Sentiment:", ('Reddit', 'comments')),
    ("\n4chan Comment Sentiment:", ('4chan', 'comments')),
    ("\nReddit Combined Sentiment (Posts + Comments):", ('Reddit', 'combined')),
    ("\n4chan Combined Sentiment (Posts + Comments):", ('4chan', 'combined')),
)


def _platform_scores(frame, platform, column):
    """Return ``frame[column]`` for the rows whose 'platform' equals ``platform``."""
    return frame.loc[frame['platform'] == platform, column]


def _mean_std(scores):
    """Aggregate a Series of sentiment scores into its mean and standard deviation."""
    return scores.agg(['mean', 'std'])


def _topic_summary(posts, comments):
    """Compute per-platform statistics for one topic.

    Parameters
    ----------
    posts : DataFrame with 'platform' and 'title_sentiment' columns.
    comments : DataFrame with 'platform' and 'comment_sentiment' columns.

    Returns
    -------
    dict
        Maps ``(platform, kind)`` to a mean/std Series, where ``kind`` is
        'posts', 'comments' or 'combined' (titles and comments concatenated).
    """
    summary = {}
    for platform in _PLATFORMS:
        titles = _platform_scores(posts, platform, 'title_sentiment')
        replies = _platform_scores(comments, platform, 'comment_sentiment')
        summary[platform, 'posts'] = _mean_std(titles)
        summary[platform, 'comments'] = _mean_std(replies)
        summary[platform, 'combined'] = _mean_std(pd.concat([titles, replies]))
    return summary


def _overall_summary(posts, comments):
    """Mean/std over all titles and comments of one topic, ignoring platform."""
    return _mean_std(pd.concat([posts['title_sentiment'], comments['comment_sentiment']]))


def _print_topic(banner, summary):
    """Print one topic's report: the banner, then each heading with its statistics."""
    print(banner)
    for heading, key in _REPORT_LAYOUT:
        print(heading)
        print(summary[key])


_summary_ai = _topic_summary(posts_df_ai, comments_df_ai)
_summary_trump = _topic_summary(posts_df_trump, comments_df_trump)
_summary_health = _topic_summary(posts_df_health, comments_df_health)

# Individual sentiment calculations, exposed under the flat names this cell
# has always defined (in case other cells reference them).
reddit_posts_sentiment_ai = _summary_ai['Reddit', 'posts']
chan_posts_sentiment_ai = _summary_ai['4chan', 'posts']
reddit_comments_sentiment_ai = _summary_ai['Reddit', 'comments']
chan_comments_sentiment_ai = _summary_ai['4chan', 'comments']

reddit_posts_sentiment_trump = _summary_trump['Reddit', 'posts']
chan_posts_sentiment_trump = _summary_trump['4chan', 'posts']
reddit_comments_sentiment_trump = _summary_trump['Reddit', 'comments']
chan_comments_sentiment_trump = _summary_trump['4chan', 'comments']

reddit_posts_sentiment_health = _summary_health['Reddit', 'posts']
chan_posts_sentiment_health = _summary_health['4chan', 'posts']
reddit_comments_sentiment_health = _summary_health['Reddit', 'comments']
chan_comments_sentiment_health = _summary_health['4chan', 'comments']

# Combined sentiments (posts and comments)
combined_sentiment_ai_reddit = _summary_ai['Reddit', 'combined']
combined_sentiment_ai_4chan = _summary_ai['4chan', 'combined']

combined_sentiment_trump_reddit = _summary_trump['Reddit', 'combined']
combined_sentiment_trump_4chan = _summary_trump['4chan', 'combined']

combined_sentiment_health_reddit = _summary_health['Reddit', 'combined']
combined_sentiment_health_4chan = _summary_health['4chan', 'combined']

# Printing per topic
_print_topic("=== AI Topic ===", _summary_ai)
_print_topic("\n=== Trump Topic ===", _summary_trump)
_print_topic("\n=== Health Topic ===", _summary_health)

# Overall combined sentiment per topic (across platforms)
combined_sentiment_ai_all = _overall_summary(posts_df_ai, comments_df_ai)
combined_sentiment_trump_all = _overall_summary(posts_df_trump, comments_df_trump)
combined_sentiment_health_all = _overall_summary(posts_df_health, comments_df_health)

print("\n=== Overall Combined Sentiment Per Topic (Across Platforms) ===")
for heading, stats in (
    ("\nAI Topic (All Platforms):", combined_sentiment_ai_all),
    ("\nTrump Topic (All Platforms):", combined_sentiment_trump_all),
    ("\nHealth Topic (All Platforms):", combined_sentiment_health_all),
):
    print(heading)
    print(stats)

=== AI Topic ===

Reddit Post Title Sentiment:
mean   -0.106148
std     0.372706
Name: title_sentiment, dtype: float64

4chan Post Title Sentiment:
mean   -0.049279
std     0.327060
Name: title_sentiment, dtype: float64

Reddit Comment Sentiment:
mean   -0.014871
std     0.489355
Name: comment_sentiment, dtype: float64

4chan Comment Sentiment:
mean   -0.082757
std     0.496541
Name: comment_sentiment, dtype: float64

Reddit Combined Sentiment (Posts + Comments):
mean   -0.015295
std     0.488910
dtype: float64

4chan Combined Sentiment (Posts + Comments):
mean   -0.082211
std     0.494253
dtype: float64

=== Trump Topic ===

Reddit Post Title Sentiment:
mean   -0.189919
std     0.366082
Name: title_sentiment, dtype: float64

4chan Post Title Sentiment:
mean   -0.055429
std     0.366848
Name: title_sentiment, dtype: float64

Reddit Comment Sentiment:
mean   -0.064080
std     0.509328
Name: comment_sentiment, dtype: float64

4chan Comment Sentiment:
mean   -0.123118
std     0.534463
Nam