In [1]:
import pandas as pd
import os
import numpy as np

In [147]:
# Function to count comments in a single cell value
def count_comments(row):
    """Count the comments stored in one cell value.

    Parameters
    ----------
    row : object
        A single cell value. A string is treated as newline-separated
        comments; a list is treated as a collection of comments that may
        contain nested lists (flattened one level only).

    Returns
    -------
    int
        Number of non-blank lines for a string, number of elements after
        one level of flattening for a list, and 0 for every other value
        (None, NaN, pd.NA, numbers, ...).
    """
    # Dispatch purely on type. An equality-based emptiness test such as
    # `row in ['', None, np.nan]` raises for values whose `==` does not yield
    # a plain bool (pd.NA, NumPy arrays) and is not needed: an empty string
    # has no non-blank lines, and any non-str / non-list value falls through
    # to the default of 0.
    if isinstance(row, str):
        # Only '\n' separates comments; blank / whitespace-only lines are ignored.
        return sum(1 for line in row.split('\n') if line.strip())

    if isinstance(row, list):
        # A nested list contributes its own length; any other item counts once.
        return sum(len(item) if isinstance(item, list) else 1 for item in row)

    return 0  # Default case: missing or unsupported value

In [148]:
# Resolve the "output" folder two levels above the current working directory.
base_dir = os.getcwd()  # Current working directory of the kernel (not necessarily this file's location)
output_dir = os.path.join(base_dir, "..", "..", "output")

# Read the auto_labeled_data.xlsx workbook from the output folder into df.
file_path1 = os.path.join(output_dir, "auto_labeled_data.xlsx")
df = pd.read_excel(file_path1)

In [149]:
# Count the number of comments in each message:
# apply count_comments to every value of Total_Comments and store the result
# in a new Comment_Count column (df is modified in place).
df['Comment_Count'] = df['Total_Comments'].apply(count_comments)


In [150]:
# Number of distinct values that occur in Comment_Count
df['Comment_Count'].nunique()

13

In [151]:
# Skewness of the per-message comment count and of the positive / negative
# reaction counts, computed column by column in a fixed order.
skew_comments, skew_pos_reactions, skew_neg_reactions = (
    df[column].skew()
    for column in ("Comment_Count", "Text_Positive_Reactions", "Text_Negative_Reactions")
)

# NOTE: the first label says Total_Comments_Count, but the value comes from
# the Comment_Count column.
print(f"Skewness of Total_Comments_Count: {skew_comments}")
print(f"Skewness of Text_Positive_Reactions: {skew_pos_reactions}")
print(f"Skewness of Text_Negative_Reactions: {skew_neg_reactions}")

Skewness of Total_Comments_Count: 4.150450974300998
Skewness of Text_Positive_Reactions: 6.409441046517418
Skewness of Text_Negative_Reactions: 14.831310991166937


In [152]:
# Percentile (on a 0-100 scale) used as the cut-off for "viral" messages;
# 99 targets the top 1% of each metric. It is divided by 100 before being
# passed to Series.quantile.
percentile_threshold = 99

In [153]:
# Value found at the configured percentile for each engagement metric.
comment_thresh, pos_reaction_thresh, neg_reaction_thresh = (
    df[column].quantile(percentile_threshold / 100)
    for column in ("Comment_Count", "Text_Positive_Reactions", "Text_Negative_Reactions")
)

In [154]:
# Identify viral messages
df_viral_comments = df[
    (df["Comment_Count"] >= comment_thresh)
]

df_viral_pos_reactions = df[
    (df["Text_Positive_Reactions"] >= pos_reaction_thresh)
]

df_viral_neg_reactions = df[
    (df["Text_Negative_Reactions"] >= pos_reaction_thresh)
]

In [155]:
# Sort by total comments and select top 5
df_viral_top_comments = df_viral_comments.sort_values(by="Comment_Count", ascending=False).head(5)
df_viral_top_pos_reactions = df_viral_pos_reactions.sort_values(by="Text_Positive_Reactions", ascending=False).head(5)
df_viral_top_neg_reactions = df_viral_neg_reactions.sort_values(by="Text_Positive_Reactions", ascending=False).head(5)

In [162]:
# Keep only the message text of each top-5 selection.
df_viral_top_comments_text = df_viral_top_comments[['Combined_text']]
df_viral_top_pos_reactions_text = df_viral_top_pos_reactions[['Combined_text']]
df_viral_top_neg_reactions_text = df_viral_top_neg_reactions[['Combined_text']]

# Place the three rankings side by side. pd.concat(axis=1) aligns rows on the
# index, and each selection still carries its original row labels from df, so
# the index is reset first; otherwise the texts would be scattered over
# unrelated rows padded with NaN instead of forming rank-ordered columns.
df_viral_combined_text = pd.concat(
    [
        frame.reset_index(drop=True)
        for frame in (
            df_viral_top_comments_text,
            df_viral_top_pos_reactions_text,
            df_viral_top_neg_reactions_text,
        )
    ],
    axis=1,
)

df_viral_combined_text.columns = ['Comments_Combined_text', 'Pos_Reactions_Combined_text', 'Neg_Reactions_Combined_text']

# Write the combined table to viral_news.csv in the output folder (no index column).
file_path1 = os.path.join(output_dir, "viral_news.csv")
df_viral_combined_text.to_csv(file_path1, index=False)