**1.0 Data Preparation (non-rerunnable, privacy-restricted)**

The comments were extracted externally and uploaded as a dataset that is accessible via Google Drive. This part reviews the data and prepares it by removing any deleted comments, adding a stable index and anonymising usernames for privacy reasons. The stable index ensures consistency during the thematic coding, because reloading the dataset can cause the default indexes to change.

For privacy reasons, this part of the data analysis cannot be rerun from GitHub, as the file contains the original author names. To rerun the code, please start from step **1.1 GitHub-Ready Code**.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the raw comment
# export referenced by the following cells can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Path of the raw comment export on the mounted Drive.
file_path = "/content/drive/MyDrive/Dataprojects/Methodology/videoinfo_SdSSPF1S-Uc&t=24s_2025_10_16-23_09_59_comments.csv"
df_new = pd.read_csv(file_path)

print(f"‚úÖ Loaded {len(df_new):,} rows from {file_path}")
# Lift the column-width limit so pandas shows each comment in full.
pd.set_option('display.max_colwidth', None)
# Last expression of the cell: the notebook renders the first ten 'text' values.
df_new['text'].head(10)


In [None]:
import pandas as pd

# Data preparation: load the raw export, drop blank comments, add a stable
# index, replace author names with pseudonyms and save the cleaned CSV.
file_path = "/content/drive/MyDrive/Dataprojects/Methodology/videoinfo_SdSSPF1S-Uc&t=24s_2025_10_16-23_09_59_comments.csv"
df_raw = pd.read_csv(file_path)

print(f"üì• Loaded {len(df_raw):,} rows")

# First candidate name that exists in the export wins.
text_col = next(
    (c for c in ["text", "comment", "body", "content"] if c in df_raw.columns),
    None,
)
if text_col is None:
    raise ValueError("‚ùå No comment text column found")

print(f"üìù Using text column: {text_col}")

df = df_raw.copy()

# A comment counts as blank when it is missing or contains only whitespace.
mask_blank = (
    df[text_col].isna() |
    df[text_col].astype(str).str.strip().eq("")
)

num_blank = mask_blank.sum()
num_total = len(df)
# Guard the percentage: with an empty export the division would yield "nan%".
share_removed = (num_blank / num_total * 100) if num_total else 0.0

print(f"\nüóëÔ∏è Blank / unavailable comments detected: {num_blank:,}")
print(f"üìä Share of dataset removed: {share_removed:.2f}%")

# Renumber the surviving rows from 0 and persist that number as a column, so
# it is written to the CSV and survives later reloads.
df_clean = df[~mask_blank].reset_index(drop=True)
df_clean["stable_comment_index"] = df_clean.index

print(f"‚ú® Cleaned dataset size: {len(df_clean):,}")

USERNAME_COL = "authorName"

if USERNAME_COL not in df_clean.columns:
    raise ValueError(f"‚ùå Username column '{USERNAME_COL}' not found")

# Pseudonyms are numbered in order of first appearance; rows without an
# author name receive the placeholder "user_deleted".
unique_authors = df_clean[USERNAME_COL].dropna().unique()
author_to_anon = {
    author: f"User_{number}"
    for number, author in enumerate(unique_authors, start=1)
}

df_clean["user_anonymised"] = (
    df_clean[USERNAME_COL].map(author_to_anon).fillna("user_deleted")
)

print(f"üë§ Created 'user_anonymised' for {len(unique_authors):,} users.")

df_clean = df_clean.drop(columns=[USERNAME_COL])
print(f"üîí Dropped identifying column: '{USERNAME_COL}'")

# NOTE(review): only USERNAME_COL is removed. The export may carry further
# author-related fields (e.g. channel id/URL, reply-to name) — confirm against
# the actual schema. They are reported here, not dropped, so the output
# columns stay unchanged.
leftover_identifiers = [
    c for c in df_clean.columns
    if c != "user_anonymised"
    and ("author" in c.lower() or c.lower().endswith("toname"))
]
if leftover_identifiers:
    print(
        "WARNING: columns that may still identify users remain in the output: "
        f"{leftover_identifiers}"
    )

final_output_path = "/content/drive/MyDrive/Dataprojects/Methodology/youtube_final_dataset.csv"
df_clean.to_csv(final_output_path, index=False)

print(f"\nüíæ Final dataset saved to:")
print(final_output_path)
print(f"üìå Total final comments: {len(df_clean):,}")


**1.1 GitHub-Ready Code**


In [None]:
from pathlib import Path
import pandas as pd


# The notebook may be started from the repository root or from a folder one
# level below it, so both relative locations are probed.
CANDIDATE_PATHS = [
    Path(root) / "data" / "raw" / "youtube_final_dataset.csv"
    for root in ("", "..")
]

# First existing candidate, or None when the file is in neither place.
CLEAN_YOUTUBE_DATA = next(filter(Path.exists, CANDIDATE_PATHS), None)
if CLEAN_YOUTUBE_DATA is None:
    raise FileNotFoundError(
        "Could not find youtube_final_dataset.csv. Expected it in data/raw/.\n"
        "Make sure you've cloned the repo and the file exists at data/raw/youtube_final_dataset.csv"
    )

df_youtube = pd.read_csv(CLEAN_YOUTUBE_DATA)
print(f"Loaded {len(df_youtube):,} YouTube rows from: {CLEAN_YOUTUBE_DATA}")


**1.2 Reviewing Duplicate Comments**

This part includes reviewing the metrics of identical comments, to determine whether they are true or accidental duplications.

In [None]:
import warnings
# NOTE(review): this installs a process-wide "ignore" filter, so every warning
# is hidden for the rest of the session, not only in this cell — confirm that
# this is intended.
warnings.filterwarnings("ignore")

import pandas as pd
from IPython.display import display


# Read the cleaned dataset from the path stored in CLEAN_YOUTUBE_DATA.
df_youtube = pd.read_csv(CLEAN_YOUTUBE_DATA)
print(f"Loaded {len(df_youtube):,} YouTube rows")

text_col = "text"

# Values that cannot be parsed as timestamps become NaT instead of raising.
df_youtube["publishedAt"] = pd.to_datetime(df_youtube["publishedAt"], errors="coerce")

# Occurrences per distinct comment text; texts seen more than once are duplicates.
duplicate_counts = df_youtube[text_col].value_counts()
duplicate_texts = duplicate_counts.loc[duplicate_counts.gt(1)].index.tolist()

print(f"üîç Found {len(duplicate_texts)} duplicated unique YouTube comments.")

# Every row whose text belongs to the duplicated set, as an independent copy.
duplicates_full = df_youtube.loc[df_youtube[text_col].isin(duplicate_texts)].copy()

def summarize_group(g):
    """Condense all occurrences of one duplicated comment into a single record.

    Args:
        g: DataFrame holding the rows that share one comment text. It must
            contain ``publishedAt``; ``user_anonymised`` and ``likeCount``
            are read when present.

    Returns:
        A Series with ``frequency`` (number of rows) and ``instances`` (one
        line per occurrence, ordered by ``publishedAt`` ascending).
    """
    chronological = g.sort_values("publishedAt")

    # One line per occurrence; a column that is absent yields an empty value.
    lines = [
        f"user={occurrence.get('user_anonymised', '')}, "
        f"likes={occurrence.get('likeCount', '')}, "
        f"publishedAt={occurrence.get('publishedAt', '')}"
        for _, occurrence in chronological.iterrows()
    ]

    return pd.Series({
        "frequency": len(chronological),
        "instances": "\n".join(lines),
    })

# Collapse each duplicated text into one summary row. Warnings raised inside
# this call are suppressed for the duration of the `with` block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    per_text = duplicates_full.groupby(text_col)
    summary_youtube = per_text.apply(summarize_group).reset_index()

# Most frequently repeated comments first, numbered from 1 for display.
summary_youtube = summary_youtube.sort_values("frequency", ascending=False).reset_index(drop=True)
summary_youtube.index += 1

# Display table
pd.set_option("display.max_colwidth", None)

text_cell_css = {
    "white-space": "normal",
    "max-width": "300px",
}
# pre-wrap keeps the line breaks that separate the individual occurrences.
instances_cell_css = {
    "white-space": "pre-wrap",
    "max-width": "1000px",
    "font-family": "monospace"
}

styled_summary = summary_youtube.style.set_properties(subset=[text_col], **text_cell_css)
styled_summary = styled_summary.set_properties(subset=["instances"], **instances_cell_css)
display(styled_summary)


**2.0 Opinion Leaders**

The opinion leaders are identified using a benchmark of 75% of the total like count (upvotes) of the overall dataset. Once the number of comments in this subset is known, they are reviewed with their ranking, stable index, like count and text.

In [None]:
import pandas as pd

# How many of the most-liked comments are needed to cover 75% of all likes?
df = pd.read_csv(CLEAN_YOUTUBE_DATA)

LIKES_COL = "likeCount"

if LIKES_COL not in df.columns:
    raise ValueError(f"Column '{LIKES_COL}' not found. Available columns: {list(df.columns)}")

# Keep only rows with a like count that is present and not negative.
df = df.loc[df[LIKES_COL].notna() & df[LIKES_COL].ge(0)]

total_comments = len(df)
total_likes = df[LIKES_COL].sum()

if total_comments != 0 and total_likes != 0:
    # 75% benchmark of the total likes
    target_likes_75 = 0.75 * total_likes

    # Rank by likes (highest first) and accumulate a running total.
    df_sorted = df.sort_values(LIKES_COL, ascending=False).reset_index(drop=True)
    df_sorted["cum_likes"] = df_sorted[LIKES_COL].cumsum()

    # Position of the first row whose running total reaches the benchmark;
    # positions start at 0, hence the +1 for a count.
    first_row_reaching_75 = (df_sorted["cum_likes"] >= target_likes_75).idxmax()
    comments_needed = int(first_row_reaching_75) + 1

    percent_of_dataset = (comments_needed / total_comments) * 100

    print(f"Total comments: {total_comments:,}")
    print(f"Total likes: {total_likes:,}")
    print(f"75% benchmark of total likes: {target_likes_75:,.0f}")
    print(f"Comments needed to reach 75% of likes: {comments_needed:,}")
    print(f"These comments represent {percent_of_dataset:.2f}% of the dataset.")
else:
    print("No comments or no likes in the dataset.")


In [None]:
import pandas as pd

# Table of the 35 most-liked comments with rank, dataset row index and text.
df = pd.read_csv(CLEAN_YOUTUBE_DATA)

LIKES_COL = "likeCount"
TEXT_COL = "text"

for required_col in (LIKES_COL, TEXT_COL):
    if required_col not in df.columns:
        raise ValueError(f"Column '{required_col}' not found. Available columns: {list(df.columns)}")

most_liked_first = df.sort_values(LIKES_COL, ascending=False)
opinionleaders_youtube = most_liked_first.head(35).copy()

# Keep each comment's row label from the loaded dataset as a regular column
# before the index is replaced by the rank.
opinionleaders_youtube["index"] = opinionleaders_youtube.index

# Rank 1..N as the displayed index.
opinionleaders_youtube.index = pd.RangeIndex(1, len(opinionleaders_youtube) + 1)
opinionleaders_youtube.index.name = None

display_columns = [LIKES_COL, "index", TEXT_COL]
opinionleaders_youtube = opinionleaders_youtube[display_columns]

# Let long comments wrap inside a fixed-width cell.
text_cell_css = {
    "white-space": "normal",
    "word-wrap": "break-word",
    "width": "550px"
}
styled_table_youtube = opinionleaders_youtube.style.set_properties(
    subset=[TEXT_COL], **text_cell_css
)

styled_table_youtube


**2.1 Thematic Analysis**

This part categorizes each comment within the subset of opinion leaders. The following themes have been identified:

-  **Nostalgia & Comeback Angels**: Appreciation for the original models and nostalgic elements of the traditional VS shows
- **VS Show Production**: Regarding the overall experience of watching the show & highlighting certain segments or specific elements
-   **Inclusivity**: Positive feedback on a variety of cultures, sizes, races, etc.
-   **Music Artists**: Regarding the performances and artists
-   **Sisterhood**: Highlighting discourse surrounding women's empowerment and the collective identity of girls
-   **Year on Year**: Highlighting this year's improvement & comparisons to last year
-   **Specific Model References**: Specifically about particular models and their stage performance
-   **Timestamps**: Listing times of performances, etc.

Below follows the thematic distribution of comments and like counts, visualised in a pie chart. Furthermore, all comments and their related themes are provided as well.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Load the cleaned dataset from the path stored in CLEAN_YOUTUBE_DATA.
df_clean = pd.read_csv(CLEAN_YOUTUBE_DATA)
print(f"Loaded YouTube dataset with {len(df_clean):,} rows.")

themes_by_index = {
    "Nostalgia & Comeback Angels": [9411, 9490, 10402, 12158, 15288, 6338, 19051, 12483, 17573],
    "Inclusivity": [13333, 7152, 8481],
    "VS ShowProduction": [19552, 14457, 13333, 15288, 2817, 15498, 5575],
    "Music Artists": [10447, 7549, 12158, 18322, 19023, 3051, 16332, 15498, 11539, 4070, 17573],
    "Timestamps": [16694, 3538],
    "Sisterhood": [17673, 19051, 17901, 7152],
    "Year on Year": [13333, 19318, 16141, 4070, 14037],
    "Specific Model References": [18671, 14457, 8125, 19278],
}

# Pick the text and like columns (case-insensitive), normalise them to
# "comment" / "likeCount" and rank the 35 most-liked comments.
_text_names = {"text", "content", "body", "comment"}
_like_names = {"likecount", "score"}

candidate_cols = [
    c for c in df_clean.columns if c.lower() in (_text_names | _like_names)
]

if not candidate_cols:
    raise ValueError(
        "No usable columns found. Expected one of: text/content/body/comment and likeCount/score."
    )

subset = df_clean[candidate_cols].copy()

# First match in dataset column order wins.
text_col = next((c for c in candidate_cols if c.lower() in _text_names), None)
if text_col is None:
    raise ValueError("No text column found (expected 'text', 'content', 'body', or 'comment').")

likes_col = next((c for c in candidate_cols if c.lower() in _like_names), None)
if likes_col is None:
    raise ValueError("No likes column found (expected 'likeCount' or 'score').")

subset = subset.rename(columns={text_col: "comment", likes_col: "likeCount"})

subset["likeCount"] = subset["likeCount"].fillna(0).astype(int)
# Fill gaps BEFORE casting: astype(str) turns a missing value into the literal
# string "nan", which a later fillna("") can no longer catch.
subset["comment"] = subset["comment"].fillna("").astype(str).str.strip()

top_comments_df = (
    subset.sort_values("likeCount", ascending=False)
          .head(35)[["comment", "likeCount"]]
          .copy()
)

# The row index of the loaded dataset becomes the `orig_index` column that the
# theme coding is matched against.
ranked = top_comments_df.reset_index().rename(columns={"index": "orig_index"})


# Invert theme -> [indices] into index -> [themes]; a comment may carry
# several themes.
index_to_themes = {}
for theme, indices in themes_by_index.items():
    for idx in indices:
        if idx not in index_to_themes:
            index_to_themes[idx] = []
        index_to_themes[idx].append(theme)


def join_labels(idx):
    """Return the themes coded for row *idx* as one ", "-separated string ("" if none)."""
    return ", ".join(index_to_themes.get(idx, []))


def _split_labels(joined):
    """Turn a ", "-separated label string back into a list ([] when empty or not a string)."""
    if isinstance(joined, str) and joined:
        return joined.split(", ")
    return []


ranked["themes"] = ranked["orig_index"].map(join_labels)

# Report top-35 comments that have not been coded with any theme.
missing = ranked.loc[ranked["themes"] == "", "orig_index"].tolist()
if not missing:
    print("‚úÖ All top-35 comments have at least one theme label.")
else:
    print(f"‚ö†Ô∏è Dataset indices in top-35 with no theme label: {missing}")

ranked["theme_list"] = ranked["themes"].map(_split_labels)

# Long format: one row per (comment, theme) pair.
long = ranked.explode("theme_list").rename(columns={"theme_list": "theme"})
long = long.loc[long["theme"].ne("")]

by_theme = long.groupby("theme")
counts = by_theme["orig_index"].nunique().sort_values(ascending=False)
likes = by_theme["likeCount"].sum().sort_values(ascending=False)

# Sum over themes: a comment coded with several themes contributes its likes
# to each of them.
total_likes = likes.sum()

summary_columns = {
    "Comments": counts,
    "Likes": likes,
    "Share of Likes (%)": (likes / total_likes * 100).round(1),
}
summary_df = pd.DataFrame(summary_columns).sort_values("Likes", ascending=False)

print("\nüìä THEME SUMMARY (Top 35 YouTube Comments)\n")
print(summary_df.to_string())

# --- Plot ---
# Pie chart of likes per theme; slices carry percentages only, theme names go
# into a legend to the right of the chart.
plt.figure(figsize=(10, 7))

pie_options = {
    "labels": None,
    "autopct": "%1.1f%%",
    "startangle": 140,
    "pctdistance": 0.75,
}
wedges, texts, autotexts = plt.pie(summary_df["Likes"], **pie_options)

plt.title("Thematic Engagement Distribution")

legend_options = {
    "title": "Theme",
    "loc": "center left",
    "bbox_to_anchor": (1.02, 0.5),
    "frameon": False,
}
plt.legend(wedges, summary_df.index, **legend_options)

plt.tight_layout()
plt.show()

# --- GitHub-ready output path (stable) ---
# Prefer data/processed relative to the working directory; when it is absent
# and ../data exists, write to ../data/processed instead.
OUTPUT_DIR = Path("data/processed")
alt = Path("../data/processed")
if not OUTPUT_DIR.exists() and alt.parent.exists():
    OUTPUT_DIR = alt

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_PATH = OUTPUT_DIR.joinpath("top35_youtube_comments_labeled_by_theme.csv")
ranked.to_csv(OUT_PATH, index=False, encoding="utf-8")
print(f"\nüìÅ Saved labeled top-35 to: {OUT_PATH}")


In [None]:
import pandas as pd
import textwrap
from pathlib import Path

# --- Find the labeled file in common locations ---
CANDIDATE_PATHS = [
    Path(root) / "data" / "processed" / "top35_youtube_comments_labeled_by_theme.csv"
    for root in ("", "..")
]

# First existing candidate, or None when the file has not been generated yet.
LABELED_PATH = next(filter(Path.exists, CANDIDATE_PATHS), None)
if LABELED_PATH is None:
    raise FileNotFoundError(
        "Could not find the labeled CSV. Run the previous code block first to generate it.\n"
        "Expected: data/processed/top35_youtube_comments_labeled_by_theme.csv"
    )

df_labeled_comments = pd.read_csv(LABELED_PATH)
print(f"‚úÖ Loaded labeled comments from: {LABELED_PATH}")

# A column named "index", if present, is exposed as "orig_index"
# (rename leaves the frame untouched when there is no such column).
df_labeled_comments = df_labeled_comments.rename(columns={"index": "orig_index"})


def _split_theme_cell(cell):
    """Parse one comma-separated `themes` cell into a list of stripped names."""
    if pd.isna(cell) or not cell:
        return []
    return [part.strip() for part in cell.split(",")]


df_labeled_comments["themes"] = df_labeled_comments["themes"].apply(_split_theme_cell)

# Alphabetical list of every theme that occurs in at least one row.
all_unique_themes = sorted(set().union(*df_labeled_comments["themes"]))

# Print the labelled top comments grouped by theme, most-liked first.
MAX_WIDTH = 100

print("\n=== YouTube Comments by Theme (Top 35) ===\n")

for theme in all_unique_themes:
    print(f"\n--- üîµ Theme: {theme} ---")

    # Every entry of all_unique_themes was collected from these rows, so the
    # selection always holds at least one comment; no empty-case branch needed.
    has_theme = df_labeled_comments["themes"].apply(lambda labels: theme in labels)
    theme_comments = df_labeled_comments[has_theme].sort_values(
        by="likeCount", ascending=False
    )

    for i, row in enumerate(theme_comments.itertuples(index=False), start=1):
        # An empty comment is read back from CSV as NaN (a float); textwrap
        # only accepts strings, so fall back to an empty string.
        comment_text = row.comment if isinstance(row.comment, str) else ""
        wrapped_comment = textwrap.fill(
            comment_text,
            width=MAX_WIDTH,
            break_long_words=False,
            replace_whitespace=False,
        )
        print(f"  {i}.  {row.likeCount} likes  |  Original Index: {row.orig_index}")
        print(f"     {wrapped_comment}")
        print("-" * MAX_WIDTH)


**3.0 Sentiment Analysis**


This includes a sentiment analysis on the overall dataset, as well as a manual review of 100 comments. Here, the doubtful or incorrect classifications are separated for closer review, before calculating the accuracy rate.  

In [None]:
!pip install -q transformers torch

import pandas as pd
import torch
import time
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

df = pd.read_csv(CLEAN_YOUTUBE_DATA)
print(f"‚úÖ Loaded {len(df):,} YouTube rows from:\n{CLEAN_YOUTUBE_DATA}")

# --- GitHub-ready output path (stable) ---
# Prefer data/processed relative to the working directory; when it is absent
# and ../data exists, write to ../data/processed instead.
OUTPUT_DIR = Path("data/processed")
alt = Path("../data/processed")
if not OUTPUT_DIR.exists() and alt.parent.exists():
    OUTPUT_DIR = alt

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_YOUTUBE_SENTIMENT_PATH = OUTPUT_DIR.joinpath("youtube_final_dataset_sentiment.csv")

# First candidate name that exists in the dataset wins.
text_col = None
for candidate in ("comment", "text", "content", "body"):
    if candidate in df.columns:
        text_col = candidate
        break
if text_col is None:
    raise ValueError("No text column found (expected one of: comment, text, content, body).")

print(f"üìù Using text column: {text_col}")

# Normalise to stripped strings, then drop placeholder texts; the index is
# renumbered so positions match the list of texts built from this frame.
df[text_col] = df[text_col].fillna("").astype(str).str.strip()
is_placeholder = df[text_col].isin(["[deleted]", "[removed]"])
df = df.loc[~is_placeholder].reset_index(drop=True)

# Hugging Face checkpoint id used for sentiment classification.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# NOTE(review): `truncation=True` is passed at load time here and again on
# every pipeline call below; presumably only the per-call argument takes
# effect — confirm against the transformers documentation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
    truncation=True,
    padding_side="right"
)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)

# Texts are classified in chunks of `batch_size`; `results` collects one
# output dict per text, in input order.
texts = df[text_col].tolist()
batch_size = 128
results = []

print("\nüîÑ Running 3-class sentiment (pos/neu/neg) with truncation to 512 tokens...\n")
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    out = sentiment_pipeline(batch, truncation=True, padding=True, max_length=512)
    results.extend(out)

    # Progress line on every 10th chunk; "\r" overwrites the previous one.
    if (i // batch_size) % 10 == 0:
        print(f"Processed {min(i + batch_size, len(texts))}/{len(texts)}", end="\r")

    # NOTE(review): 10 ms pause after every chunk; its purpose is not evident
    # from the code — confirm whether it is still needed.
    time.sleep(0.01)

print("\n‚úÖ Sentiment analysis complete!")

# Positional assignment: the i-th result belongs to the i-th row of df.
# Labels are stored upper-cased.
df["sentiment_label"] = [r["label"].upper() for r in results]
df["sentiment_score"] = [r["score"] for r in results]

df.to_csv(OUTPUT_YOUTUBE_SENTIMENT_PATH, index=False, encoding="utf-8")
print(f"üíæ Saved YouTube sentiment file to:\n{OUTPUT_YOUTUBE_SENTIMENT_PATH}")

# Last expression of the cell: preview of the first five classified comments.
df[[text_col, "sentiment_label", "sentiment_score"]].head(5)


In [None]:
import pandas as pd
from pathlib import Path

# --- Find the sentiment file in common locations ---
CANDIDATE_PATHS = [
    Path(root) / "data" / "processed" / "youtube_final_dataset_sentiment.csv"
    for root in ("", "..")
]

# First existing candidate, or None when the sentiment file has not been
# generated yet.
YOUTUBE_SENTIMENT = next(filter(Path.exists, CANDIDATE_PATHS), None)
if YOUTUBE_SENTIMENT is None:
    raise FileNotFoundError(
        "Could not find youtube_final_dataset_sentiment.csv.\n"
        "Run the sentiment generation block first.\n"
        "Expected: data/processed/youtube_final_dataset_sentiment.csv"
    )

df_youtube = pd.read_csv(YOUTUBE_SENTIMENT)
print(f"Loaded {len(df_youtube):,} YouTube rows from: {YOUTUBE_SENTIMENT}")


In [None]:
import pandas as pd

# Preview of the first 100 classified comments for the manual review.
df_loaded = pd.read_csv(YOUTUBE_SENTIMENT)
text_col = next((c for c in ["text","content","body","comment"] if c in df_loaded.columns), None)
# Fail with a clear message instead of an opaque KeyError on `None`.
if text_col is None:
    raise ValueError("No text column found in dataset.")
df_loaded[[text_col, "sentiment_label", "sentiment_score"]].head(100)



In [None]:
import pandas as pd


df = pd.read_csv(YOUTUBE_SENTIMENT)

# First candidate name that exists in the dataset wins.
text_col = None
for candidate in ("text", "content", "body", "comment"):
    if candidate in df.columns:
        text_col = candidate
        break
if text_col is None:
    raise ValueError("No text column found in dataset.")

# Row labels of the comments singled out for closer review.
selected_indices = [17, 52, 55, 62, 78, 86, 97]
review_columns = [text_col, "sentiment_label", "sentiment_score"]

df.loc[selected_indices, review_columns]


**Review of Misclassifications**

The comment referring to the high stream viewership suggests popularity and mass appeal, which could be interpreted as a more positive sentiment towards the brand, even though it has been classified as negative. The model struggles with slang, as 'ate' is used as a positive description, yet classified negative. 'My religion' is classified as neutral, while it implies strong devotion and admiration, thus better suited as a positive sentiment. Similarly, 'bangers' is often used to reference good sounding songs, therefore better suited as positive sentiment, also explaining why the user would want to know more information about them. 'Bring back real models' is classified as neutral, yet highlights the lack thereof in reference to the show. Therefore, it creates a more negative frame of the brand. Then the final comment is framed as negative, even though the term 'Desi ness' is a cultural reference that isn't strictly positive or negative. Therefore, a neutral category would have suited this expression better.


*Sentiment accuracy rate of 93%*

**3.1 Sentiment Popularity**

This reviews the popularity of each sentiment, first by reviewing their overall presence in the dataset, the top comments of each sentiment and the average like count per category.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of how many comments fall into each sentiment class, annotated
# with each class's share of the dataset.
df = pd.read_csv(YOUTUBE_SENTIMENT)

counts = df["sentiment_label"].value_counts()
percentages = counts.div(counts.sum()).mul(100).round(2)

color_map = {
    "POSITIVE": "#2ECC71",
    "NEGATIVE": "#E74C3C",
    "NEUTRAL": "#F1C40F"
}

# Labels without a configured colour are drawn in gray.
bar_colors = [color_map.get(str.upper(label), "gray") for label in counts.index]

plt.figure(figsize=(6,4))
bars = plt.bar(counts.index, counts.values, color=bar_colors)
plt.title("YouTube Sentiment Distribution")
plt.xlabel("Sentiment")
plt.ylabel("Number of Comments")
plt.xticks(rotation=0)

# Percentage label slightly above each bar (offset = 1% of the tallest bar).
label_offset = counts.max()*0.01
text_style = {"ha": "center", "va": "bottom", "fontsize": 10}
for bar, pct in zip(bars, percentages):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + label_offset, f"{pct}%", **text_style)

plt.tight_layout()
plt.show()

print("Sentiment distribution (% of total):\n")
for label, pct in percentages.items():
    print(f"{label}: {pct:.2f}%")


In [None]:
import pandas as pd

# Lists the 15 most-liked comments of each sentiment class.
df = pd.read_csv(YOUTUBE_SENTIMENT)
df.columns = df.columns.str.lower()
print(f"Comments loaded: {len(df):,}")

text_col = next((c for c in ["text", "content", "body", "comment"] if c in df.columns), None)
like_col = next((c for c in ["likecount", "score", "likes"] if c in df.columns), None)

if text_col is None:
    raise ValueError("‚ùå No text column found (expected 'text', 'content', 'body', or 'comment').")
if like_col is None:
    print("‚ö†Ô∏è No likeCount/score column found ‚Äî defaulting likes to 0.")
    df["likecount"] = 0
    like_col = "likecount"


def _top_liked(frame, label_fragment, likes_column, n=15):
    """Return the *n* most-liked rows whose sentiment label contains *label_fragment*.

    Matching is case-insensitive; rows with a missing label never match
    (``na=False``) instead of breaking the boolean selection.
    """
    matches = frame["sentiment_label"].str.lower().str.contains(label_fragment, na=False)
    return frame[matches].sort_values(likes_column, ascending=False).head(n)


def _print_comments(rows, text_column, likes_column):
    """Print each comment (first 500 characters) with its sentiment and like count."""
    for _, row in rows.iterrows():
        print(f"üí¨ {str(row[text_column])[:500]}")
        print(f"   üîπ Sentiment: {row['sentiment_label']} ({row['sentiment_score']:.3f}) | Likes: {row[likes_column]}\n")


# --- Top 15 NEGATIVE by likes ---
most_negative_upvoted = _top_liked(df, "neg", like_col)

print("\nüî¥ Top 15 Most-Liked Negative Comments:\n")
_print_comments(most_negative_upvoted, text_col, like_col)

# --- Top 15 POSITIVE by likes ---
most_positive_upvoted = _top_liked(df, "pos", like_col)

print("\nüü¢ Top 15 Most-Liked Positive Comments:\n")
_print_comments(most_positive_upvoted, text_col, like_col)

# --- Top 15 NEUTRAL by likes ---
most_neutral_upvoted = _top_liked(df, "neu", like_col)

print("\nüü° Top 15 Most-Liked Neutral Comments:\n")
_print_comments(most_neutral_upvoted, text_col, like_col)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Average, total and number of likes per sentiment class, as a bar chart plus
# a printed summary.
df = pd.read_csv(YOUTUBE_SENTIMENT)
df.columns = df.columns.str.lower()

like_col = next((c for c in ["likecount", "score", "likes"] if c in df.columns), None)
if like_col is None:
    raise ValueError("‚ùå No like/score column found in dataset.")

sentiment_stats = df.groupby("sentiment_label")[like_col].agg(["mean", "sum", "count"]).reset_index()

plt.figure(figsize=(7,4))

color_map = {
    "POSITIVE": "#2ECC71",
    "NEGATIVE": "#E74C3C",
    "NEUTRAL": "#F1C40F"
}

# Fixed left-to-right order for the known classes. Any other label is kept
# (not turned into NaN) and placed after them, so an unexpected class shows up
# in the chart instead of crashing the cell.
label_order = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
label_rank = {label: position for position, label in enumerate(label_order)}
sentiment_stats = sentiment_stats.sort_values(
    "sentiment_label",
    key=lambda labels: labels.map(label_rank).fillna(len(label_order)),
    kind="stable",
)

# Same fallback as the distribution chart: labels without a configured colour
# are drawn in gray.
bars = plt.bar(
    sentiment_stats["sentiment_label"],
    sentiment_stats["mean"],
    color=[color_map.get(label, "gray") for label in sentiment_stats["sentiment_label"]]
)
plt.title("Average Likes per Sentiment Category (YouTube)")
plt.ylabel("Average Likes per Comment")
plt.xlabel("Sentiment")
plt.xticks(rotation=0)

# Value label slightly above each bar (offset = 1% of the tallest bar).
for bar in bars:
    plt.text(
        bar.get_x() + bar.get_width()/2,
        bar.get_height() + sentiment_stats["mean"].max()*0.01,
        f"{bar.get_height():.0f}",
        ha="center",
        va="bottom",
        fontsize=9
    )

plt.tight_layout()
plt.show()

print("üìä Sentiment Popularity Summary:")
for _, row in sentiment_stats.iterrows():
    print(f"{row['sentiment_label']}: {row['count']} comments | Avg Likes: {row['mean']:.1f} | Total Likes: {row['sum']:.0f}")

**4.0 Text Frequency Analysis**

This part includes an analysis of the most frequent words in the dataset. From the most frequent or thematically relevant words, the top 10 most liked comments are reviewed, to gain deeper contextual awareness.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Word-frequency table over the whole dataset (English stop words removed).
df = pd.read_csv(CLEAN_YOUTUBE_DATA)

# First candidate name that exists in the dataset wins.
text_col = None
for candidate in ("text", "content", "body", "comment"):
    if candidate in df.columns:
        text_col = candidate
        break
if text_col is None:
    raise ValueError("No text column found.")

# All comments are concatenated into a single document, so the counts below
# are total occurrences across the dataset.
all_text = " ".join(df[text_col].dropna().astype(str))

vectorizer = CountVectorizer(stop_words="english", token_pattern=r"\b\w+\b")
X = vectorizer.fit_transform([all_text])

words = vectorizer.get_feature_names_out()
# X has a single row (one document); take it as a plain list of counts.
counts = X.toarray()[0].tolist()

word_counts = pd.DataFrame({"word": words, "count": counts})

# Additional tokens excluded from the ranking.
custom_stopwords = {"s","t","g","1","m","la","000", 'y', 'don', 've', 'el', 'que', 'did'}
word_counts = word_counts.loc[~word_counts["word"].isin(custom_stopwords)]

by_frequency = word_counts.sort_values("count", ascending=False)
top_words = by_frequency.head(15).reset_index(drop=True)

# Number the ranking from 1 for display.
top_words.index += 1

print("Top 15 Words in the Dataset:")
cell_css = {
    "border": "1px solid black",
    "text-align": "left",
}
display(top_words.style.set_properties(**cell_css))


In [None]:
# For each of the most frequent words, show the most-liked comments that
# contain it as a whole word. Relies on `df`, `text_col` and `top_words` from
# the word-frequency cell.
import re

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

TOP_N_WORDS = 15
TOP_N_COMMENTS = 15
LIKES_COL = "likeCount"

# Loop-invariant work is done once. The cleaned dataset stores its stable id
# in "stable_comment_index"; "orig_index" is kept as an accepted alternative,
# so whichever is present is shown next to the comment.
index_cols = [c for c in ("stable_comment_index", "orig_index") if c in df.columns]
cols_to_show = [text_col, LIKES_COL] + index_cols
searchable_text = df[text_col].fillna("").astype(str)

for word in top_words["word"].head(TOP_N_WORDS):
    # Escape the word so it is always matched literally inside the pattern.
    pattern = rf"\b{re.escape(word)}\b"
    subset = df[searchable_text.str.contains(pattern, case=False, regex=True)]

    if subset.empty:
        continue

    top_comments = (
        subset.sort_values(LIKES_COL, ascending=False)
              .head(TOP_N_COMMENTS)[cols_to_show]
              .reset_index(drop=True)
    )

    print(f"\nüîπ Top {TOP_N_COMMENTS} most-liked comments containing '{word}':\n")
    display(top_comments)


 **5.0 Topic Modelling**

This part conducts topic modelling on the entire dataset using semantic embeddings, which capture the meaning of the word and don't rely exclusively on frequency patterns.

In [None]:
!pip install -q bertopic sentence-transformers umap-learn hdbscan nltk

import pandas as pd
import nltk
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


df_yt = pd.read_csv(CLEAN_YOUTUBE_DATA)
print(f"‚úÖ Loaded cleaned YouTube dataset with {len(df_yt):,} rows")

# First candidate name that exists in the dataset wins.
text_col = None
for candidate in ("comment", "text", "content", "body"):
    if candidate in df_yt.columns:
        text_col = candidate
        break
if text_col is None:
    raise ValueError("‚ùå No text column found (expected one of: comment, text, content, body).")

df_yt[text_col] = df_yt[text_col].fillna("").astype(str)

# Only whitespace-only texts are removed; everything else is kept.
has_content = df_yt[text_col].str.strip() != ""
df_yt = df_yt.loc[has_content].copy()
print(f"üßπ Kept all {len(df_yt):,} comments")

# Length flags used by the outlier diagnostics (short = fewer than 5 characters).
df_yt["char_len"] = df_yt[text_col].str.len()
df_yt["is_short"] = df_yt["char_len"].lt(5)


# Stopwords
nltk.download("stopwords")

# Extra terms excluded from topic keywords, on top of the NLTK lists.
extra_stopwords = {
    "https", "http", "www", "com", "jpg", "removed", "deleted", "amp",
    "vs", "victoria", "secret", "victorias", "show",
    "like", "really", "girl", "girls", "omg", "lol",
    "video", "watching", "watched", "youtube"
}
stopwords_multi = set(extra_stopwords)
for language in ("english", "spanish", "portuguese"):
    stopwords_multi.update(stopwords.words(language))
stopwords_multi = list(stopwords_multi)


# Vectorizer handed to BERTopic. The token pattern accepts runs of three or
# more characters drawn from ASCII letters, the U+00C0–U+00FF range and the
# apostrophe; single words and two-word phrases are considered.
vectorizer_model = CountVectorizer(
    stop_words=stopwords_multi,
    token_pattern=r"[A-Za-z\u00C0-\u00FF']{3,}",
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 2)
)


# Sentence-embedding model identified by its checkpoint name.
embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

# NOTE(review): no random seed is passed anywhere in this cell, so topic
# assignments may differ between runs — confirm whether reproducible topics
# are required for the analysis.
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    language="multilingual",
    min_topic_size=50,
    calculate_probabilities=True,
    verbose=True
)

# Fit model
# One topic id per comment is stored on the frame; `probs` is not used again
# in this cell.
texts = df_yt[text_col].tolist()
topics, probs = topic_model.fit_transform(texts)
df_yt["bertopic_topic"] = topics


# Output topic overview
pd.set_option("display.max_colwidth", None)
topic_info = topic_model.get_topic_info()

print(f"\nüìä Topic overview: total {len(topic_info)} topics (including -1 outliers)")
print("\n‚úÖ Sanity check (all rows assigned a topic):")
print("Rows in df_yt:", len(df_yt))
print("Topic assignments:", df_yt["bertopic_topic"].notna().sum())
print("\nTopic distribution:")
print(df_yt["bertopic_topic"].value_counts().sort_index())

# First 15 rows of the overview once the -1 (outlier) row is excluded.
top_topic_info = topic_info.loc[topic_info["Topic"].ne(-1)].head(15)
print("\nüìä Top 15 non-outlier topics:")
print(top_topic_info.to_string(index=False))

top_topics = top_topic_info["Topic"].tolist()

print("\nüîç Inspecting TOP 15 topics (excluding -1) with 15 example comments each:")
for t in top_topics:
    print(f"\n{'=' * 60}")
    print(f"üß© Topic {t}")
    print("Top words:", topic_model.get_topic(t))
    print("\nüí¨ Sample comments:")

    # First 15 comments assigned to this topic, in dataset order.
    in_topic = df_yt["bertopic_topic"].eq(t)
    sample_comments = df_yt.loc[in_topic, text_col].head(15)
    for i, c in enumerate(sample_comments, start=1):
        print(f"{i:2d}. {c}")


# How many comments were left unassigned (-1), and how many of those are very short.
outlier_mask = df_yt["bertopic_topic"].eq(-1)
outlier_count = int(outlier_mask.sum())
print("\nüìå Outlier diagnostics:")
print(f"Outlier (-1) comments: {outlier_count:,} / {len(df_yt):,} ({outlier_count/len(df_yt)*100:.1f}%)")

if outlier_count > 0:
    short_outliers = int(df_yt.loc[outlier_mask, "is_short"].sum())
    short_share = short_outliers/outlier_count*100
    print(f"Short (<5 chars) within outliers: {short_outliers:,} / {outlier_count:,} "
          f"({short_share:.1f}%)")
