In [55]:
!pip install dotenv google-api-python-client openai torch nltk banglanltk swifter transformers openpyxl emoji --q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for swifter (setup.py) ... [?25l[?25hdone


*****
## Necessary Imports

In [56]:
import os
import pandas as pd
import numpy as np
import random
import time
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from openai import OpenAI
import nltk
import banglanltk
import torch
import swifter
import emoji

nltk.download('stopwords')

from nltk.corpus import stopwords
from dotenv import load_dotenv
from googleapiclient.discovery import build
from transformers import AutoModelForCausalLM, AutoTokenizer

# custom modules
from bangla_stopwords import bangla_stopwords as bn_stopwords
from slang_text import slang_text_dict

# warnings
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
load_dotenv()
youtube_api_key = os.getenv("YOUTUBE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

*****
## Fetch Comments from Youtube Video via Youtube API

In [None]:
def get_youtube_comments(video_id, max_comments=1000, api_key=youtube_api_key):
    """ Fetch comments from a youtube video via youtube api key """
    youtube = build('youtube', 'v3', developerKey=api_key)
    comments = []
    next_page_token = None

    while True:
        if len(comments) >= max_comments:
            print(f"Reached max comments limit: {max_comments}")
            break

        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100, # max results per request
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response.get('items', []):
            comment_snippet = item['snippet']['topLevelComment']['snippet']

            comments.append({
                "comment_id": item["snippet"]["topLevelComment"]["id"],
                "comment": comment_snippet["textDisplay"]
            })

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    print(f"Collected {len(comments)} comments from video ID: {video_id}")
    return comments

In [None]:
# List of YouTube video IDs to fetch comments from
video_ids = [
    "_vXejyrQ-LM",
    "uKM3hEbLEOg",
    "3EVTSc42fhM",
    "YykjpeuMNEk",
    "Y2xgEEhtTAM",
    "tn3dTbE1d4I",
    "OQ57q3lDlBA",
    "ecq0id1uMcU"
]

all_comments = []

# fetch comments from each video
for video in video_ids:
    comments = get_youtube_comments(video, max_comments=4000)
    all_comments.extend(comments)

print("Total Comments Fetched:", len(all_comments))

Collected 487 comments from video ID: _vXejyrQ-LM
Reached max comments limit: 4000
Collected 4000 comments from video ID: uKM3hEbLEOg
Reached max comments limit: 4000
Collected 4000 comments from video ID: 3EVTSc42fhM
Reached max comments limit: 4000
Collected 4000 comments from video ID: YykjpeuMNEk
Reached max comments limit: 4000
Collected 4000 comments from video ID: Y2xgEEhtTAM
Reached max comments limit: 4000
Collected 4000 comments from video ID: tn3dTbE1d4I
Collected 965 comments from video ID: OQ57q3lDlBA
Reached max comments limit: 4000
Collected 4000 comments from video ID: ecq0id1uMcU
Total Comments Fetched: 25452


In [None]:
os.makedirs("./data", exist_ok=True)
df = pd.DataFrame(all_comments)
df.to_csv("./data/youtube_comments.csv", index=False, encoding="utf-8")

In [None]:
for x in range(10):
    print(random.choice(df['comment'].values))

এই ভিডিওটা দেখেই ওদের ভিডিও দেখা শুরু করছিলাম।
🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣😂😂😂😂😂😂😂😂😂😂😂
আমি!  হ্যা এটাই আমি
Do you hear my husband??? 😂😂😂😂 Xoxo
❤❤❤❤❤❤❤❤
যতবার দেখি ততবার ভালো লাগে !
Sound কম
চোখে পানি এসে গেলো 😢😢😢😢😢😢😢😢😢😢
Purai pankha
কিছু মানুষ মরে যায় পঁচিশের আগেই💔


*****
## Clean the Dataset before Labeling

In [None]:
df = pd.read_csv("./data/youtube_comments.csv", encoding="utf-8")
df.head()

Unnamed: 0,comment_id,comment
0,UgwjLSKgVwqG3Vy2csB4AaABAg,"গানটা যখন শুনছি, তখনই এই শহরটা ছাড়ার ঘন্টা বেঁ..."
1,Ugz4sENWkAzW5NUVV014AaABAg,tnx gan ta onak sundor.we hope that amon sundo...
2,Ugzn3pLrMMbWBLcxnPx4AaABAg,Ei shohor amar r noy 💔💔
3,UgxEIk3qaNEzZwceWf54AaABAg,"যারা নিজের এলাকা ছেড়ে দূরে চলে গেছে, তাদের জন্..."
4,UgwESidmpzX8YPYaKfR4AaABAg,"আজ ২২-১০-২৫ স্মীতি রেখে গেলাম আপনাদের মাঝে,,,য..."


In [None]:
df.drop(columns=["comment_id"], inplace=True)

In [None]:
def remove_emojis(text):
    """ Remove emojis from text """
    return emoji.replace_emoji(text, replace='')

In [None]:
def remove_html_tags(data):
    """ Remove html tags from text """
    pattern = re.compile('<.*?>')
    return pattern.sub("", data)

In [None]:
def remove_urls(data):
    """ Remove URLs from text """
    pattern = re.compile(r'https?://[^\s]+|www\.[^\s]+')
    return pattern.sub("", data)

In [None]:
def replace_slang(text, slang_dict=slang_text_dict):
    """ Replace slang words in text using the provided slang dictionary """
    new_text = []

    for word in text.lower().split():
        if word in slang_dict:
            new_text.append(slang_dict[word])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [None]:
df['clean_comment'] = df['comment'].apply(remove_emojis).apply(remove_html_tags).apply(remove_urls)

In [None]:
df.sample(10)

Unnamed: 0,comment,clean_comment
11915,Next year 5 B viewers ❤,Next year 5 B viewers
22359,Toma Der Jai Clock Achi Oirokom Ama Diru Achi ...,Toma Der Jai Clock Achi Oirokom Ama Diru Achi ...
187,এক কথায় গানটা অনবদ্য সৃষ্টি...গানের গভীরতা ভীষ...,এক কথায় গানটা অনবদ্য সৃষ্টি...গানের গভীরতা ভীষ...
15929,অনেক জায়গায় দাদু দিদা 100 বছর বয়সএর পর মারা...,অনেক জায়গায় দাদু দিদা 100 বছর বয়সএর পর মারা...
4321,ভালো লাগসিলনা এই গান টা সুনতে আস্লাম,ভালো লাগসিলনা এই গান টা সুনতে আস্লাম
19185,Nice Kiran da 😍😍,Nice Kiran da
20570,ভাই আপনাদের সব গুলা video অস্থির।,ভাই আপনাদের সব গুলা video অস্থির।
8891,coldplay: do we have brendon,coldplay: do we have brendon
8288,গান টা আমার প্রিয় গান <br>এখন তো ১২:৩০™এসে গান...,গান টা আমার প্রিয় গান এখন তো ১২:৩০এসে গান শুনল...
14401,Hi sir 🖐️,Hi sir


In [None]:
for index, row in df.iterrows():
    if str(row['clean_comment']).strip() == "":
        df.at[index, 'clean_comment'] = np.nan

In [None]:
df.isnull().sum()

comment             0
clean_comment    2496
dtype: int64

In [None]:
df.dropna(subset=['clean_comment'], inplace=True)
df.isnull().sum()

comment          0
clean_comment    0
dtype: int64

In [None]:
df.duplicated().sum()

np.int64(1558)

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

(21398, 2)

In [None]:
df.sample(10)

Unnamed: 0,comment,clean_comment
2505,মরে গেছি 😢 বয়স ২৫,মরে গেছি বয়স ২৫
12956,Dada amio tomar khelar moddhe aste chaii <a hr...,Dada amio tomar khelar moddhe aste chaii
23122,sotty onk funny,sotty onk funny
10630,Finally i found that song my favourite song ❤❤❤❤,Finally i found that song my favourite song
12182,❤june 10 2015❤❤❤,june 10 2015
21637,2024 সালে দেখতিছি।,2024 সালে দেখতিছি।
17750,Model activity task niye vedio koro,Model activity task niye vedio koro
20784,হিন্দি গানের সুর ছাড়া আমাদের দেশের কোন ভাল গা...,হিন্দি গানের সুর ছাড়া আমাদের দেশের কোন ভাল গা...
21029,hihihihihi joss,hihihihihi joss
17500,Somaj birodhi 😄😄😄,Somaj birodhi


In [None]:
df['clean_comment'] = df['clean_comment'].apply(replace_slang)

In [None]:
df.sample(10)

Unnamed: 0,comment,clean_comment
4443,অনেক আগেই মরে গেছি 😭,অনেক আগেই মরে গেছি
14295,Her westbengal Newafeed,her westbengal newafeed
17740,I love you kiran dada,i love you kiran dada
4966,কি বলি ভাই ডিপ্রেশন এর ঔষধ ছাড়া চলে না😅,কি বলি ভাই ডিপ্রেশন এর ঔষধ ছাড়া চলে না
19318,Ami first e কচ্ছপ এর মতো লিখি R sese r dike খ...,ami first e কচ্ছপ এর মতো লিখি are or our sese ...
22067,"<a href=""https://www.youtube.com/watch?v=ecq0i...",0:37
8400,গানটা শুনে মনে হচ্ছে যদি আবার ফিরে পেতাম সেই শ...,গানটা শুনে মনে হচ্ছে যদি আবার ফিরে পেতাম সেই শ...
5621,কে কে টিকটক থেকে শুনতে আসছেন...?😊,কে কে টিকটক থেকে শুনতে আসছেন...?
5757,আহারে শৈশব &quot;<br>হারিয়ে বেলা শৈশবের খোঁজে 💔💔,আহারে শৈশব &quot;হারিয়ে বেলা শৈশবের খোঁজে
13855,Sotti dada haste haste morei jabo ebar 😂😂😂,sotti dada haste haste morei jabo ebar


In [None]:
df.drop(columns=["comment"], inplace=True)
df.rename(columns={"clean_comment": "comment"}, inplace=True)

In [None]:
df.to_csv("./data/clean_youtube_comments.csv", index=False, encoding="utf-8")

*****
*****

*****
## Label the Dataset with HuggingFace Qwen2.5 72B Model via RAG Approach

In [61]:
df = pd.read_csv("./clean_youtube_comments.csv")
df.head()

Unnamed: 0,comment
0,"গানটা যখন শুনছি, তখনই এই শহরটা ছাড়ার ঘন্টা বেঁ..."
1,tnx gan ta onak sundor.we hope that amon sundo...
2,ei shohor amar are or our noy
3,"যারা নিজের এলাকা ছেড়ে দূরে চলে গেছে, তাদের জন্..."
4,"আজ ২২-১০-২৫ স্মীতি রেখে গেলাম আপনাদের মাঝে,,,য..."


In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {torch.cuda.get_device_name() if device else "CPU"}")

Device: NVIDIA A100-SXM4-80GB


In [6]:
# load model
model_name = "Qwen/Qwen2.5-14B-Instruct"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.70G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [9]:
def label_comment(comment_text):
    """ Label the comments with type and sentiment using Qwen2.5 14B model """
    try:
        prompt = f"""
        Analyze the following YouTube comment and provide two things:
        1. Type - Describe the *Kind* of comment it is (choose one from: opinion, emotional, question, informative, promotional)
        2. Sentiment - Describe the *Sentiment* of the comment (choose one from: positive, negative, neutral)

        Return the response strictly in JSON format as:
        {{
            "Type": "<Kind of comment>",
            "Sentiment": "<Sentiment of comment>"
        }}

        Comment: {comment_text}
        Note: The comment can be in any language (e.g., Bengali, English, Romanize Bangla, etc), so analyze accordingly.
        """
        messages = [
            {"role": "system", "content": "You are Qwen, a multilingual sentiment analysis expert."},
            {"role": 'user', "content": prompt}
        ]

        input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = tokenizer([input], return_tensors='pt').to(device)

        # Get raw response
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=128
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # print("\nRaw Response:", response[0], "\n")   # Debug

        # --- Now parse JSON ---
        result = json.loads(response[0])

        return result['Type'], result['Sentiment']

    except Exception as e:
        print(f"Error Labeling Comment: {e}")
        return None, None

In [63]:
sample = str(df.sample(1)['comment'].values[0])
print("Sample Comment to Labeling:", sample)
print("Label:", label_comment(sample))

Sample Comment to Labeling: 0:37 my favourite line
Label: ('opinion', 'positive')


In [69]:
swifter.config.progress_bar = True
start = time.time()

df[['type', 'sentiment']] = df['comment'].swifter.apply(
    lambda x: pd.Series(label_comment(x))
)

print(f"Total Processing Time: {((time.time() - start)/60):.2f} minutes")

Pandas Apply:   0%|          | 0/21398 [00:00<?, ?it/s]

Error Labeling Comment: Expecting value: line 1 column 1 (char 0)
Error Labeling Comment: Extra data: line 6 column 1 (char 59)
Error Labeling Comment: Expecting value: line 1 column 1 (char 0)
Error Labeling Comment: Extra data: line 6 column 1 (char 57)
Error Labeling Comment: Expecting value: line 1 column 1 (char 0)
Error Labeling Comment: Extra data: line 6 column 1 (char 57)
Error Labeling Comment: Expecting value: line 1 column 1 (char 0)
Error Labeling Comment: Extra data: line 6 column 1 (char 59)
Error Labeling Comment: Extra data: line 6 column 1 (char 59)
Error Labeling Comment: Extra data: line 6 column 1 (char 53)
Error Labeling Comment: Extra data: line 6 column 1 (char 55)
Error Labeling Comment: Extra data: line 6 column 1 (char 57)
Error Labeling Comment: Extra data: line 6 column 1 (char 59)
Error Labeling Comment: Expecting value: line 1 column 1 (char 0)
Error Labeling Comment: Extra data: line 6 column 1 (char 57)
Error Labeling Comment: Extra data: line 6 column 

*****
## Saving the Final Dataset

In [73]:
df.sample(10)

Unnamed: 0,comment,type,sentiment
15623,you sho the pendent your wading pandent,informative,neutral
4843,যার কন্টে গান গাওয়া তাকে ধন্য বাদ,emotional,positive
10801,kokhono choto chele bole tar video dekhini,informative,neutral
9917,india,informative,neutral
8822,sumon thaka taka piymuo.,emotional,neutral
8594,yesterday listened live at wembley london,informative,neutral
15508,darun hoye6e vai....,emotional,positive
8517,i love india from mars,opinion,positive
10516,9 june 2025,informative,neutral
2919,জীবনের সেরা একটা গান,opinion,positive


In [74]:
df.to_csv('./youtube_comment_dataset.csv', encoding='utf-8')