<a href="https://colab.research.google.com/github/polash102/Data-Mining_caption-comment-analysis/blob/main/Lab_2_477.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# STEP 1: Install Required Tools
# ===============================
!pip install -U yt-dlp
!git clone https://github.com/egbertbouman/youtube-comment-downloader.git
!pip install ./youtube-comment-downloader
!pip install webvtt-py

fatal: destination path 'youtube-comment-downloader' already exists and is not an empty directory.
Processing ./youtube-comment-downloader
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: youtube-comment-downloader
  Building wheel for youtube-comment-downloader (pyproject.toml) ... [?25l[?25hdone
  Created wheel for youtube-comment-downloader: filename=youtube_comment_downloader-0.1-py3-none-any.whl size=8211 sha256=41e3e8084392e672d0b89f0c9be744f812def3929fdcad7d14378e2713a1329e
  Stored in directory: /root/.cache/pip/wheels/57/ff/30/9f7845d5d5e4ae1465d1898d3e95377e4119a41fe7a189f0c1
Successfully built youtube-comment-downloader
Installing collected packages: youtube-comment-downloader
  Attempting uninstall: youtube-comment-downloader
    Found existing installation: youtube-comment-downloader 0.1
    Uninstalling youtube

In [16]:
# ===============================
# STEP 2: Import Libraries
# ===============================
import os
import subprocess
import json
import pandas as pd
import re
import nltk
import webvtt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [17]:
# ===============================
# STEP 3: Define Functions
# ===============================
def download_captions(url, output_filename):
    """Download the English auto-generated captions of a video as a VTT file.

    Runs ``python3 -m yt_dlp`` with ``--write-auto-sub --sub-lang en
    --skip-download`` and moves the resulting ``.vtt`` file to
    ``output_filename``.

    Args:
        url: URL of the video whose captions are requested.
        output_filename: Destination path of the caption file.

    Returns:
        None. Progress and failures are reported with ``print``; a failing
        ``yt_dlp`` run (``subprocess.CalledProcessError``) is caught and
        reported instead of being raised.
    """
    temp_filename = "temp_captions.vtt"
    temp_prefix = temp_filename.split('.')[0]
    destination = os.path.abspath(output_filename)

    def find_temp_captions():
        # The name written by yt-dlp may differ from the requested one, so
        # match on prefix and extension rather than on the exact filename.
        return sorted(
            name for name in os.listdir('.')
            if name.startswith(temp_prefix)
            and name.endswith('.vtt')
            and os.path.abspath(name) != destination
        )

    # Remove leftovers of earlier runs; otherwise a video without captions
    # would silently be given the caption file of a previous download.
    for stale_file in find_temp_captions():
        os.remove(stale_file)

    try:
        command = [
            'python3', '-m', 'yt_dlp',
            '--write-auto-sub',
            '--sub-lang', 'en',
            '--skip-download',
            '-o', temp_filename,
            url
        ]
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error: {e}")
        return

    downloaded_files = find_temp_captions()
    if downloaded_files:
        # os.replace overwrites an existing destination on every platform.
        os.replace(downloaded_files[0], output_filename)
        print(f"✅ Captions downloaded and renamed to '{output_filename}'")
    else:
        print(f"❌ Error: Could not find downloaded caption file for {url}")

def download_comments(url, output_filename, limit=200):
    """Fetch the comments of a video with ``youtube_comment_downloader``.

    The downloader is run as ``python3 -m youtube_comment_downloader`` in a
    subprocess; its captured stdout and stderr are echoed afterwards.

    Args:
        url: URL of the video.
        output_filename: File the downloader is told to write to (``--output``).
        limit: Value passed to the downloader's ``--limit`` option.

    Returns:
        None. A non-zero exit status is caught and reported with ``print``.
    """
    def show_streams(stdout_text, stderr_text):
        # Echo what the subprocess wrote, in the same order for both outcomes.
        print("Command stdout:")
        print(stdout_text)
        print("Command stderr:")
        print(stderr_text)

    command = ['python3', '-m', 'youtube_comment_downloader']
    command += ['--url', url]
    command += ['--output', output_filename]
    command += ['--limit', str(limit)]
    print(f"Running command: {' '.join(command)}")

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error: {e}")
        show_streams(e.stdout, e.stderr)
        return

    show_streams(result.stdout, result.stderr)
    print(f"✅ Comments downloaded to '{output_filename}'")


def clean_vtt_file(vtt_input, txt_output="captions_cleaned.txt"):
    """Flatten a WebVTT caption file into a single line of plain caption text.

    The following are dropped: the header block (a first line starting with
    ``WEBVTT`` and everything up to the first blank line), cue timing lines
    (those containing ``-->``), blank lines, purely numeric lines and inline
    ``<...>`` cue tags. The remaining lines are joined with single spaces.

    Args:
        vtt_input: Path of the VTT file to read (UTF-8, optional BOM).
        txt_output: Path of the text file to write.

    Returns:
        None. A confirmation is printed once the file has been written.
    """
    tag_pattern = re.compile(r'<[^>]*>')
    lines = []
    in_header = False
    # "utf-8-sig" reads plain UTF-8 as well and drops a leading byte-order mark.
    with open(vtt_input, "r", encoding="utf-8-sig") as file:
        for line_number, line in enumerate(file):
            stripped = line.strip()
            if line_number == 0 and stripped.startswith("WEBVTT"):
                in_header = True
                continue
            if in_header:
                # The header ends at the first blank line; a timing line also
                # ends it in case that blank line is missing.
                if stripped == "" or "-->" in stripped:
                    in_header = False
                continue
            if "-->" in stripped or stripped == "" or stripped.isdigit():
                continue
            text = tag_pattern.sub("", stripped).strip()
            if text:
                lines.append(text)
    with open(txt_output, "w", encoding="utf-8") as out:
        out.write(" ".join(lines))
    print(f"✅ Cleaned VTT saved to '{txt_output}'")

def parse_comments_jsonl(filepath='comments.json'):
    """Load a JSON-lines comment file into a DataFrame.

    Each non-blank line is parsed as one JSON object; its ``author``, ``time``
    and ``text`` fields become the ``username`` (leading ``@`` removed),
    ``timestamp_text`` and ``comment_text`` columns. Missing or ``null``
    fields become empty strings. Lines that are not valid JSON, or that are
    not JSON objects, are reported and skipped.

    Args:
        filepath: Path of the JSON-lines file.

    Returns:
        A DataFrame with the three columns above; it is empty (but still has
        those columns) when the file is missing or holds no usable comment.
    """
    columns = ['username', 'timestamp_text', 'comment_text']
    comments = []
    if not os.path.exists(filepath):
        print(f"⚠️ Comment file not found at '{filepath}'")
        return pd.DataFrame(comments, columns=columns)
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                comment = json.loads(line)
            except json.JSONDecodeError:
                comment = None
            # Valid JSON that is not an object (a list, a string, ...) has no
            # fields to read, so it is treated like an unparsable line.
            if not isinstance(comment, dict):
                print(f"⚠️ Skipping invalid JSON line in {filepath}: {line}")
                continue
            comments.append({
                # `or ''` also covers fields that are present but null.
                'username': str(comment.get('author') or '').lstrip('@'),
                'timestamp_text': comment.get('time') or '',
                'comment_text': str(comment.get('text') or ''),
            })
    if not comments:
        print(f"⚠️ No valid comments found in '{filepath}'")
    return pd.DataFrame(comments, columns=columns)

def parse_captions_vtt(filepath='captions.vtt'):
    """Read a VTT caption file and split its text into sentences.

    The file is parsed with ``webvtt.read``. If that fails for any reason, a
    line-based fallback is used which skips the header block, cue timing
    lines, blank lines and purely numeric lines, and removes inline ``<...>``
    cue tags.

    Args:
        filepath: Path of the VTT file.

    Returns:
        A DataFrame with one ``caption_sentence`` column. The caption text is
        joined with spaces and split after ``.``, ``!`` or ``?`` followed by
        spaces; empty sentences are dropped, so an empty caption file yields
        an empty DataFrame.
    """
    try:
        captions = [caption.text.strip() for caption in webvtt.read(filepath)]
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit are not swallowed by the fallback.
        tag_pattern = re.compile(r'<[^>]*>')
        captions = []
        in_header = False
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            for line_number, line in enumerate(f):
                line = line.strip()
                if line_number == 0 and line.startswith('WEBVTT'):
                    in_header = True
                    continue
                if in_header:
                    # Header metadata runs up to the first blank line; a
                    # timing line also ends it if that blank line is missing.
                    if not line or '-->' in line:
                        in_header = False
                    continue
                if not line or '-->' in line or line.isdigit():
                    continue
                text = tag_pattern.sub('', line).strip()
                if text:
                    captions.append(text)
    full_text = ' '.join(captions)
    sentences = [
        sentence
        for sentence in re.split(r'(?<=[.!?]) +', full_text)
        if sentence.strip()
    ]
    return pd.DataFrame(sentences, columns=['caption_sentence'])

# Text cleaning pipeline
# The NLTK data must be present before `stopwords.words` is called below.
# Fetch it here so that this cell also runs on a fresh kernel, where the
# separate NLTK download cell further down has not been executed yet.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package, quiet=True)

# A set gives constant-time membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def normalize_text(text):
    """Lower-case ``text`` and reduce it to the letters a-z and whitespace.

    Bracketed spans such as ``[Music]`` are removed first, then every
    character that is neither a lower-case ASCII letter nor whitespace.
    Leading and trailing whitespace is stripped from the result.
    """
    cleaned = text.lower()
    # Order matters: brackets must still be present when the first pattern runs.
    for pattern in (r'\[.*?\]', r'[^a-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not stop words and are longer than two characters."""
    kept = []
    for token in tokens:
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens):
    """Return a list with every token passed through ``lemmatizer.lemmatize``."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def clean_text_pipeline(text):
    """Run the full cleaning chain on ``text`` and return the resulting tokens.

    Steps, in order: normalize, tokenize, remove stop words, lemmatize.
    """
    normalized = normalize_text(text)
    tokens = tokenize_text(normalized)
    content_tokens = remove_stopwords(tokens)
    return lemmatize_tokens(content_tokens)

In [18]:
# ===============================
# STEP 4: Define YouTube URLs
# ===============================
video_urls = [
    "https://www.youtube.com/watch?v=5GeAORj0Nw0",
    "https://www.youtube.com/watch?v=rmASLb_Yn5Y",
    "https://www.youtube.com/watch?v=jNhQoYt2dsw",
    "https://www.youtube.com/watch?v=GDSf2h9_39I",
    "https://www.youtube.com/watch?v=wuVVclLcjuA",
    "https://www.youtube.com/watch?v=fs8ZveNZQ8g",
    "https://www.youtube.com/watch?v=rjXZzB5bUAo"
]

# ===============================
# STEP 5: Process All Videos
# ===============================
# (video number, missing artefact) pairs collected during the run, so the
# final summary reflects what actually happened instead of always
# reporting success.
incomplete_videos = []

for i, url in enumerate(video_urls, 1):
    print(f"\n====== Processing Video {i} ======\n")

    caption_filename = f"captions_{i}.vtt"
    comment_filename = f"comments_{i}.json"

    download_captions(url, caption_filename)
    download_comments(url, comment_filename)

    # Clean captions if the file exists
    if os.path.exists(caption_filename):
        clean_vtt_file(caption_filename, f"captions_cleaned_{i}.txt")
        # Parse and clean captions
        captions_df = parse_captions_vtt(caption_filename)
        captions_df['cleaned_tokens'] = captions_df['caption_sentence'].apply(clean_text_pipeline)
        # Save cleaned captions
        captions_df.to_csv(f'cleaned_captions_{i}.csv', index=False)
        print(f"✅ Saved cleaned_captions_{i}.csv")
    else:
        print(f"⚠️ Caption file not found for video {i}, skipping caption processing.")
        incomplete_videos.append((i, "captions"))

    # Parse and clean comments if the file exists and is not empty
    comments_df = parse_comments_jsonl(comment_filename)
    if not comments_df.empty:
        comments_df['cleaned_tokens'] = comments_df['comment_text'].apply(clean_text_pipeline)
        # Save cleaned comments
        comments_df.to_csv(f'cleaned_comments_{i}.csv', index=False)
        print(f"✅ Saved cleaned_comments_{i}.csv")
    else:
        print(f"⚠️ No comments processed for video {i} (file not found or empty).")
        incomplete_videos.append((i, "comments"))

# The count comes from the list itself, so the message stays correct when
# URLs are added or removed.
total_videos = len(video_urls)
if incomplete_videos:
    missing_summary = ", ".join(
        f"video {number} ({artefact})" for number, artefact in incomplete_videos
    )
    print(f"\n⚠️ Processed {total_videos} videos; no output produced for: {missing_summary}")
else:
    print(f"\n✅ All {total_videos} videos processed and cleaned successfully!")



✅ Captions downloaded and renamed to 'captions_1.vtt'
Running command: python3 -m youtube_comment_downloader --url https://www.youtube.com/watch?v=5GeAORj0Nw0 --output comments_1.json --limit 200
Command stdout:
Downloading Youtube comments for https://www.youtube.com/watch?v=5GeAORj0Nw0
Downloaded 1 comment(s)
[0.84 seconds] Done!

Command stderr:

✅ Comments downloaded to 'comments_1.json'
✅ Cleaned VTT saved to 'captions_cleaned_1.txt'
✅ Saved cleaned_captions_1.csv
⚠️ No valid comments found in 'comments_1.json'
⚠️ No comments processed for video 1 (file not found or empty).


✅ Captions downloaded and renamed to 'captions_2.vtt'
Running command: python3 -m youtube_comment_downloader --url https://www.youtube.com/watch?v=rmASLb_Yn5Y --output comments_2.json --limit 200
Command stdout:
Downloading Youtube comments for https://www.youtube.com/watch?v=rmASLb_Yn5Y
Downloaded 1 comment(s)
Downloaded 1 comment(s)
Downloaded 2 comment(s)
Downloaded 3 comment(s)
Downloaded 4 comment(s)
D

In [19]:
# ===============================
# STEP 2a: Download NLTK Data
# ===============================
import nltk

# Data packages requested by this notebook: tokenizer models, the stop-word
# list and WordNet.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's last expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Task


## Combine captions

### Subtask:
Read the cleaned caption data from the individual CSV files, combine them into a single DataFrame, and save it as a single CSV file.


In [20]:
# One DataFrame per video whose cleaned caption CSV is present, in video order
all_captions_dfs = []

for i in range(1, 8):
    filename = f'cleaned_captions_{i}.csv'

    # Missing files are reported and skipped rather than treated as errors
    if not os.path.exists(filename):
        print(f"⚠️ File not found: {filename}, skipping.")
        continue

    df = pd.read_csv(filename)
    all_captions_dfs.append(df)
    print(f"✅ Loaded {filename}")

if not all_captions_dfs:
    print("⚠️ No cleaned caption files were found or loaded.")
else:
    # Stack the per-video frames and renumber the rows from zero
    combined_captions_df = pd.concat(all_captions_dfs, ignore_index=True)
    combined_captions_df.to_csv('all_cleaned_captions.csv', index=False)
    print("✅ Combined cleaned captions saved to 'all_cleaned_captions.csv'")

✅ Loaded cleaned_captions_1.csv
✅ Loaded cleaned_captions_2.csv
✅ Loaded cleaned_captions_3.csv
✅ Loaded cleaned_captions_4.csv
✅ Loaded cleaned_captions_5.csv
✅ Loaded cleaned_captions_6.csv
✅ Loaded cleaned_captions_7.csv
✅ Combined cleaned captions saved to 'all_cleaned_captions.csv'


## Combine raw captions

### Subtask:
Read the raw contents of the individual VTT files and concatenate them, in video order, into a single VTT file.


In [21]:
def strip_vtt_header(vtt_text):
    """Return ``vtt_text`` without its leading ``WEBVTT`` header block.

    The header is taken to run from a first line starting with ``WEBVTT`` up
    to and including the first blank line. Text that does not start with
    ``WEBVTT`` (after an optional byte-order mark) is returned unchanged; text
    that consists of a header only yields an empty string.
    """
    text = vtt_text.lstrip('\ufeff')
    if not text.startswith('WEBVTT'):
        return vtt_text
    parts = re.split(r'\r?\n[ \t]*\r?\n', text, maxsplit=1)
    return parts[1] if len(parts) == 2 else ''


# Contents of each raw caption file that was found, in video order
raw_caption_parts = []

# Iterate through the video numbers from 1 to 7
for i in range(1, 8):
    # Construct the raw caption filename
    filename = f'captions_{i}.vtt'

    # Check if the raw caption file exists
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
        if raw_caption_parts:
            # The WEBVTT header belongs only at the top of a file, so for
            # every file after the first it is replaced by a NOTE comment
            # naming the source file.
            content = f"NOTE Source: {filename}\n\n" + strip_vtt_header(content)
        # Trailing newlines are normalised so that the join below always
        # leaves exactly one blank line between two files.
        raw_caption_parts.append(content.rstrip('\n'))
        print(f"✅ Read raw captions from '{filename}'")
    else:
        print(f"⚠️ Raw caption file not found: '{filename}', skipping.")

output_filename = 'all_raw_captions.vtt'
if raw_caption_parts:
    # Joining once avoids repeated string concatenation inside the loop
    combined_raw_captions = "\n\n".join(raw_caption_parts) + "\n"
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(combined_raw_captions)
    print(f"✅ Combined raw captions saved to '{output_filename}'")
else:
    # Do not create an empty output file when no input was found
    combined_raw_captions = ""
    print("⚠️ No raw caption files were found or loaded.")

✅ Read raw captions from 'captions_1.vtt'
✅ Read raw captions from 'captions_2.vtt'
✅ Read raw captions from 'captions_3.vtt'
✅ Read raw captions from 'captions_4.vtt'
✅ Read raw captions from 'captions_5.vtt'
✅ Read raw captions from 'captions_6.vtt'
✅ Read raw captions from 'captions_7.vtt'
✅ Combined raw captions saved to 'all_raw_captions.vtt'


## Combine comments

### Subtask:
Read the cleaned comment data from the individual CSV files, combine them into a single DataFrame, and save it as a single CSV file.


In [22]:
# One DataFrame per video whose cleaned comment CSV could be loaded
all_comments_dfs = []

for i in range(1, 8):
    filename = f'cleaned_comments_{i}.csv'

    # Videos without a cleaned comment file are reported and skipped
    if not os.path.exists(filename):
        print(f"⚠️ File not found: {filename}, skipping.")
        continue

    try:
        df = pd.read_csv(filename)
        all_comments_dfs.append(df)
        print(f"✅ Loaded {filename}")
    except pd.errors.EmptyDataError:
        print(f"⚠️ File is empty: {filename}, skipping.")
    except FileNotFoundError:
        # Only reachable if the file disappears after the existence check
        print(f"⚠️ File not found: {filename}, skipping.")
    except Exception as e:
        print(f"❌ Error loading {filename}: {e}")

if not all_comments_dfs:
    print("⚠️ No cleaned comment files were found or loaded.")
else:
    output_filename = 'all_cleaned_comments.csv'
    # Stack the per-video frames and renumber the rows from zero
    combined_comments_df = pd.concat(all_comments_dfs, ignore_index=True)
    combined_comments_df.to_csv(output_filename, index=False)
    print(f"✅ Combined cleaned comments saved to '{output_filename}'")

⚠️ File not found: cleaned_comments_1.csv, skipping.
✅ Loaded cleaned_comments_2.csv
✅ Loaded cleaned_comments_3.csv
✅ Loaded cleaned_comments_4.csv
✅ Loaded cleaned_comments_5.csv
✅ Loaded cleaned_comments_6.csv
✅ Loaded cleaned_comments_7.csv
✅ Combined cleaned comments saved to 'all_cleaned_comments.csv'


## Combine raw comments (json)

### Subtask:
Read the raw comment data from the individual JSON files (one JSON object per line) and combine them into a single JSON file containing one array of all comments.


In [23]:
# Parsed comment objects from all videos, in file and line order
all_raw_comments = []

for i in range(1, 8):
    filename = f'comments_{i}.json'

    if not os.path.exists(filename):
        print(f"⚠️ File not found: {filename}, skipping.")
        continue

    print(f"Processing {filename}")
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines carry no comment
            if not line:
                continue
            try:
                comment = json.loads(line)
            except json.JSONDecodeError:
                print(f"⚠️ Skipping invalid JSON line in {filename}: {line}")
            else:
                all_raw_comments.append(comment)

if not all_raw_comments:
    print("⚠️ No raw comment files were found or processed.")
else:
    output_filename = 'all_raw_comments.json'
    # The whole list is written as one JSON array, indented for readability
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(all_raw_comments, f, indent=4)
    print(f"✅ Combined raw comments saved to '{output_filename}'")

Processing comments_1.json
Processing comments_2.json
Processing comments_3.json
Processing comments_4.json
Processing comments_5.json
Processing comments_6.json
Processing comments_7.json
✅ Combined raw comments saved to 'all_raw_comments.json'


## Combine raw comments (txt)

### Subtask:
Read the raw comment text from the individual JSON files and save the comment text into a single TXT file.


In [24]:
def read_comment_texts(filename):
    """Return the non-empty ``text`` values of a JSON-lines comment file.

    Every non-blank line is parsed as JSON. Lines that cannot be parsed, and
    lines whose value is not a JSON object, are reported and skipped. Objects
    without a ``text`` value (missing, null or empty) are skipped silently.

    Args:
        filename: Path of the JSON-lines file (read as UTF-8).

    Returns:
        A list of comment texts as strings, in file order.
    """
    texts = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                comment = json.loads(line)
            except json.JSONDecodeError:
                print(f"⚠️ Skipping invalid JSON line in {filename}: {line}")
                continue
            # `.get` never raises KeyError, so the only way a line can lack a
            # 'text' key in an unusable way is by not being an object at all.
            if not isinstance(comment, dict):
                print(f"⚠️ Skipping line in {filename} with missing 'text' key: {line}")
                continue
            comment_text = comment.get('text')
            if comment_text:
                # str() keeps the later "\n".join safe for non-string values
                texts.append(str(comment_text))
    return texts


# Raw comment texts from all videos, in file and line order
all_raw_comment_texts = []

# Loop through the video numbers from 1 to 7
for i in range(1, 8):
    filename = f'comments_{i}.json'

    if not os.path.exists(filename):
        print(f"⚠️ Raw comment file not found: '{filename}', skipping.")
        continue

    print(f"Processing raw comments from '{filename}'")
    all_raw_comment_texts.extend(read_comment_texts(filename))

if all_raw_comment_texts:
    # One comment after another, separated by newlines. A comment that itself
    # contains a newline therefore spans several lines of the output file.
    combined_text = "\n".join(all_raw_comment_texts)

    output_filename = 'all_raw_comments.txt'
    with open(output_filename, 'w', encoding='utf-8') as out:
        out.write(combined_text)

    print(f"✅ Combined raw comment texts saved to '{output_filename}'")
else:
    print("⚠️ No raw comment text found or processed from the input files.")

Processing raw comments from 'comments_1.json'
Processing raw comments from 'comments_2.json'
Processing raw comments from 'comments_3.json'
Processing raw comments from 'comments_4.json'
Processing raw comments from 'comments_5.json'
Processing raw comments from 'comments_6.json'
Processing raw comments from 'comments_7.json'
✅ Combined raw comment texts saved to 'all_raw_comments.txt'
