In [None]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources used by preprocess_transcript. 'punkt_tab' is
# requested in addition to 'punkt' because recent NLTK releases load the
# tokenizer tables from 'punkt_tab', and word_tokenize raises LookupError
# when only the legacy 'punkt' package is present.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


def setup_driver():
    """Create a Chrome WebDriver configured with the scraper's launch flags."""
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--disable-gpu", "--no-sandbox", "--start-maximized"):
        chrome_options.add_argument(flag)
    # "--headless" is intentionally not passed; add it to the flags above to
    # run without a visible browser window.
    browser = webdriver.Chrome(options=chrome_options)
    return browser

def wait_for_element(driver, by, value, timeout=15):
    """Return the element matching the locator once it is present in the DOM.

    Waits up to ``timeout`` seconds; the exception raised by
    ``WebDriverWait.until`` when nothing matches in time is passed through
    to the caller.
    """
    locator = (by, value)
    is_present = EC.presence_of_element_located(locator)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(is_present)


def scrape_youtube_video_data(driver, url):
    """Scrape likes, views and the "people mentioned" names from a YouTube page.

    Each step is best-effort: when a step fails, a message is printed and the
    corresponding default is kept, so a dict is always returned.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the video page to open.

    Returns:
        dict with "Likes" and "Views" (element text, or "Not Found") and
        "People_Mentioned" (list of stripped names, possibly empty).
    """
    # Local import so this function carries the one extra name it needs.
    from selenium.common.exceptions import TimeoutException

    metrics = {"Likes": "Not Found", "Views": "Not Found", "People_Mentioned": []}
    try:
        print(f"Processing URL: {url}")
        driver.get(url)
        time.sleep(5)  # Wait for the page to load

        # Accept cookies if prompted
        try:
            cookie_button = wait_for_element(driver, By.XPATH, '//*[@id="content"]/div[2]/div[6]/div[1]/ytd-button-renderer[1]/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]', timeout=10)
            cookie_button.click()
            print("Cookies accepted.")
            time.sleep(2)
        except Exception:
            print("No cookies prompt or already accepted.")

        # Get likes
        try:
            likes_element = wait_for_element(driver, By.XPATH, '//*[@id="top-level-buttons-computed"]/segmented-like-dislike-button-view-model/yt-smartimation/div/div/like-button-view-model/toggle-button-view-model/button-view-model/button/div[2]', 15)
            metrics["Likes"] = likes_element.text
            print(f"Likes: {metrics['Likes']}")
        except Exception as e:
            print(f"Could not fetch likes: {e}")

        # Get views
        try:
            views_element = wait_for_element(driver, By.CSS_SELECTOR, '.style-scope.yt-formatted-string.bold[style-target="bold"]', 15)
            metrics["Views"] = views_element.text
            print(f"Views: {metrics['Views']}")
        except Exception as e:
            print(f"Could not fetch views: {e}")

        # Expand and retrieve names of people mentioned
        try:
            expand_button = wait_for_element(driver, By.XPATH, '//*[@id="expand"]', timeout=10)
            expand_button.click()
            print("'Expand' button clicked. Scrolling down to reveal names.")
            time.sleep(2)  # Allow time for content to expand

            # Scroll down to ensure all names are loaded
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(2)

            # Retrieve names of people mentioned
            index = 1
            while True:
                person_xpath = f'//*[@id="items"]/yt-video-attributes-section-view-model/div/div[2]/div/yt-video-attribute-view-model[{index}]/div/a/div[2]/h1'
                if index == 1:
                    # Only the first entry is waited for. Once it is present,
                    # the remaining entries are looked up without waiting, so
                    # the end of the list no longer costs a full timeout on
                    # every video.
                    try:
                        wait_for_element(driver, By.XPATH, person_xpath, timeout=5)
                    except TimeoutException:
                        break
                matches = driver.find_elements(By.XPATH, person_xpath)
                if not matches:
                    break
                person_name = matches[0].text.strip()
                metrics["People_Mentioned"].append(person_name)
                print(f"Person {index}: {person_name}")
                index += 1
            print(f"Finished retrieving people mentioned. Total: {len(metrics['People_Mentioned'])}")
        except Exception as e:
            # Errors other than "no more entries" end up here instead of being
            # reported as a normal end of list; names gathered so far are kept.
            print(f"Error retrieving people mentioned: {e}")

    except Exception as e:
        print(f"Error processing URL {url}: {e}")

    return metrics


def fetch_transcript(episode_url, timeout=30):
    """Download an episode page and return its preprocessed transcript.

    Args:
        episode_url: Address of the page holding the transcript.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of preprocess_transcript for the page's transcript text, or
        the string "Transcript not available." when the request fails or the
        transcript paragraph is missing.
    """
    print(f"Fetching transcript for {episode_url}...")
    try:
        # requests waits indefinitely when no timeout is given, so a single
        # unresponsive host would stall the whole run.
        response = requests.get(episode_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        transcript_tag = soup.find("p", class_="chakra-text ssc-transcript css-0")
        # A space separator keeps words from adjacent text nodes from being
        # glued together when the paragraph contains nested tags.
        transcript = transcript_tag.get_text(" ", strip=True) if transcript_tag else "Transcript not available."

        # Preprocess the transcript
        return preprocess_transcript(transcript)
    except Exception as e:
        print(f"Error fetching transcript for {episode_url}: {e}")
        return "Transcript not available."


def preprocess_transcript(transcript):
    """Clean a transcript and score its sentiment.

    Args:
        transcript: Raw transcript text, or the sentinel string
            "Transcript not available.".

    Returns:
        The sentinel string unchanged, or a dict with:
        "cleaned_transcript" - lowercase, letters-only, stopword-free,
            lemmatized tokens joined by single spaces;
        "sentiment" - the VADER polarity scores of the raw text.
    """
    if transcript == "Transcript not available.":
        return transcript

    # Score the untouched text: VADER uses punctuation (e.g. "!") as intensity
    # cues, and those are removed by the cleaning step below.
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(transcript)

    # Text Cleaning: Remove punctuation, stopwords, and lemmatize
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    letters_only = re.sub(r'[^a-zA-Z\s]', '', transcript)  # Remove punctuation
    words = word_tokenize(letters_only.lower())
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    cleaned_transcript = ' '.join(cleaned_words)

    return {
        "cleaned_transcript": cleaned_transcript,
        "sentiment": sentiment
    }


def process_youtube_data(input_file, output_file, json_output_file):
    """Scrape metrics and transcripts for every video listed in an Excel file.

    Args:
        input_file: Excel workbook with 'url', 'generated_url' and 'name'
            columns; an optional 'guest' column is copied to the JSON output.
        output_file: Excel path for the input rows plus the 'Likes', 'Views'
            and 'People_Mentioned' columns.
        json_output_file: JSON path for the per-episode records.

    An error while handling one video is printed and processing continues
    with the next row. The WebDriver is always closed on the way out.
    """
    df = pd.read_excel(input_file)
    driver = setup_driver()
    processed_data = []

    try:
        for index, row in df.iterrows():  # Process the full DataFrame
            video_url = row['url']
            generated_url = row['generated_url']
            print(f"Processing video: {row['name']}")
            try:
                metrics = scrape_youtube_video_data(driver, video_url)
                transcript_data = fetch_transcript(generated_url)
                has_transcript = isinstance(transcript_data, dict)

                # Save the scraped data to the DataFrame
                df.at[index, 'Likes'] = metrics["Likes"]
                df.at[index, 'Views'] = metrics["Views"]
                df.at[index, 'People_Mentioned'] = ", ".join(metrics["People_Mentioned"])

                # An empty 'guest' cell is read as NaN, which json.dump would
                # write as the token NaN (not valid JSON); use the same
                # fallback as for a missing column instead.
                guest = row.get('guest', 'Unknown')
                if pd.isna(guest):
                    guest = 'Unknown'

                # Append data for JSON file
                processed_data.append({
                    "Episode Title": row['name'],
                    "Cleaned_Transcript": transcript_data["cleaned_transcript"] if has_transcript else transcript_data,
                    "people_mentioned": metrics["People_Mentioned"],
                    "views": metrics["Views"],
                    "likes": metrics["Likes"],
                    "guest": guest,
                    "sentiment": transcript_data.get("sentiment") if has_transcript else {}
                })
                print(f"Successfully processed video: {row['name']}")
            except Exception as e:
                print(f"Error processing video {row['name']}: {e}")

        # Each output is written under its own guard so that a failure in one
        # does not throw away the other after a long scraping run.
        try:
            # Save the updated DataFrame to an Excel file
            df.to_excel(output_file, index=False)
            print(f"Scraped data saved to {output_file}")
        except Exception as e:
            print(f"Error saving {output_file}: {e}")

        try:
            # Save the processed data to a JSON file
            with open(json_output_file, 'w', encoding='utf-8') as json_file:
                json.dump(processed_data, json_file, indent=4)
            print(f"Processed data saved to {json_output_file}")
        except Exception as e:
            print(f"Error saving {json_output_file}: {e}")
    except Exception as e:
        print(f"Error during processing: {e}")
    finally:
        driver.quit()
        print("WebDriver closed.")


# Entry point: run the full scrape over the episode workbook.
if __name__ == "__main__":
    # Workbook read by process_youtube_data; it looks up the 'url',
    # 'generated_url' and 'name' columns of every row.
    input_file = "jre_episodes.xlsx"
    # Copy of the workbook with the scraped columns added.
    output_file = "jre_episodes_with_metrics.xlsx"
    # Per-episode records (transcript, sentiment, metrics) as JSON.
    json_output_file = "jre_episodes_data.json"
    process_youtube_data(input_file, output_file, json_output_file)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '/Users/italorobello/Desktop/Social Graphs/Final assignment/jre_episodes.xlsx'

In [None]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources used by preprocess_transcript. 'punkt_tab' is
# requested in addition to 'punkt' because recent NLTK releases load the
# tokenizer tables from 'punkt_tab', and word_tokenize raises LookupError
# when only the legacy 'punkt' package is present.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


def setup_driver():
    """Create a Chrome WebDriver configured with the scraper's launch flags."""
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--disable-gpu", "--no-sandbox", "--start-maximized"):
        chrome_options.add_argument(flag)
    # "--headless" is intentionally not passed; add it to the flags above to
    # run without a visible browser window.
    browser = webdriver.Chrome(options=chrome_options)
    return browser


def wait_for_element(driver, by, value, timeout=15):
    """Return the element matching the locator once it is present in the DOM.

    Waits up to ``timeout`` seconds; the exception raised by
    ``WebDriverWait.until`` when nothing matches in time is passed through
    to the caller.
    """
    locator = (by, value)
    is_present = EC.presence_of_element_located(locator)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(is_present)


def scrape_youtube_video_data(driver, url):
    """Scrape likes, views and the "people mentioned" names from a YouTube page.

    Each step is best-effort: when a step fails, a message is printed and the
    corresponding default is kept, so a dict is always returned.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the video page to open.

    Returns:
        dict with "Likes" and "Views" (element text, or "Not Found") and
        "People_Mentioned" (list of stripped names, possibly empty).
    """
    # Local import so this function carries the one extra name it needs.
    from selenium.common.exceptions import TimeoutException

    metrics = {"Likes": "Not Found", "Views": "Not Found", "People_Mentioned": []}
    try:
        print(f"Processing URL: {url}")
        driver.get(url)
        time.sleep(5)  # Wait for the page to load

        # Accept cookies if prompted
        try:
            cookie_button = wait_for_element(driver, By.XPATH, '//*[@id="content"]/div[2]/div[6]/div[1]/ytd-button-renderer[1]/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]', timeout=10)
            cookie_button.click()
            print("Cookies accepted.")
            time.sleep(2)
        except Exception:
            print("No cookies prompt or already accepted.")

        # Get likes
        try:
            likes_element = wait_for_element(driver, By.XPATH, '//*[@id="top-level-buttons-computed"]/segmented-like-dislike-button-view-model/yt-smartimation/div/div/like-button-view-model/toggle-button-view-model/button-view-model/button/div[2]', 15)
            metrics["Likes"] = likes_element.text
            print(f"Likes: {metrics['Likes']}")
        except Exception as e:
            print(f"Could not fetch likes: {e}")

        # Get views
        try:
            views_element = wait_for_element(driver, By.CSS_SELECTOR, '.style-scope.yt-formatted-string.bold[style-target="bold"]', 15)
            metrics["Views"] = views_element.text
            print(f"Views: {metrics['Views']}")
        except Exception as e:
            print(f"Could not fetch views: {e}")

        # Expand and retrieve names of people mentioned
        try:
            expand_button = wait_for_element(driver, By.XPATH, '//*[@id="expand"]', timeout=10)
            expand_button.click()
            print("'Expand' button clicked. Scrolling down to reveal names.")
            time.sleep(2)  # Allow time for content to expand

            # Scroll down to ensure all names are loaded
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(2)

            # Retrieve names of people mentioned
            index = 1
            while True:
                person_xpath = f'//*[@id="items"]/yt-video-attributes-section-view-model/div/div[2]/div/yt-video-attribute-view-model[{index}]/div/a/div[2]/h1'
                if index == 1:
                    # Only the first entry is waited for. Once it is present,
                    # the remaining entries are looked up without waiting, so
                    # the end of the list no longer costs a full timeout on
                    # every video.
                    try:
                        wait_for_element(driver, By.XPATH, person_xpath, timeout=5)
                    except TimeoutException:
                        break
                matches = driver.find_elements(By.XPATH, person_xpath)
                if not matches:
                    break
                person_name = matches[0].text.strip()
                metrics["People_Mentioned"].append(person_name)
                print(f"Person {index}: {person_name}")
                index += 1
            print(f"Finished retrieving people mentioned. Total: {len(metrics['People_Mentioned'])}")
        except Exception as e:
            # Errors other than "no more entries" end up here instead of being
            # reported as a normal end of list; names gathered so far are kept.
            print(f"Error retrieving people mentioned: {e}")

    except Exception as e:
        print(f"Error processing URL {url}: {e}")

    return metrics


def fetch_transcript(episode_url, timeout=30):
    """Download an episode page and return its preprocessed transcript.

    Args:
        episode_url: Address of the page holding the transcript.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of preprocess_transcript for the page's transcript text, or
        the string "Transcript not available." when the request fails or the
        transcript paragraph is missing.
    """
    print(f"Fetching transcript for {episode_url}...")
    try:
        # requests waits indefinitely when no timeout is given, so a single
        # unresponsive host would stall the whole run.
        response = requests.get(episode_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        transcript_tag = soup.find("p", class_="chakra-text ssc-transcript css-0")
        # A space separator keeps words from adjacent text nodes from being
        # glued together when the paragraph contains nested tags.
        transcript = transcript_tag.get_text(" ", strip=True) if transcript_tag else "Transcript not available."

        # Preprocess the transcript
        return preprocess_transcript(transcript)
    except Exception as e:
        print(f"Error fetching transcript for {episode_url}: {e}")
        return "Transcript not available."


def preprocess_transcript(transcript):
    """Clean a transcript and score its sentiment.

    Args:
        transcript: Raw transcript text, or the sentinel string
            "Transcript not available.".

    Returns:
        The sentinel string unchanged, or a dict with:
        "cleaned_transcript" - lowercase, letters-only, stopword-free,
            lemmatized tokens joined by single spaces;
        "sentiment" - the VADER polarity scores of the raw text.
    """
    if transcript == "Transcript not available.":
        return transcript

    # Score the untouched text: VADER uses punctuation (e.g. "!") as intensity
    # cues, and those are removed by the cleaning step below.
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(transcript)

    # Text Cleaning: Remove punctuation, stopwords, and lemmatize
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    letters_only = re.sub(r'[^a-zA-Z\s]', '', transcript)  # Remove punctuation
    words = word_tokenize(letters_only.lower())
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    cleaned_transcript = ' '.join(cleaned_words)

    return {
        "cleaned_transcript": cleaned_transcript,
        "sentiment": sentiment
    }


def process_youtube_data(input_file, output_file, json_output_file):
    """Scrape metrics and transcripts for every video listed in an Excel file.

    Args:
        input_file: Excel workbook with 'url', 'generated_url' and 'name'
            columns; an optional 'guest' column is copied to the JSON output.
        output_file: Excel path for the input rows plus the 'Likes', 'Views'
            and 'People_Mentioned' columns.
        json_output_file: JSON path for the per-episode records.

    An error while handling one video is printed and processing continues
    with the next row. The WebDriver is always closed on the way out.
    """
    df = pd.read_excel(input_file)
    driver = setup_driver()
    processed_data = []

    try:
        for index, row in df.iterrows():
            video_url = row['url']
            generated_url = row['generated_url']
            print(f"Processing video: {row['name']}")
            try:
                metrics = scrape_youtube_video_data(driver, video_url)
                transcript_data = fetch_transcript(generated_url)
                has_transcript = isinstance(transcript_data, dict)

                # Save the scraped data to the DataFrame
                df.at[index, 'Likes'] = metrics["Likes"]
                df.at[index, 'Views'] = metrics["Views"]
                df.at[index, 'People_Mentioned'] = ", ".join(metrics["People_Mentioned"])

                # An empty 'guest' cell is read as NaN, which json.dump would
                # write as the token NaN (not valid JSON); use the same
                # fallback as for a missing column instead.
                guest = row.get('guest', 'Unknown')
                if pd.isna(guest):
                    guest = 'Unknown'

                # Append data for JSON file
                processed_data.append({
                    "Episode Title": row['name'],
                    "Cleaned_Transcript": transcript_data["cleaned_transcript"] if has_transcript else transcript_data,
                    "people_mentioned": metrics["People_Mentioned"],
                    "views": metrics["Views"],
                    "likes": metrics["Likes"],
                    "guest": guest,
                    "sentiment": transcript_data.get("sentiment") if has_transcript else {}
                })
                print(f"Successfully processed video: {row['name']}")
            except Exception as e:
                print(f"Error processing video {row['name']}: {e}")

        # Each output is written under its own guard so that a failure in one
        # does not throw away the other after a long scraping run.
        try:
            # Save the updated DataFrame to an Excel file
            df.to_excel(output_file, index=False)
            print(f"Scraped data saved to {output_file}")
        except Exception as e:
            print(f"Error saving {output_file}: {e}")

        try:
            # Save the processed data to a JSON file
            with open(json_output_file, 'w', encoding='utf-8') as json_file:
                json.dump(processed_data, json_file, indent=4)
            print(f"Processed data saved to {json_output_file}")
        except Exception as e:
            print(f"Error saving {json_output_file}: {e}")
    except Exception as e:
        print(f"Error during processing: {e}")
    finally:
        driver.quit()
        print("WebDriver closed.")


# Entry point: run the full scrape over the episode workbook.
if __name__ == "__main__":
    # Workbook read by process_youtube_data; it looks up the 'url',
    # 'generated_url' and 'name' columns of every row.
    input_file = "jre_episodes.xlsx"
    # Copy of the workbook with the scraped columns added.
    output_file = "jre_episodes_with_metrics.xlsx"
    # Per-episode records (transcript, sentiment, metrics) as JSON.
    json_output_file = "jre_episodes_data.json"
    process_youtube_data(input_file, output_file, json_output_file)


In [None]:
import json
from collections import Counter

# Load processed transcripts
def load_transcripts(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the decoded data."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Count named entities
def count_named_entities(transcripts):
    """Tally how often each named entity occurs across all episodes.

    Every episode's "Named_Entities" value (an empty list when the key is
    absent) is fed to ``Counter.update``; the combined Counter is returned.
    """
    tally = Counter()
    per_episode = (record.get("Named_Entities", []) for record in transcripts)
    for names in per_episode:
        tally.update(names)
    return tally

# Save entity counts to a file in descending order
def save_entity_counts(entity_counts, output_file):
    """Write one "entity: count" line per entity, most frequent first.

    ``entity_counts`` must provide ``most_common()``; the destination path is
    reported on stdout after the file has been written.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        out.writelines(
            f"{name}: {total}\n" for name, total in entity_counts.most_common()
        )
    print(f"Entity counts saved to {output_file}")

# Main function
def main():
    """Count the named entities in the processed transcripts, report and save them."""
    source_path = "processed_transcripts.json"  # Update with your file path
    report_path = "entity_counts_sorted.txt"

    counts = count_named_entities(load_transcripts(source_path))

    # Console summary: number of distinct entities, then the ten most frequent.
    print(f"Total unique named entities: {len(counts)}")
    print(f"Top 10 entities:\n{counts.most_common(10)}")

    save_entity_counts(counts, report_path)


if __name__ == "__main__":
    main()


Total unique named entities: 9459
Top 10 entities:
[('Jamie', 814), ('Joe', 646), ('Dude', 471), ('Jesus', 415), ('COVID', 398), ('Twitter', 318), ('Trump', 277), ('Tony', 241), ('Biden', 222), ('Jesus Christ', 221)]
Entity counts saved to entity_counts_sorted.txt


In [None]:
import re
from collections import defaultdict



# Load entity counts from file
def load_entity_counts(file_path):
    """Parse an "Entity: Count" text file into a dict of entity -> int count.

    Lines that do not match the pattern are skipped; when an entity occurs
    more than once, the count from its last line is kept.
    """
    line_pattern = re.compile(r"(.+): (\d+)")  # Match "Entity: Count"
    with open(file_path, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()

    counts = {}
    for raw in raw_lines:
        parsed = line_pattern.match(raw.strip())
        if parsed is None:
            continue
        name, number = parsed.groups()
        counts[name.strip()] = int(number)
    return counts

# Define custom corrections for specific cases
# Keys are compared against the cleaned form built inside normalize_entity
# (lowercase, leading article removed, punctuation and trailing 's' stripped).
custom_corrections = {
    "mike tyson": "Myke Tyson",
    "this mike tyson": "Myke Tyson",
    "the david carrot": "David Carrot"
}

# Normalize entity name for grouping
def normalize_entity(entity):
    """
    Return the grouping key for an entity name, or None for single-word names.

    Steps, in order:
    - Lowercase and trim the name
    - Remove one leading "a", "an", "the" or "this" (e.g., "the David Carrot" -> "David Carrot")
    - Remove a trailing possessive (e.g., "Donald's" -> "Donald")
    - Remove punctuation and collapse runs of whitespace
    - Remove one trailing 's' (applied unconditionally, so "Jones" -> "Jone")
    - Apply custom corrections
    - Reject names with fewer than two words, otherwise title-case the result

    Punctuation and whitespace are cleaned before the trailing 's' is removed
    so that variants such as "Alex Jones." or "Joe  Rogan" produce the same
    key as their clean spelling.
    """
    entity = entity.lower().strip()
    entity = re.sub(r"^(a|an|the|this)\s+", "", entity)  # Remove prefixes
    entity = re.sub(r"'s$", "", entity)  # Remove possessive forms
    entity = re.sub(r"[^\w\s]", "", entity)  # Remove non-alphanumeric characters
    entity = " ".join(entity.split())  # Collapse whitespace left behind by the removals
    entity = re.sub(r"s$", "", entity).strip()  # Remove trailing 's' for plurals

    # Apply custom corrections if available
    if entity in custom_corrections:
        return custom_corrections[entity]

    # Check if the entity includes at least a first and last name
    if len(entity.split()) < 2:  # Remove names with fewer than 2 words
        return None

    # Capitalize each word in the name for consistency
    return entity.title()

# Group similar entities and create mapping
def create_name_mapping(entities):
    """Map each entity spelling to the most frequent spelling in its group.

    ``entities`` maps a spelling to its count. Spellings are grouped by their
    normalize_entity key (spellings whose key is falsy are left out), and
    every member of a group is mapped to the member with the highest count.
    """
    variants_by_key = defaultdict(list)
    for raw_name in entities:
        key = normalize_entity(raw_name)
        if key:  # None means the name was rejected (e.g. a single word)
            variants_by_key[key].append(raw_name)

    mapping = {}
    for variants in variants_by_key.values():
        # On equal counts max() keeps the spelling that was seen first.
        preferred = max(variants, key=entities.__getitem__)
        mapping.update(dict.fromkeys(variants, preferred))
    return mapping

# Save the mapping to a file
def save_mapping_to_file(mapping, output_file):
    """Write each mapping entry as a "key -> value" line and report the path."""
    with open(output_file, "w", encoding="utf-8") as out:
        out.writelines(
            f"{source} -> {target}\n" for source, target in mapping.items()
        )
    print(f"Mapping saved to {output_file}")

# Main script
# Input: the "entity: count" listing, the same file name save_entity_counts
# is given in the entity-counting cell.
entity_counts_file = "entity_counts_sorted.txt"
# Output: one "variation -> standard name" line per grouped entity.
output_mapping_file = "name_mapping.txt"

# Runs at module level (no __main__ guard): load the counts, group the
# spellings, and write the resulting mapping to disk.
entities = load_entity_counts(entity_counts_file)
name_mapping = create_name_mapping(entities)
save_mapping_to_file(name_mapping, output_mapping_file)


Mapping saved to name_mapping.txt
