In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import re
import time

# Index page that fetch_all_episodes() loads and scrolls to discover episodes.
base_url = "https://ogjre.com/transcripts"
# Text file that main() writes the scraped titles and transcripts to.
output_file = "filtered_transcripts.txt"

def setup_driver():
    """Create and return a Chrome WebDriver for the scraping run.

    The browser is started with the ``--disable-gpu`` and ``--no-sandbox``
    flags. Headless mode is off by default; see the commented line below.
    """
    chrome_options = webdriver.ChromeOptions()
    # Uncomment to run in headless mode
    # chrome_options.add_argument("--headless")
    for flag in ("--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def fetch_all_episodes(driver):
    """Scroll the transcripts index page and collect every episode it lists.

    Loads ``base_url`` in the given browser, then repeatedly harvests the
    episode cards currently in the DOM and scrolls to the bottom of the page
    so that more cards can load.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.

    Returns:
        A list of ``(title, link)`` tuples in first-seen order, without
        duplicates.
    """
    driver.get(base_url)
    time.sleep(5)  # fixed wait so the initial page content can render
    print("Webpage loaded. Starting to scroll...")

    episodes = []
    # Companion set for O(1) duplicate checks; every pass rescans all cards
    # already in the DOM, so a list membership test would be quadratic.
    seen = set()
    scroll_pause_time = 2
    max_no_new_content_attempts = 10
    no_new_content_attempts = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    while no_new_content_attempts < max_no_new_content_attempts:
        try:
            episode_elements = driver.find_elements(By.CLASS_NAME, "VideoSingle__VideoSingleStyles-sc-dngnuh-0")
            new_episodes_found = False

            for episode in episode_elements:
                try:
                    link = episode.find_element(By.TAG_NAME, "a").get_attribute("href")
                    title = episode.find_element(By.CLASS_NAME, "vs-video-title").text
                    entry = (title, link)
                    if entry not in seen:
                        seen.add(entry)
                        episodes.append(entry)
                        new_episodes_found = True
                except Exception as e:
                    # Best effort: a card that cannot be parsed is reported
                    # and skipped rather than aborting the whole crawl.
                    print(f"Error extracting episode: {e}")

            if new_episodes_found:
                no_new_content_attempts = 0
            else:
                no_new_content_attempts += 1

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = driver.execute_script("return document.body.scrollHeight")
            # NOTE(review): an unchanged height ends the crawl immediately,
            # regardless of the retry counter above. If the page loads more
            # slowly than scroll_pause_time the result may be incomplete.
            if new_height == last_height:
                break
            last_height = new_height
        except Exception as e:
            print(f"Error during scrolling: {e}")
            break

    return episodes

def fetch_transcript(episode_url, timeout=30):
    """Download one episode page and return its transcript text.

    Args:
        episode_url: URL of the episode page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of the transcript paragraph, or the placeholder string
        ``"Transcript not available."`` when the request fails, the response
        status is not 200, or the page has no transcript element.
    """
    print(f"Fetching transcript for {episode_url}...")
    try:
        response = requests.get(episode_url, timeout=timeout)
    except requests.RequestException as e:
        # Treat network failures like any other missing transcript so a
        # single bad episode does not abort the whole scraping run.
        print(f"Error fetching {episode_url}: {e}")
        return "Transcript not available."

    if response.status_code != 200:
        return "Transcript not available."

    soup = BeautifulSoup(response.content, "html.parser")
    transcript_tag = soup.find("p", class_="chakra-text ssc-transcript css-0")
    return transcript_tag.get_text(strip=True) if transcript_tag else "Transcript not available."

# Numbered-episode titles look like "#<digits> - <anything>".
_EPISODE_TITLE_PATTERN = re.compile(r"^#\d+\s*-\s*.+")


def is_valid_episode_title(title):
    """Return True when *title* starts with ``#<number> - <text>``.

    Whitespace around the dash is optional, but at least one character must
    follow it.
    """
    return _EPISODE_TITLE_PATTERN.match(title) is not None

def main():
    """Scrape all numbered episodes and write their transcripts to disk.

    Starts a browser, collects the episode list, keeps only titles accepted
    by ``is_valid_episode_title``, fetches each transcript and writes the
    results to ``output_file``. The browser is always closed on exit.
    """
    driver = setup_driver()
    try:
        all_episodes = fetch_all_episodes(driver)
        print(f"Found {len(all_episodes)} episodes.")

        # Keep only episodes whose title matches the numbered format
        numbered_episodes = []
        for title, link in all_episodes:
            if is_valid_episode_title(title):
                numbered_episodes.append((title, link))
        print(f"Found {len(numbered_episodes)} valid episodes.")

        # Write one record per episode, each terminated by a separator line
        separator = "=" * 80
        with open(output_file, "w", encoding="utf-8") as out:
            for title, link in numbered_episodes:
                print(f"Scraping episode: {title}")
                transcript = fetch_transcript(link)
                out.writelines([
                    f"Episode Title: {title}\n",
                    f"Transcript:\n{transcript}\n",
                    separator + "\n",
                ])

        print(f"Filtered transcripts saved in {output_file}")
    finally:
        driver.quit()

# Run the scraper only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


In [1]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import re

# Load the SpaCy model once at module level; extract_named_entities() uses it
nlp = spacy.load("en_core_web_sm")

# Initialize the sentiment analyzer once; analyze_sentiment() uses it
analyzer = SentimentIntensityAnalyzer()

# Clean text
def clean_text(text):
    """Return *text* lowercased, without punctuation, with whitespace collapsed.

    Punctuation is removed before whitespace is normalised, so that a
    standalone punctuation mark (e.g. the dash in ``"a - b"``) does not leave
    a double space behind. Leading and trailing whitespace is reduced to a
    single space, not stripped.

    Args:
        text: Raw text to normalise.

    Returns:
        The cleaned, lowercase string.
    """
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Collapse runs of whitespace
    return text.lower()

# Extract named entities (people)
def extract_named_entities(text):
    """Return the text of every entity in *text* labelled ``PERSON``.

    Entities are listed in the order the module-level ``nlp`` pipeline
    reports them; repeated mentions are kept.
    """
    people = []
    for entity in nlp(text).ents:
        if entity.label_ == "PERSON":
            people.append(entity.text)
    return people

# Perform sentiment analysis
def analyze_sentiment(text):
    """Return the ``compound`` sentiment score for *text*.

    The score is taken from the polarity scores computed by the module-level
    ``analyzer``.
    """
    scores = analyzer.polarity_scores(text)
    compound_score = scores["compound"]
    return compound_score

# Preprocess each transcript
def preprocess_transcripts(input_file, output_file):
    """Parse a scraped transcript file and save per-episode features as JSON.

    The input is expected to hold records separated by a line of 80 ``=``
    characters, each starting with an ``Episode Title: ...`` line followed by
    a ``Transcript:`` line and the transcript text. For every record the
    cleaned transcript, the PERSON entities and the compound sentiment score
    are computed from the transcript.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the JSON file to write.
    """
    title_prefix = "Episode Title: "
    transcript_marker = "Transcript:"
    processed_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        episodes = f.read().split("=" * 80)  # Split episodes by separator

    for episode in episodes:
        episode = episode.strip()
        if not episode:
            continue

        # First line is the title header; everything after it is the body.
        header, _, body = episode.partition("\n")

        # Strip the markers only where they appear as leading headers, so the
        # same words occurring inside a title or transcript are left intact.
        title = header
        if title.startswith(title_prefix):
            title = title[len(title_prefix):]
        title = title.strip()

        # Comparing the whole marker line also covers an empty transcript,
        # where the newline after "Transcript:" was stripped with the record.
        marker_line, _, remainder = body.partition("\n")
        if marker_line.rstrip() == transcript_marker:
            body = remainder
        transcript = body.strip()

        cleaned_transcript = clean_text(transcript)
        # Entities and sentiment use the raw transcript, not the cleaned one.
        named_entities = extract_named_entities(transcript)
        sentiment_score = analyze_sentiment(transcript)

        processed_data.append({
            "Episode Title": title,
            "Cleaned_Transcript": cleaned_transcript,
            "Named_Entities": named_entities,
            "Sentiment_Score": sentiment_score
        })

    # Save processed data to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(processed_data, f, indent=4)

    print(f"Processed data saved to {output_file}")

# Example usage
# NOTE(review): this input name differs from the scraper's `output_file`
# ("filtered_transcripts.txt") - confirm which file is intended.
preprocess_transcripts("joe_rogan_filtered_transcript.txt", "processed_transcripts.json")


Processed data saved to processed_transcripts.json
