## Data Collection

In [1]:
# importing required Libraries
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import pickle
import joblib
import os

import re
import string
from langdetect import detect

import imdb
imdb = imdb.IMDb()

import warnings
warnings.filterwarnings('ignore')

In [2]:
# CSS class strings (copied verbatim from the site markup) that identify the
# section of the page each scraper reads from.
_LISTING_SECTION_CLASS = "elementor-section elementor-top-section elementor-element elementor-element-b70b8d7 elementor-section-boxed elementor-section-height-default elementor-section-height-default"
_POST_CONTENT_CLASS = "elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content"

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the notebook forever.
_REQUEST_TIMEOUT = 30


def _find_in_section(url, section_class, tag_name, **filters):
    """Fetch ``url`` and return the ``tag_name`` elements inside one page section.

    Parameters
    ----------
    url : str
        Page to download.
    section_class : str
        Value passed to ``find(class_=...)`` to locate the enclosing section.
    tag_name : str
        Tag name passed to ``find_all`` within that section.
    **filters
        Extra keyword filters forwarded to ``find_all``.

    Returns
    -------
    list
        The matching elements, or an empty list when the section is not
        present on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    section = soup.find(class_=section_class)
    if section is None:
        # find() returns None when nothing matches; calling find_all on it
        # would raise AttributeError.
        return []
    return section.find_all(tag_name, **filters)


def scrape_links(x):
    """Return the ``href`` of every link in the listing section of page ``x``.

    Anchors without an ``href`` attribute are skipped so the result contains
    only usable URLs.
    """
    anchors = _find_in_section(x, _LISTING_SECTION_CLASS, "a", href=True)
    return [anchor.get('href') for anchor in anchors]


def scrape_tags(x):
    """Return the text of every ``<h3>`` heading in the listing section of page ``x``."""
    headings = _find_in_section(x, _LISTING_SECTION_CLASS, "h3")
    return [heading.text for heading in headings]


def scrape_transcript(x):
    """Return the text of every ``<p>`` paragraph in the post-content section of page ``x``.

    Returns an empty list when the page has no post-content section.
    """
    paragraphs = _find_in_section(x, _POST_CONTENT_CLASS, "p")
    return [paragraph.text for paragraph in paragraphs]

In [4]:
# Listing page that indexes every transcript.
LISTING_URL = "https://scrapsfromtheloft.com/stand-up-comedy-scripts/"
# Number of transcripts to scrape; set to None to scrape the whole listing.
MAX_TRANSCRIPTS = 4

# Apply the same limit to links and tags: they are later combined column-wise
# into one DataFrame, which requires equal lengths.
# NOTE(review): this assumes the i-th heading belongs to the i-th link - confirm against the page markup.
links = scrape_links(LISTING_URL)[:MAX_TRANSCRIPTS]
tags = scrape_tags(LISTING_URL)[:MAX_TRANSCRIPTS]
transcript = [scrape_transcript(x) for x in links]

In [4]:
# Single-column frames holding the scraped tags and URLs
frame_tag = pd.DataFrame(tags, columns=["Tag"])
frame_link = pd.DataFrame(links, columns=["URL"])

# Assemble the main frame: one row per transcript, one column per scraped field
data = dict(zip(["Tag", "URL", "Raw Transcript"], [tags, links, transcript]))
frame = pd.DataFrame(data)

# Prepend a running serial number as the first column
frame.insert(0, 'S No.', np.arange(len(frame)))

In [5]:
# Removing unwanted characters from tags and extract name , title and year
# lstrip/rstrip take a *set* of characters, so each character is listed once:
# leading newlines/tabs and trailing dashes/newlines/tabs are removed.
frame['Tag'] = frame['Tag'].str.lstrip('\n\t').str.rstrip('-\n\t')

# Names: the leading run of word characters, whitespace and dots.
frame['Names'] = frame['Tag'].str.extract(r'([\w\s.]+)', expand=False)

# Title: the leading run of the allowed title characters. The hyphen is escaped
# so it is matched literally; unescaped, "!-%" is a character *range* and
# hyphenated titles were cut at the first "-".
# NOTE(review): ':' is in the class, so this match also includes the performer
# name that precedes the colon - confirm that is intended.
frame['Title'] = frame['Tag'].str.extract(r'([\w\s\d.:,’*?!\-%]+)', expand=False)

# Year: the first run of four digits found in the tag.
frame['Year'] = frame['Tag'].str.extract(r'(\d{4})', expand=False)

In [6]:
# Create the output directory; exist_ok avoids the check-then-create race
os.makedirs("transcripts", exist_ok=True)

# Dumping individual transcripts into text files
# (the files hold pickle data despite the .txt extension)
Sr = frame['S No.'].tolist()
# Pair each serial number with its transcript by position, so the loop does not
# depend on the frame's index labels matching a running counter.
for c, raw_transcript in zip(Sr, frame['Raw Transcript']):
    with open("transcripts/" + str(c) + ".txt", "wb") as file:
        pickle.dump(raw_transcript, file)

# Load pickled transcript files
# NOTE: pickle.load can execute arbitrary code; this is only safe because the
# files read here are the ones written just above.
data = {}
for c in Sr:
    with open("transcripts/" + str(c) + ".txt", "rb") as file:
        data[c] = pickle.load(file)

# Helper that merges a list of text fragments into a single string
def combine_text(list_of_text):
    """Join the given text fragments with single spaces and return the resulting string."""
    separator = ' '
    return separator.join(list_of_text)

# Collapse each transcript's list of paragraphs into a one-element list holding the joined text
data_combined = {}
for key, paragraphs in data.items():
    data_combined[key] = [combine_text(paragraphs)]

# Build a frame with one row per transcript key and a single 'Transcript' column
frame_trans = pd.DataFrame.from_dict(data_combined).T
frame_trans.columns = ['Transcript']

## Cleaning The Data

In [7]:
# Define the clean function to preprocess text
def clean(text):
    """Normalise a transcript for text analysis.

    Steps, in order: drop ``[bracketed]`` stage directions, lowercase, strip
    ASCII punctuation, turn newlines into spaces, drop curly quotes /
    ellipsis / music-note characters, and drop any token containing a digit.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave consecutive spaces behind.
    """
    text = re.sub(r'\[.*?\]', '', text)            # Remove text in square brackets
    text = text.lower()                            # Convert text to lowercase
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)    # Remove punctuation
    # Replace newlines with a space rather than deleting them; deleting fuses
    # the last word of one line with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    # Remove curly quotes, ellipsis and music notes (parentheses are already
    # gone: they are part of string.punctuation)
    text = re.sub('[‘’“”…♪]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)           # Remove words containing numbers
    return text

# Run every combined transcript through clean() and keep the result as a one-column DataFrame
frame_trans = frame_trans['Transcript'].apply(clean).to_frame()

# Append the cleaned 'Transcript' column to 'frame' (aligned on the index), then
# discard every row that still contains a missing value (NaN)
frame = pd.concat([frame, frame_trans], axis=1).dropna()

### Get runtime and rating info from the IMDb API

In [8]:
def get_imdb_info(titles):
    """Look up the runtime and rating of each title on IMDb.

    For every title, the first 30 characters are searched on IMDb and the
    first search hit is used. Lookups are best-effort: any failure (no search
    hit, missing runtime, non-string title, network error) records an empty
    string for both values and is counted as "not found".

    Parameters
    ----------
    titles : iterable of str
        Titles to look up. When a ``pandas.Series`` is passed, its index is
        reused for the returned Series so the results stay aligned with the
        caller's rows.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(runtime, rating)`` with one entry per title, in input order.
        ``runtime`` holds the first listed runtime as an ``int``; both hold
        ``''`` where the lookup failed.
    """
    runtime = []
    rating = []
    errors = 0

    for title in titles:
        try:
            # Search the first 30 characters on IMDb
            result = imdb.search_movie(title[:30])
            mov = imdb.get_movie(result[0].movieID, info=['main'])
            minutes = int(mov.get('runtimes')[0])
            score = mov.get('rating')
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still stop the loop.
            minutes, score = '', ''
            errors += 1
        # Append both values together so the two lists can never drift apart.
        runtime.append(minutes)
        rating.append(score)

    print(f'Total not found: {errors}')

    # Reuse the caller's index when one exists; otherwise fall back to the
    # default 0..n-1 index.
    index = titles.index if isinstance(titles, pd.Series) else None
    return pd.Series(runtime, index=index), pd.Series(rating, index=index)
    
runtime, rating = get_imdb_info(frame.Title)

# get_imdb_info returns one entry per title, in order. Assign by position so
# the values land on the right rows even when frame's index has gaps (rows were
# dropped earlier without resetting the index).
frame['runtime'] = runtime.to_numpy()
frame['rating'] = rating.to_numpy()

# Replace empty values with NaN
# (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
frame = frame.replace(r'^\s*$', np.nan, regex=True)

Total not found: 32


In [14]:
# Detect language based on the first 500 characters and create a language column in the df
from langdetect import DetectorFactory, LangDetectException

# langdetect's algorithm is randomised; a fixed seed makes the results repeatable.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code detected for ``text``, or NaN when it cannot be detected.

    Only the first 500 characters are inspected. Missing (non-string) or blank
    transcripts yield NaN instead of raising.
    """
    sample = text[:500] if isinstance(text, str) else ''
    if not sample.strip():
        return np.nan
    try:
        return detect(sample)
    except LangDetectException:
        # Raised when the sample has no usable features for detection.
        return np.nan


frame['language'] = frame.Transcript.apply(_detect_language)
print(frame.language.value_counts())

language
en    398
it      6
es      1
Name: count, dtype: int64


In [11]:
# Drop rows whose transcript is missing or blank.
# Blank cells were converted to NaN earlier in this notebook, so comparing
# against '' alone would never match; treat NaN and whitespace-only text alike.
transcript_text = frame['Transcript'].fillna('').astype(str).str.strip()
frame = frame[transcript_text != ''].reset_index(drop=True)

In [16]:
# Saving the csv file
# Writes the frame to "frame.csv" in the current working directory, without the
# index column. List-valued cells (e.g. 'Raw Transcript') are written as their
# string representation.
frame.to_csv("frame.csv", index=False)

In [98]:
# Load a previously scraped-and-cleaned dataset from disk.
# NOTE(review): this is an absolute, machine-specific Windows path and is not the
# "frame.csv" written earlier in this notebook - confirm which file is intended
# and make the location configurable so the notebook runs on other machines.
df=pd.read_csv(r"D:\PROJECTS\transnlp\data\raw\scraped_and_cleaned_content_data.csv")

In [99]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   S No.           500 non-null    int64  
 1   Tag             500 non-null    object 
 2   URL             500 non-null    object 
 3   Raw Transcript  500 non-null    object 
 4   Transcript      500 non-null    object 
 5   CleanTag        500 non-null    object 
 6   Year            465 non-null    float64
 7   Names           500 non-null    object 
 8   Title           480 non-null    object 
 9   runtime         434 non-null    float64
 10  rating          425 non-null    float64
 11  language        500 non-null    object 
dtypes: float64(3), int64(1), object(8)
memory usage: 47.0+ KB


In [100]:
# Preview the first five rows of the loaded frame
df.head()

Unnamed: 0,S No.,Tag,URL,Raw Transcript,Transcript,CleanTag,Year,Names,Title,runtime,rating,language
0,0,Michelle Buteau: Welcome to Buteaupia (2020) ...,https://scrapsfromtheloft.com/comedy/michelle-...,['Michelle Buteau’s Netflix special Welcome to...,michelle buteaus netflix special welcome to bu...,Michelle Buteau: Welcome to Buteaupia (2020),2020.0,Michelle Buteau,Welcome to Buteaupia,58.0,7.0,en
1,1,Theo Von: No Offense (2016) | Transcript,https://scrapsfromtheloft.com/comedy/theo-von-...,['Theo Von: No Offense was recorded at the Civ...,theo von no offense was recorded at the civic ...,Theo Von: No Offense (2016),2016.0,Theo Von,No Offense,67.0,5.8,en
2,2,Nate Bargatze’s Nashville Christmas (2024) | T...,https://scrapsfromtheloft.com/comedy/nate-barg...,['Nate Bargatze’s Nashville Christmas is a hea...,nate bargatzes nashville christmas is a heartw...,Nate Bargatze’s Nashville Christmas (2024),2024.0,Nate Bargatze’s,Nashville Christmas,61.0,6.8,en
3,3,"Your Friend, Nate Bargatze (2024) | Transcript",https://scrapsfromtheloft.com/comedy/your-frie...,"['Your Friend, Nate Bargatze (2024)\nGenre: Co...",your friend nate bargatze comedy standupdirec...,"Your Friend, Nate Bargatze (2024)",2024.0,Nate Bargatze,"Your Friend,",63.0,7.2,en
4,4,Ronny Chieng: Love to Hate It (2024) | Transcript,https://scrapsfromtheloft.com/comedy/ronny-chi...,"['[tuning]', '[gentle Hawaiian music playing o...",tuning gentle hawaiian music playing over radi...,Ronny Chieng: Love to Hate It (2024),2024.0,Ronny Chieng,Love to Hate It,65.0,7.1,en


In [68]:
import requests
from bs4 import BeautifulSoup
import re

def _parse_runtime_minutes(text):
    """Convert an IMDb runtime label such as "1h 30m", "2h" or "58m" to minutes.

    Returns None when ``text`` does not look like a runtime label.
    """
    if not re.match(r'^\d+h\s*\d*m?$|^\d+m$', text):
        return None
    hours = re.search(r'(\d+)h', text)
    minutes = re.search(r'(\d+)m', text)
    total_minutes = 0
    if hours:
        total_minutes += int(hours.group(1)) * 60
    if minutes:
        total_minutes += int(minutes.group(1))
    return total_minutes


def scrape_imdb_details(url):
    """Scrape the title, runtime and rating from an IMDb title page.

    The lookup is best-effort: any error (network failure, HTTP error status,
    unexpected markup) is printed and the fields gathered so far are returned.

    Parameters
    ----------
    url : str
        URL of the IMDb title page.

    Returns
    -------
    dict
        Keys ``"title"``, ``"runtime"`` and ``"rating"``. ``title`` and
        ``rating`` are strings and ``runtime`` is an ``int`` number of
        minutes; any field that could not be found holds ``"Not found"``.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    details = {"title": "Not found", "runtime": "Not found", "rating": "Not found"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # --- Title ---
        title_element = soup.find('h1', {'data-testid': 'hero__pageTitle'})
        if title_element:
            # Join nested text nodes with a space; without a separator the
            # title and the year run together ("...Crushing It(2024)").
            details['title'] = title_element.get_text(" ", strip=True)

        # --- Runtime ---
        # The runtime is one of several inline list items; take the first one
        # whose text looks like a runtime label.
        for li in soup.find_all('li', class_='ipc-inline-list__item'):
            total_minutes = _parse_runtime_minutes(li.get_text(strip=True))
            if total_minutes is not None:
                details['runtime'] = total_minutes  # integer value
                break

        # --- Rating ---
        rating_element = soup.find('span', class_='ipc-rating-star--rating') or \
                         soup.find('span', {'data-testid': 'hero-rating-bar__aggregate-rating__score'})
        if rating_element:
            details['rating'] = rating_element.get_text(strip=True)

    except Exception as e:
        # Deliberate best-effort: report the problem and return what we have.
        print(f"Scraping error: {e}")

    return details

if __name__ == "__main__":
    # Example run against a single IMDb reference page
    url = "https://www.imdb.com/title/tt34253921/reference"
    details = scrape_imdb_details(url)

    # Print a header followed by one "Key: value" line per scraped field
    print("\n--- Scraped Details from IMDb ---")
    for key, value in details.items():
        print("  {}: {}".format(key.title(), value))



--- Scraped Details from IMDb ---
  Title: Fortune Feimster: Crushing It(2024)
  Runtime: 58
  Rating: 6.9
