## Data Collection

Scrape the website https://scrapsfromtheloft.com/stand-up-comedy-scripts/ -> preprocess and clean the text -> save to CSV for further analysis.

In [1]:
# importing required Libraries
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import pickle
import joblib
import os

import re
import string
from langdetect import detect

import imdb
imdb = imdb.IMDb()

import warnings
warnings.filterwarnings('ignore')

In [2]:
def scrape_links(x, timeout=30):
    """Return the ``href`` of every anchor in the transcript listing section.

    Parameters
    ----------
    x : str
        URL of the index page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    list
        One entry per ``<a>`` inside the listing container, in document
        order. An anchor without an ``href`` attribute contributes ``None``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the listing container is not present in the page.
    """
    response = requests.get(x, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    link_data = BeautifulSoup(response.text, "lxml")

    # The container is looked up by its full class string, so any change to
    # the page's markup makes this lookup return None.
    container = link_data.find(class_="elementor-section elementor-top-section elementor-element elementor-element-b70b8d7 elementor-section-boxed elementor-section-height-default elementor-section-height-default")
    if container is None:
        raise ValueError(f"Transcript listing container not found on {x}")

    return [anchor.get('href') for anchor in container.find_all("a")]

def scrape_tags(x, timeout=30):
    """Return the text of every ``<h3>`` in the transcript listing section.

    Parameters
    ----------
    x : str
        URL of the index page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    list of str
        The ``.text`` of each ``<h3>`` inside the listing container, in
        document order, with its surrounding whitespace left untouched.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the listing container is not present in the page.
    """
    response = requests.get(x, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    link_data = BeautifulSoup(response.text, "lxml")

    # The container is looked up by its full class string, so any change to
    # the page's markup makes this lookup return None.
    container = link_data.find(class_="elementor-section elementor-top-section elementor-element elementor-element-b70b8d7 elementor-section-boxed elementor-section-height-default elementor-section-height-default")
    if container is None:
        raise ValueError(f"Transcript listing container not found on {x}")

    return [heading.text for heading in container.find_all("h3")]

def scrape_transcript(x, timeout=30):
    """Return the paragraphs of a single transcript page.

    Parameters
    ----------
    x : str
        URL of the transcript page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        This function is called once per transcript, so a single stalled
        connection would otherwise block the whole scrape.

    Returns
    -------
    list of str
        The ``.text`` of each ``<p>`` inside the post-content container,
        in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the post-content container is not present in the page.
    """
    response = requests.get(x, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    link_data = BeautifulSoup(response.text, "lxml")

    # The container is looked up by its full class string, so any change to
    # the page's markup makes this lookup return None.
    container = link_data.find(class_="elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content")
    if container is None:
        raise ValueError(f"Transcript content container not found on {x}")

    return [paragraph.text for paragraph in container.find_all("p")]

In [3]:
# Index page that lists every stand-up transcript
index_url = "https://scrapsfromtheloft.com/stand-up-comedy-scripts/"
links = scrape_links(index_url)
tags = scrape_tags(index_url)

# Tags and links are combined row by row into one DataFrame further down.
# Check that they line up now, before spending one HTTP request per
# transcript page on data that could not be assembled anyway.
if len(links) != len(tags):
    raise ValueError(
        f"Scraped {len(links)} links but {len(tags)} tags from {index_url}; "
        "they must match one-to-one"
    )

# One request per transcript page -- this is the slow step of the notebook
transcript = [scrape_transcript(url) for url in links]

In [4]:
# Single-column frames holding just the tags and just the links
frame_tag = pd.DataFrame(tags, columns=["Tag"])
frame_link = pd.DataFrame(links, columns=["URL"])

# Column name -> scraped values (pandas requires the three lists to be the same length)
data = dict([
    ("Tag", tags),
    ("URL", links),
    ("Raw Transcript", transcript),
])
frame = pd.DataFrame(data)

# Put a 0-based serial number in the first column
frame.insert(0, 'S No.', np.arange(len(frame)))

In [5]:
# Removing unwanted characters from tags and extract name , title and year.
# lstrip/rstrip take a *set* of characters, so each character is listed once:
# leading newlines/tabs are removed, and trailing newlines/tabs/dashes.
frame['Tag'] = frame['Tag'].str.lstrip('\n\t').str.rstrip('-\n\t')

# Name: the leading run of word characters, whitespace and dots
frame['Names'] = frame['Tag'].str.extract(r'([\w\s.]+)', expand=False)

# Title: the leading run of characters allowed in a title. The hyphen is
# placed last in the class so it is a literal '-'. Previously it sat between
# '!' and '%', which made `!-%` a character *range* (! " # $ %) and left the
# hyphen itself unmatched, truncating titles at their first '-'. The
# characters that range used to match are kept so existing matches do not shrink.
frame['Title'] = frame['Tag'].str.extract(r'([\w\s.:,’*?!"#$%-]+)', expand=False)

# Year: the first run of four digits anywhere in the tag
frame['Year'] = frame['Tag'].str.extract(r'(\d{4})', expand=False)

In [6]:
# Make sure the output folder exists before anything is written into it
if not os.path.exists("transcripts"):
    os.mkdir("transcripts")

# Write each raw transcript to its own file, named after the row's serial
# number. Despite the .txt extension these files contain pickle data, not
# plain text.
Sr = frame['S No.'].tolist()
for position, serial in enumerate(Sr):
    target = "transcripts/" + str(serial) + ".txt"
    with open(target, "wb") as file:
        pickle.dump(frame['Raw Transcript'][position], file)

# Read every pickled transcript back into a dict keyed by serial number
data = {}
for serial in Sr:
    source = "transcripts/" + str(serial) + ".txt"
    with open(source, "rb") as file:
        data[serial] = pickle.load(file)

# Helper that flattens a list of text pieces into one string
def combine_text(list_of_text):
    """Join the strings in ``list_of_text`` with single spaces and return the result."""
    separator = ' '
    return separator.join(list_of_text)

# Collapse each transcript's list of text pieces into a single string,
# wrapped in a one-element list so it becomes a single cell
data_combined = {}
for key, value in data.items():
    data_combined[key] = [combine_text(value)]

# from_dict lays the transcripts out one per column; transpose so that each
# transcript is a row, then name the single resulting column
frame_trans = pd.DataFrame.from_dict(data_combined).T
frame_trans.columns = ['Transcript']

## Cleaning The Data

In [7]:
# Define the clean function to preprocess text
def clean(text):
    """Normalise a transcript string for text analysis.

    Steps, in order:
      1. drop anything inside square brackets (the brackets must be on one line),
      2. lowercase,
      3. drop ASCII punctuation,
      4. turn newlines into spaces,
      5. drop curly quotes, ellipsis characters, music notes and parentheses,
      6. drop every token that contains a digit.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    # Bracketed text goes first, while the brackets still exist
    # (string.punctuation below includes '[' and ']')
    text = re.sub(r'\[.*?\]', '', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than nothing, so the last word of
    # one line is not fused with the first word of the next
    text = re.sub(r'\n', ' ', text)
    # Non-ASCII characters that string.punctuation does not cover
    text = re.sub(r'[‘’“”…♪)(]', '', text)
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run clean() over every transcript; the result is kept as a one-column frame
# with the same index as before
frame_trans = frame_trans['Transcript'].apply(clean).to_frame()

# Attach the cleaned 'Transcript' column to 'frame' side by side (aligned on
# the index), then discard every row that has a missing value in any column
frame = pd.concat([frame, frame_trans], axis=1).dropna()

### Get runtime and rating info from the IMDb API

In [8]:
def get_imdb_info(titles):
    """Look up runtime and rating for each title through the module-level ``imdb`` client.

    The lookup is best effort: a title that cannot be searched, has no search
    hit, or has no usable runtime is recorded as ``''`` in both outputs and
    counted as "not found". The total is printed at the end.

    Parameters
    ----------
    titles : iterable of str
        Titles to look up. Only the first 30 characters of each are searched.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(runtime, rating)``, one entry per input title in input order. Both
        Series carry a fresh 0..n-1 index, not the index of ``titles``.
        ``runtime`` holds ints and ``rating`` holds whatever
        ``mov.get('rating')`` returned (presumably ``None`` when IMDb has no
        rating -- confirm against the client); failed lookups hold ``''``.
    """
    runtime = []
    rating = []
    errors = 0

    for title in titles:
        try:
            # Search the first 30 characters on IMDb and take the top hit
            result = imdb.search_movie(title[:30])
            mov = imdb.get_movie(result[0].movieID, info=['main'])
            minutes = int(mov.get('runtimes')[0])
            score = mov.get('rating')
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still stops
            # this long-running loop
            minutes = score = ''
            errors += 1
        # Append both values together so the two lists can never drift apart
        runtime.append(minutes)
        rating.append(score)

    print(f'Total not found: {errors}')
    return pd.Series(runtime), pd.Series(rating)
    
# Look up IMDb metadata for every title still in the frame
# (this makes network calls per title, so it is slow)
runtime, rating = get_imdb_info(frame.Title)

# get_imdb_info returns Series with a fresh 0..n-1 index, while `frame` keeps
# its original labels after the earlier dropna(). Assign by position, since
# label alignment would put values on the wrong rows wherever the index has gaps.
frame['runtime'] = runtime.to_numpy()
frame['rating'] = rating.to_numpy()

# Replace empty values with NaN (np.nan: the np.NaN alias was removed in NumPy 2.0)
frame = frame.replace(r'^\s*$', np.nan, regex=True)

Total not found: 32


In [14]:
# Detect language based on the first 500 characters and create a language column in the df.
# Rows whose transcript is missing or blank get NaN instead: blank strings
# were replaced with NaN earlier in the notebook (slicing a float fails), and
# langdetect raises when the text has no detectable features.
frame['language'] = frame.Transcript.apply(
    lambda x: detect(x[:500]) if isinstance(x, str) and x[:500].strip() else np.nan
)
print(frame.language.value_counts())

language
en    398
it      6
es      1
Name: count, dtype: int64


In [11]:
# Get indices for which transcripts are empty.
# Blank strings were replaced with NaN earlier in the notebook, so a plain
# `== ''` comparison would match nothing; treat NaN and whitespace-only
# transcripts as empty too.
is_empty = frame.Transcript.fillna('').str.strip().eq('')
drop_indices = frame[is_empty].index

# Build a new frame instead of mutating in place, then renumber the rows
frame = frame.drop(drop_indices).reset_index(drop=True)

In [16]:
# Saving the csv file: writes every column of `frame` to frame.csv in the
# current working directory; index=False leaves the row index out of the file.
# NOTE(review): 'Raw Transcript' holds Python lists, which CSV can only store
# as text -- confirm downstream readers expect that.
frame.to_csv("frame.csv", index=False)

In [2]:
import os
import joblib

def create_joblib(folder_path, output_path='transcripts_joblib'):
    """Read every ``.txt`` file in a folder and save the contents as one joblib file.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive). Only names ending in ``.txt`` are read.
    output_path : str, optional
        Where the list is dumped. Defaults to ``'transcripts_joblib'`` in the
        current working directory.

    Returns
    -------
    None
        The list of file contents is written with ``joblib.dump``.

    Notes
    -----
    Files are read in a deterministic order: purely numeric names
    (``1.txt``, ``2.txt``, ``10.txt``) in numeric order, then all other names
    alphabetically. ``os.listdir`` alone returns names in arbitrary order,
    which made the saved list impossible to line up with anything else.

    NOTE(review): files are decoded as latin-1 text. An earlier cell of this
    notebook writes *pickled* lists into ``transcripts/*.txt``; reading such
    files as text yields pickle byte noise -- confirm that ``folder_path``
    holds plain-text transcripts.
    """
    def _order(name):
        # Numeric stems sort by value so that 10.txt comes after 2.txt
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    file_names = sorted(
        (name for name in os.listdir(folder_path) if name.endswith(".txt")),
        key=_order,
    )

    transcripts = []
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='latin-1') as file:
            transcripts.append(file.read())

    joblib.dump(transcripts, output_path)

# Example usage
# The default is the original author's local folder. Set the TRANSCRIPTS_DIR
# environment variable to point at the transcripts folder on another machine
# instead of editing this cell.
folder_path = os.environ.get(
    'TRANSCRIPTS_DIR',
    'D:\\NLP\\Transcript Analysis\\Transcript Analysis\\transcripts',
)
create_joblib(folder_path)
