In [1]:
# Web scraping, pickle imports
import os
import pickle
import re
import string

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url, timeout=30):
    '''Fetch one transcript page and return its paragraphs as a list of strings.

    Specific to scrapsfromtheloft.com: the transcript is read from the <p>
    tags inside the element with class "post-content".

    Args:
        url: Address of the transcript page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the text of each paragraph, in page order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no "post-content" element.
    '''
    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing their HTML as a transcript
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    content = soup.find(class_="post-content")
    if content is None:
        raise ValueError('No "post-content" element found at ' + url)
    text = [p.text for p in content.find_all('p')]
    # Progress marker: one line per page fetched
    print(url)
    return text

# Transcripts in scope, as (comedian name, transcript URL) pairs
transcript_sources = [
    ('trevor',
     'https://scrapsfromtheloft.com/2018/11/21/trevor-noah-son-of-patricia-transcript/'),
    ('russell',
     'https://scrapsfromtheloft.com/2018/06/29/russell-peters-almost-famous-2016-full-transcript/'),
    ('hasan',
     'https://scrapsfromtheloft.com/2017/05/02/hasan-minhaj-white-house-correspondents-dinner-transcript/'),
]

# URLs of transcripts in scope
urls = [link for name, link in transcript_sources]

# Comedian names, in the same order as urls
comedians = [name for name, link in transcript_sources]


In [2]:
# Actually request transcripts (takes a few minutes to run)
# One list of paragraphs per URL, in the same order as urls
transcripts = list(map(url_to_transcript, urls))

https://scrapsfromtheloft.com/2018/11/21/trevor-noah-son-of-patricia-transcript/
https://scrapsfromtheloft.com/2018/06/29/russell-peters-almost-famous-2016-full-transcript/
https://scrapsfromtheloft.com/2017/05/02/hasan-minhaj-white-house-correspondents-dinner-transcript/


In [3]:
# Pickle files for later use

# Make a new directory to hold the text files (no error if it already exists)
os.makedirs("transcripts", exist_ok=True)

# One pickle per comedian; the "Load pickled files" cell reads these same paths
for c, transcript in zip(comedians, transcripts):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcript, file)

In [4]:
# Load pickled files into a dict keyed by comedian name
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself
data = {}
for c in comedians:
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Sanity check to make sure data has been loaded properly:
# there should be one key for each name in comedians
data.keys()


dict_keys(['russell', 'trevor', 'hasan'])

In [6]:
# Peek at the first three entries stored under 'trevor'
data['trevor'][:3]

['A NETFLIX ORIGINAL COMEDY SPECIAL [distant traffic] LIVE NATION PRESENTS TREVOR NOAH',
 '[presenter] Beautiful people, put your hands together for Trevor Noah. [shouting and whooping] [hip hop intro music] [applause]',
 'What’s going on, Los Angeles? [louder cheering] Welcome to the show. Thank you for coming out. Thank you for being here. This is so much fun. Welcome out. Oh, look at all of you. This is so dope. I love LA. I love everything about LA. Even the things people hate about LA, I love. I love the traffic in LA. It’s like one of my favorite experiences. Yeah, when you don’t live here, it’s great. It’s wonderful. Because you get to be a part of it, but it’s not yours forever. It’s like anther person’s child, you know what I’m saying? Yeah, you get to be like, “This is crazy. Have it back.” -That’s what it feels like. -[laughter] -I love it, man. -[indistinct shout] I love the vibe. You know? I love driving out here. You know. And while I’m out here, I get to listen to the ra

In [7]:
# Let's take a look at our data again, starting with the first key
next(iter(data))

'russell'

In [8]:
# ...and the value stored under that first key
data[next(iter(data))]

['-Yeah, Russell! Oi, Russell! Your mother is so fat. What the hell did you say to me? Do I look like Ryan’s mom? Somebody gonna get a hurt real bad. Oh, DDR. So good. His best buddy is in the house. Best what? Best buddy. Oh, my God. Thank you. All right! And now, ladies and gentlemen, Canada’s own and the pride of Brampton, Ontario… Toronto, give it up for your hometown boy, Russell… Peters.',
 'Hey, thank you. Thanks. All right. There you are. There you are. There’s my city. Heh. That’s it. That’s what I’m talking about. Hey, give it up for, uh, DJ Spinbad and Starting From Scratch, ladies and gentlemen. Live DJing. Live. Not dead. Live. Asian guy, how are you? Good. You’re looking very anime. -What style..? Are you Filipino? -Chinese. Chinese. That’s the main kind of Asian. That’s.. That’s the one they make the most of. They, uh… You can pick them up fairly cheap. They’re reasonably priced. They’re.. -Heh, what’s your name? -Kat Sang Bong. I’ll.. I’ll do the jokes now. Are you..? Y

In [9]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [10]:
# Combine it! Each value becomes a one-element list holding the joined transcript
data_combined = dict(
    (key, [combine_text(value)])
    for key, value in data.items()
)

In [11]:
# Show up to 150 characters per cell when a frame is displayed
pd.set_option('max_colwidth', 150)

# One row per comedian (dict keys become the index), a single 'transcript'
# column, rows sorted by comedian name
df = (
    pd.DataFrame.from_dict(data_combined, orient='index')
    .rename(columns={0: 'transcript'})
    .sort_index()
)
df

Unnamed: 0,transcript
hasan,Hasan Minhaj’s full comedy routine at the 2017 White House Correspondents’ Dinner. The comedian told one blistering joke after another about the s...
russell,"-Yeah, Russell! Oi, Russell! Your mother is so fat. What the hell did you say to me? Do I look like Ryan’s mom? Somebody gonna get a hurt real bad..."
trevor,"A NETFLIX ORIGINAL COMEDY SPECIAL [distant traffic] LIVE NATION PRESENTS TREVOR NOAH [presenter] Beautiful people, put your hands together for Tre..."


In [12]:
# Full combined transcript for the 'hasan' row
df.loc['hasan', 'transcript']

'Hasan Minhaj’s full comedy routine at the 2017 White House Correspondents’ Dinner. The comedian told one blistering joke after another about the sitting President of the United States, Donald Trump, and the media that covers him.\xa0 Okay, listen, I get it. I get it. We gotta address the elephant that’s not in the room. \nThe leader of our country is not here. And that’s because he lives in Moscow. \nIt is a very long flight. It’d be hard for Vlad to make it. \nVlad can’t just make it on a Saturday. It’s a Saturday. * * * Thank you, wow, oh my God. Ladies and gentlemen, welcome to the series finale of the White House correspondents’ dinner. Oh man. My name is Hasan Minhaj, or, as I will be known in a few weeks, Number 830287. Who would have thought, with everything going on in the country, that a Muslim would be standing on this stage — for the ninth year in a row, baby. We had eight years of Barack. What’s another year? I see you, fam. I see you, Barry. What you doing? You jet-skiing

In [13]:
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed spans can
        leave consecutive spaces behind.
    '''
    text = text.lower()
    # Non-greedy, so each [ ... ] pair is removed separately ('.' does not cross newlines)
    text = re.sub(r'\[.*?\]', '', text)
    # ASCII punctuation only; curly quotes and ellipsis characters survive this step
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Any run of word characters that contains at least one digit
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def round1(x):
    '''Apply the first cleaning round to a single transcript string.'''
    return clean_text_round1(x)


# Let's take a look at the updated text
data_clean = df['transcript'].apply(round1).to_frame()
data_clean



Unnamed: 0,transcript
hasan,hasan minhaj’s full comedy routine at the white house correspondents’ dinner the comedian told one blistering joke after another about the sittin...
russell,yeah russell oi russell your mother is so fat what the hell did you say to me do i look like ryan’s mom somebody gonna get a hurt real bad oh ddr ...
trevor,a netflix original comedy special live nation presents trevor noah beautiful people put your hands together for trevor noah what’s going on l...


In [14]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Removes curly quotes and ellipsis characters, and turns newlines into
    spaces so the words on either side of a line break stay separate.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # A newline separates words; deleting it outright would glue them together
    text = re.sub('\n', ' ', text)
    return text

def round2(x):
    '''Apply the second cleaning round to a single transcript string.'''
    return clean_text_round2(x)

# Let's take a look at the updated text
data_clean = data_clean['transcript'].apply(round2).to_frame()
data_clean

Unnamed: 0,transcript
hasan,hasan minhajs full comedy routine at the white house correspondents dinner the comedian told one blistering joke after another about the sitting ...
russell,yeah russell oi russell your mother is so fat what the hell did you say to me do i look like ryans mom somebody gonna get a hurt real bad oh ddr s...
trevor,a netflix original comedy special live nation presents trevor noah beautiful people put your hands together for trevor noah whats going on lo...


In [15]:
# Disabled lemmatization experiment: everything below sits inside a bare string
# literal, so evaluating this cell only produces a str and none of it executes.
# NOTE(review): as written the snippet would not run if re-enabled -- it calls
# word_tokenize without importing it, and round3 calls stemming_round3, which is
# not defined anywhere in the visible notebook; lemmatization_round4 is defined
# but never called.
"""
from nltk.stem import WordNetLemmatizer

def lemmatization_round4(text):
    tokens=word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized

round3 = lambda x: stemming_round3(x)

data_clean = pd.DataFrame(data_clean.transcript.apply(round3))
data_clean
"""

'\nfrom nltk.stem import WordNetLemmatizer\n\ndef lemmatization_round4(text):\n    tokens=word_tokenize(text)\n    wordnet_lemmatizer = WordNetLemmatizer()\n    lemmatized = [wordnet_lemmatizer.lemmatize(token) for token in tokens]\n    return lemmatized\n\nround3 = lambda x: stemming_round3(x)\n\ndata_clean = pd.DataFrame(data_clean.transcript.apply(round3))\ndata_clean\n'

In [16]:
# Redisplay the raw corpus; the cleaning rounds wrote their results to data_clean, not df
df

Unnamed: 0,transcript
hasan,Hasan Minhaj’s full comedy routine at the 2017 White House Correspondents’ Dinner. The comedian told one blistering joke after another about the s...
russell,"-Yeah, Russell! Oi, Russell! Your mother is so fat. What the hell did you say to me? Do I look like Ryan’s mom? Somebody gonna get a hurt real bad..."
trevor,"A NETFLIX ORIGINAL COMEDY SPECIAL [distant traffic] LIVE NATION PRESENTS TREVOR NOAH [presenter] Beautiful people, put your hands together for Tre..."


In [17]:
# Let's add the comedians' full names as well
# Keyed by the short name used as df's index, so each full name lands on the
# right row regardless of how the rows happen to be ordered
full_name_by_key = {
    'hasan': 'Hasan Minhaj',
    'russell': 'Russell Peters',
    'trevor': 'Trevor Noah',
}
# Raises KeyError for an index label with no entry instead of misaligning silently
full_names = [full_name_by_key[key] for key in df.index]

df['full_name'] = full_names
df

Unnamed: 0,transcript,full_name
hasan,Hasan Minhaj’s full comedy routine at the 2017 White House Correspondents’ Dinner. The comedian told one blistering joke after another about the s...,Hasan Minhaj
russell,"-Yeah, Russell! Oi, Russell! Your mother is so fat. What the hell did you say to me? Do I look like Ryan’s mom? Somebody gonna get a hurt real bad...",Russell Peters
trevor,"A NETFLIX ORIGINAL COMEDY SPECIAL [distant traffic] LIVE NATION PRESENTS TREVOR NOAH [presenter] Beautiful people, put your hands together for Tre...",Trevor Noah


In [18]:
# Let's pickle it for later use (written to the current working directory)
corpus_path = "corpus.pkl"
df.to_pickle(corpus_path)


In [19]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on releases that predate it
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Rows are comedians (same index as data_clean), columns are vocabulary terms
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index

data_dtm

Unnamed: 0,aah,abandoned,able,absentia,absolute,absolutely,accent,accept,access,accommodating,...,yo,yogurt,york,youd,youll,young,younger,youre,youve,zero
hasan,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,16,0,1
russell,1,0,1,0,1,1,3,0,2,0,...,4,0,3,3,3,3,1,48,7,0
trevor,0,1,0,0,0,0,0,1,0,1,...,8,1,3,3,9,0,0,20,18,0


In [20]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

# Let's also pickle the cleaned data (before we put it in document-term matrix format)
# and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# Context manager so the file is flushed and closed as soon as the dump finishes
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)