# Preprocessing of Text for Analysis Purposes

## Import filtered dataframe

In [37]:
import pandas as pd

df = pd.read_json('../raw_data/filtered_scraps_from_the_loft.json')

In [38]:
pd.set_option("display.max_rows", None)

In [28]:
df

Unnamed: 0,url,title,full_transcript,full_words
0,https://scrapsfromtheloft.com/comedy/jim-gaffi...,Jim Gaffigan: Comedy Monster (2021) | Transcript,"Thank you! Thank you! Oh, my gosh. Thank you s...","[Thank, you, Thank, you, Oh, my, gosh, Thank, ..."
1,https://scrapsfromtheloft.com/comedy/louis-c-k...,Louis C. K.: Sorry (2021) | Transcript,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,"[Like, a, Rolling, Stone, by, Bob, Dylan, play..."
2,https://scrapsfromtheloft.com/comedy/drew-mich...,Drew Michael: Red Blue Green (2021) | Transcript,(EMOTIONAL MUSIC PLAYING)\n\n(MUSIC ENDS)\n\nD...,"[EMOTIONAL, MUSIC, PLAYING, MUSIC, ENDS, DREW,..."
3,https://scrapsfromtheloft.com/comedy/mo-amer-m...,Mo Amer: Mohammed in Texas (2021) | Transcript,[quirky flute music playing]\n\n[single note p...,"[quirky, flute, music, playing, single, note, ..."
4,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle: The Closer (2021) | Transcript,[audience murmuring]\n\n[murmuring continues]\...,"[audience, murmuring, murmuring, continues, au..."
5,https://scrapsfromtheloft.com/comedy/kathleen-...,Kathleen Madigan: Bothering Jesus (2016) – Tra...,[chattering]\n\n[man] Whoo!\n\n[chattering]\n\...,"[chattering, man, Whoo, chattering, woman, Kat..."
6,https://scrapsfromtheloft.com/comedy/kathleen-...,Kathleen Madigan: Madigan Again (2013) – Trans...,"As our friend Jeff Harmon says, you know a cit...","[As, our, friend, Jeff, Harmon, says, you, kno..."
7,https://scrapsfromtheloft.com/comedy/phil-wang...,Phil Wang: Philly Philly Wang Wang (2021) – Tr...,[hip-hop music playing]\n\n[audience cheering ...,"[hiphop, music, playing, audience, cheering, a..."
8,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle: 8:46 – Transcript,8:46 is a performance special by comedian Dave...,"[846, is, a, performance, special, by, comedia..."
9,https://scrapsfromtheloft.com/comedy/tom-papa-...,Tom Papa: You’re Doing Great! (2020) – Transcript,"[applause, whooping]\n\n[presenter] Ladies and...","[applause, whooping, presenter, Ladies, and, g..."


## Cleaning Functions

In [32]:
### Imports
import string
import re 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [41]:
### Specific functions to standup/scraps from the loft
def remove_music(text):
    text = re.sub('♪.*?♪', '', text) # remove ♪ stuff that looks like this ♪
    return text

def remove_bracketed(text):
    text = re.sub('\[.*?\]', '', text) # remove [stuff that looks like this]
    text = re.sub('\(.*?\)', '', text) # remove (stuff that looks like this)
    return text

def remove_useless(text):
    text = re.sub('\n\w+\(\s\w+\)?\:\s', '', text) # remove Word: or Word word: with a newline before
    text = re.sub('subtitles? by \w+', '', text) # remove subtile(s) by xxxx
    return text

In [86]:
# general functions for text pre-processing
def remove_punc(text, chars):
    txt = text
    for punc in chars:
        txt = txt.replace(punc, '')
    return txt

def remove_num(text):
    return ''.join(char for char in text if not char.isdigit())

def remove_stopw(text, word_list):
    word_tokens = word_tokenize(text)
    return ' '.join(w for w in word_tokens if not w in word_list)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split(' '))

## Modifying & applying removal lists 

In [74]:
clean_df = df.copy()

### Remove everything in Brackets, Music notes

In [75]:
clean_df['full_transcript_clean'] = clean_df['full_transcript'].apply(remove_bracketed)

In [76]:
## TODO: Remove ♪ from specific comedians:
# [Bo Burnham]

In [77]:
# clean_df['full_transcript_clean'] = clean_df['full_transcript'].apply(remove_music)

### Lowercase, remove useless regex matches and remove punctuation
Including specific scraps format

In [78]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].str.lower()

In [79]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_useless)

In [80]:
punc_added = string.punctuation + '“”‘’…'

clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_punc, args=(punc_added,))

### Remove numbers and stopwords + common comedy words

In [88]:
words_to_remove = ['thank', 'cheering', 'recorded', 'applause', 'laughter', 'laughing', 'murmuring', 'chatter',
                       'aired', 'filmed', 'ladies', 'gentlemen', 'thats', 'im']
# other possible removals 'netflix special', 'full transcript' 

In [89]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [90]:
stopwords_plus = words_to_remove + stopwords.words('english')

In [91]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_num)

In [92]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_stopw, args=(stopwords_plus,))

In [93]:
clean_df

Unnamed: 0,url,title,full_transcript,full_transcript_clean
0,https://scrapsfromtheloft.com/comedy/jim-gaffi...,Jim Gaffigan: Comedy Monster (2021) | Transcript,"Thank you! Thank you! Oh, my gosh. Thank you s...",oh gosh much aw much aw nice almost makes forg...
1,https://scrapsfromtheloft.com/comedy/louis-c-k...,Louis C. K.: Sorry (2021) | Transcript,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,♪♪ ♪♪ ♪ upon time dressed fine ♪ ♪ threw bums ...
2,https://scrapsfromtheloft.com/comedy/drew-mich...,Drew Michael: Red Blue Green (2021) | Transcript,(EMOTIONAL MUSIC PLAYING)\n\n(MUSIC ENDS)\n\nD...,drew michael nice see people hard time people ...
3,https://scrapsfromtheloft.com/comedy/mo-amer-m...,Mo Amer: Mohammed in Texas (2021) | Transcript,[quirky flute music playing]\n\n[single note p...,gentleman feel excitement air go houston texas...
4,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle: The Closer (2021) | Transcript,[audience murmuring]\n\n[murmuring continues]\...,♪ listen carefully ♪ ♪ favorite band human bei...
5,https://scrapsfromtheloft.com/comedy/kathleen-...,Kathleen Madigan: Bothering Jesus (2016) – Tra...,[chattering]\n\n[man] Whoo!\n\n[chattering]\n\...,whoo ♪ kathleen ♪ ♪ madigan ♪ ♪ kathleen ♪ ♪ m...
6,https://scrapsfromtheloft.com/comedy/kathleen-...,Kathleen Madigan: Madigan Again (2013) – Trans...,"As our friend Jeff Harmon says, you know a cit...",friend jeff harmon says know citys great welco...
7,https://scrapsfromtheloft.com/comedy/phil-wang...,Phil Wang: Philly Philly Wang Wang (2021) – Tr...,[hip-hop music playing]\n\n[audience cheering ...,right wow nice oh wow gosh way come please nic...
8,https://scrapsfromtheloft.com/comedy/dave-chap...,Dave Chappelle: 8:46 – Transcript,8:46 is a performance special by comedian Dave...,performance special comedian dave chappelle vi...
9,https://scrapsfromtheloft.com/comedy/tom-papa-...,Tom Papa: You’re Doing Great! (2020) – Transcript,"[applause, whooping]\n\n[presenter] Ladies and...",tom papa look look new jersey yeah im people d...
