In [1]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [2]:
import pandas as pd

In [27]:
### Imports
import string
import re 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Preprocessing of Text for Analysis Purposes

## Import filtered dataframe

In [4]:
df = pd.read_json('../raw_data/df_with_age_gender.json')

In [5]:
pd.set_option("display.max_rows", None)

In [6]:
len(df)

555

## Cleaning Functions

In [92]:
### Specific functions to our data - Scraps from the Loft and BBC

# remove notes (Bo Burnham Only)
def clean_bo(text):
    txt = text
    for note in '♫♪':
        txt = txt.replace(note, '')
    return txt

def remove_music(text):
    text = re.sub('♪.*?♪', '', text) # remove ♪ stuff that looks like this ♪
    text = re.sub('♫.*?♫', '', text) # remove ♫ stuff that looks like this ♫
    return text

def remove_bracketed(text):
    text = re.sub('\[.*?\]', '', text) # remove [stuff that looks like this]
    text = re.sub('\(.*?\)', '', text) # remove (stuff that looks like this)
    return text

def remove_speaker_tags(text):
    text = re.sub('\s[\w-]+( \w+)?:\s', ' ', text) # remove Word: or Word word: with a newline or space before
    return text

def remove_info(text):
    text = re.sub('subtitle(s)? by .*', '', str(text)) # remove subtile(s) by xxxx
    text = re.sub('(a)? netflix (original )?(comedy )?(special ?)?', '', text) # remove A Netflix Original Comedy Special
    text = re.sub('(this )?(programme )?(contains )?(very |some )?strong language( |\.)', '', text) # remove strong language
    text = re.sub('adult humou?r( |\.?)?', '', text) # remove adult humour
    text = re.sub('(original )?air date', '', text) # remove air date
    return text

In [12]:
# general functions for text pre-processing
def remove_punc(text, chars):
    txt = text
    for punc in chars:
        txt = txt.replace(punc, '')
    return txt

def remove_num(text):
    return ''.join(char for char in text if not char.isdigit())

def remove_stopw(text, word_list):
    word_tokens = word_tokenize(text)
    return ' '.join(w for w in word_tokens if not w in word_list)

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split(' ') if len(lemmatizer.lemmatize(word))>2)

In [13]:
lemmatizer_dict = {'got': 'get',
                  'gon': 'go',
                  'said': 'say',
                  'fucking': 'fuck',
                  'went': 'go',
                  'finding': 'find'}

def manual_lemmatizer(text):
    for k, v in lemmatizer_dict.items():
        text.replace(k, v)
    return text

## Modifying & applying removal lists 

In [93]:
clean_df = df.copy()

### Remove everything in Brackets, Music notes

In [94]:
clean_df['full_transcript_clean'] = clean_df['full_transcript'].apply(remove_bracketed)

In [95]:
### clean Bo before removing music
clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'] = clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'].apply(clean_bo)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'] = clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'].apply(clean_bo)


In [96]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_music)

### Lowercase, remove useless regex matches
Including specific scraps/BBC format

In [97]:
# lowercase all the words
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].str.lower()

In [99]:
# remove speaker tags and info regexes
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_info)
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_speaker_tags)

In [100]:
clean_df

Unnamed: 0,full_transcript,artist,show_name,year,source,artist_birthday,artist_gender,age_then,full_transcript_clean
0,[rock music playing]\n\n[indistinct chatter]\n...,Adam Devine,Best Time Of Our Lives,2019.0,Scraps from the Loft,1983.0,2,36.0,"\n\n\n\nhey, man. how are you?\n\n\n\nthank yo..."
1,"strong language. CHEERING Yes, yes, yes! How...",Adam Hess,Live from the BBC,2016.0,BBC,1990.0,2,26.0,"cheering yes, yes, yes! how are you doing? ..."
2,"Ladies and gentlemen, please welcome your hos...",Adam Hills,Live at the Apollo Series 9 Episode 4,2013.0,BBC,1970.0,2,43.0,"ladies and gentlemen, please welcome your hos..."
3,some strong language and adult humour Ladies...,Adam Hills,Live at the Apollo Series 12 Episode 4,2016.0,BBC,1970.0,2,46.0,"and ladies and gentlemen, please welcome you..."
4,"Adam Hills? CHEERING Hello, Apollo. I am going...",Adam Hills,Live at the Apollo Series 5 Episode 5,2009.0,BBC,1970.0,2,39.0,"adam hills? cheering hello, apollo. i am going..."
5,"[man] Okay, ready, and… Take your own cue, Ada...",Adam Sandler,100% Fresh,2018.0,Scraps from the Loft,1966.0,2,52.0,"okay, ready, and… take your own cue, adam. an..."
6,A NETFLIX COMEDY SPECIAL\nRecorded at the Casi...,Adel Karam,Live From Beirut,2018.0,Scraps from the Loft,1972.0,2,46.0,"\nrecorded at the casino du liban, beirut\n\nh..."
7,Aisling Bea! APPLAUSE AND CHEERING Hello! Hell...,Aisling Bea,Live at the Apollo Series 10 Episode 3,2014.0,BBC,1984.0,1,30.0,aisling bea! applause and cheering hello! hell...
8,[dog barks] [FisherGreen’s Sisters Brothers pl...,Al Madrigal,Why Is The Rabbit Crying?,2013.0,Scraps from the Loft,1971.0,2,42.0,"ii – ladies and gentlemen, please give a war..."
9,"Ladies and gentlemen, please welcome your hos...",Al Murray,Live at the Apollo Series 4 Episode 2,2008.0,BBC,1968.0,2,40.0,"ladies and gentlemen, please welcome your hos..."


### Remove numbers and stopwords + common comedy words, remove punctuation

In [101]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_num)

In [102]:
### additional words to remove from the scripts
words_to_remove = ['thank', 'cheering', 'recorded', 'applause', 'laughter', 'laughing', 'murmuring', 'chatter',
                       'aired', 'filmed', 'ladies', 'gentlemen', 'welcome', 'stage', 'transcript', 'netflix']
                    
                    # 'know'? 'go'? 'fuck'?
    
                    # haven't left 'i'm' etc. as those should be cleaned up
                    # by a mixture of stopwords, punctuation removeal, lemmatizing and minimum length

stopwords_plus = words_to_remove + stopwords.words('english')

clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_stopw, args=(stopwords_plus,))

In [103]:
punc_added = string.punctuation + '“”‘’…♪♫¶'

clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_punc, args=(punc_added,))

### Lemmatize

In [104]:
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(lemmatize).apply(manual_lemmatizer)

In [106]:
print(clean_df['full_transcript_clean'].iloc[69])

southern california ready good time tonight get excited make loud brad williams goin everybody way live dammit gon try guy ready party guy ready lot fuckin fun tonight good good like havin fun like party know party like drink one main reason like drink never pay ever cause everyone want know happens dwarf get drunk right thinking wonder happens would really like know that know like hot chick hot chick thing common walk club everyone look gettin fucked tonight true know nothing happens drink nothing crazy like know one friend told like man get midget drunk explode gold coin ridiculous like awesome nothing happens basically drink thing hot chick drink get emotional probably text end night might blow dude also say thing drunk would absolutely never say sober circumstance like long ago watching game buddy team put money excited turned friend said something would never say sober turn yeah chest bump never say shit never say friend six foot two okay six foot two sorry burst bubble tall frien