In [11]:
import string
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
pd.set_option("display.max_rows", None)

In [3]:
new_df = pd.read_json('../raw_data/all_data_df.json')
new_df.head()

Unnamed: 0,full_transcript,artist,show_name,year,source,artist_birthday,artist_gender,age_then
0,[rock music playing]\n\n[indistinct chatter]\n...,Adam Devine,Best Time Of Our Lives,2019.0,Scraps from the Loft,1983.0,2,36.0
1,"strong language. CHEERING Yes, yes, yes! How...",Adam Hess,Live from the BBC,2016.0,BBC,1990.0,2,26.0
2,"Ladies and gentlemen, please welcome your hos...",Adam Hills,Live at the Apollo Series 9 Episode 4,2013.0,BBC,1970.0,2,43.0
3,some strong language and adult humour Ladies...,Adam Hills,Live at the Apollo Series 12 Episode 4,2016.0,BBC,1970.0,2,46.0
4,"? CHEERING Hello, Apollo. I am going to start ...",Adam Hills,Live at the Apollo Series 5 Episode 5,2009.0,BBC,1970.0,2,39.0


# Cleaning functions

In [4]:
### Specific functions to our data - Scraps from the Loft and BBC

# remove notes (Bo Burnham Only)
def clean_bo(text):
    txt = text
    for note in '♫♪':
        txt = txt.replace(note, '')
    return txt

def remove_music(text):
    text = re.sub('♪.*?♪', '', text) # remove ♪ stuff that looks like this ♪
    text = re.sub('♫.*?♫', '', text) # remove ♫ stuff that looks like this ♫
    return text

def remove_bracketed(text):
    text = re.sub('\[.*?\]', '', text) # remove [stuff that looks like this]
    text = re.sub('\(.*?\)', '', text) # remove (stuff that looks like this)
    return text

def remove_speaker_tags(text):
    text = re.sub('\s[\w-]+( \w+)?:\s', ' ', text) # remove Word: or Word word: with a newline or space before
    return text

def remove_info(text):
    text = re.sub('subtitle(s)? by .*', '', str(text)) # remove subtile(s) by xxxx
    text = re.sub('(a)? netflix (original )?(comedy )?(special ?)?', '', text) # remove A Netflix Original Comedy Special
    text = re.sub('(this )?(programme )?(contains )?(very |some )?strong language( |\.)', '', text) # remove strong language
    text = re.sub('adult humou?r( |\.?)?', '', text) # remove adult humour
    text = re.sub('(original )?air date', '', text) # remove air date
    return text

In [5]:
# general functions for text pre-processing
def remove_punc(text, chars):
    txt = text
    for punc in chars:
        txt = txt.replace(punc, '')
    return txt

def remove_num(text):
    return ''.join(char for char in text if not char.isdigit())

def remove_stopw(text, word_list):
    word_tokens = word_tokenize(text)
    return ' '.join(w for w in word_tokens if not w in word_list)

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split(' ') if len(lemmatizer.lemmatize(word))>2)

In [6]:
lemmatizer_dict = {'got': 'get',
                  'gon': 'go',
                  'said': 'say',
                   'saying': 'say',
                  'fucking': 'fuck',
                  'went': 'go',
                  'finding': 'find',
                  'getting': 'get'}

def manual_lemmatizer(text):
    for k, v in lemmatizer_dict.items():
        text = text.replace(k, v)
    return text

# Cleaning all in one

In [7]:
clean_df = new_df.copy()

clean_df['full_transcript_clean'] = clean_df['full_transcript'].apply(remove_bracketed)

### clean Bo before removing music
clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'] = clean_df[
    'full_transcript_clean'][clean_df['artist']=='Bo Burnham'].apply(clean_bo)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['full_transcript_clean'][clean_df['artist']=='Bo Burnham'] = clean_df[


In [12]:
### additional words to remove from the scripts
words_to_remove = ['thank', 'cheering', 'recorded', 'applause', 'laughter', 'laughing', 'murmuring', 'chatter',
                       'aired', 'filmed', 'ladies', 'gentlemen', 'welcome', 'stage', 'transcript', 'netflix',
                  'apollo', 'like', 'goodnight', 'mutter', 'noo', 'nuh', 'oof', 'maan', 'fuck', 'cause', 'okay', 
                   'hey', 'also', 'someone', 'somebody', 'everybody', 'also', 'part' , 'sometimes', 'maybe', 
                   'three', 'second', 'everything', 'minute', 'name', 'kind', 'point', 'yeah', 'hello', 'one', 
                   'two', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'whine', 'hnn', 'malla', 'letta', 'namoo', 
                   'getta', 'nama', 'mana', 'chk', 'manoo', 'hadda', 'ama', 'carlin', 'host']
                    
                    # 'know'? 'go'? 'fuck'?
    
                    # haven't left 'i'm' etc. as those should be cleaned up
                    # by a mixture of stopwords, punctuation removeal, lemmatizing and minimum length

stopwords_plus = words_to_remove + stopwords.words('english')

punc_added = string.punctuation + '“”‘’…♪♫¶'

In [8]:
def preproc(text):
    text = text.apply(remove_music).str.lower().apply(
        remove_info).apply(remove_speaker_tags).apply(
        remove_num).apply(remove_stopw, args=(
        stopwords_plus,)).apply(remove_punc, args=(
        punc_added,)).apply(lemmatize).apply(
        manual_lemmatizer)
    return text

In [13]:
clean_df['full_transcript_clean'] = preproc(clean_df['full_transcript_clean'])
clean_df.head()

Unnamed: 0,full_transcript,artist,show_name,year,source,artist_birthday,artist_gender,age_then,full_transcript_clean
0,[rock music playing]\n\n[indistinct chatter]\n...,Adam Devine,Best Time Of Our Lives,2019.0,Scraps from the Loft,1983.0,2,36.0,man let right guy much take seat guy get jacke...
1,"strong language. CHEERING Yes, yes, yes! How...",Adam Hess,Live from the BBC,2016.0,BBC,1990.0,2,26.0,yes yes yes well lovely going apologise state ...
2,"Ladies and gentlemen, please welcome your hos...",Adam Hills,Live at the Apollo Series 9 Episode 4,2013.0,BBC,1970.0,2,43.0,please tonight london london live amazing act ...
3,some strong language and adult humour Ladies...,Adam Hills,Live at the Apollo Series 12 Episode 4,2016.0,BBC,1970.0,2,46.0,please tonight know hair lost bet british para...
4,"? CHEERING Hello, Apollo. I am going to start ...",Adam Hills,Live at the Apollo Series 5 Episode 5,2009.0,BBC,1970.0,2,39.0,going start say something probably never heard...


In [None]:
# pd.DataFrame.to_json(clean_df, 'fully_stripped_df.json')