In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Preprocessing of Text for Analysis Purposes

## Import filtered dataframe

In [None]:
import pandas as pd

# Read the filtered dataset from its JSON dump (relative path).
transcripts_path = '../raw_data/scraps_clean_new.json'
df = pd.read_json(transcripts_path)

In [None]:
# Lift the row limit so displayed frames/series are never truncated.
pd.options.display.max_rows = None

In [None]:
# Row count of the loaded frame.
len(df)

In [None]:
# `df2` is not defined by any earlier cell (it only exists as stale kernel
# state), so this cell raised NameError on a fresh run; read from `df` instead.
# NOTE(review): assumes `df` has a 'name' column — confirm.
df['name'].tolist()

In [None]:
# Render the loaded frame as the cell output.
df

In [None]:
# Every distinct value of the 'artist' column, as a plain list.
list(df['artist'].unique())

In [None]:
# Inspect the single record at positional index 285.
df.iloc[285]

## Cleaning Functions

In [None]:
### Imports
import string
import re 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
### Functions specific to stand-up transcripts from Scraps from the Loft
def remove_music(text):
    """Strip passages enclosed by a pair of ♪ symbols.

    The match is non-greedy and does not cross line breaks, so only
    ``♪ ... ♪`` pairs that sit on a single line are removed.
    """
    music_pattern = re.compile('♪.*?♪')
    return music_pattern.sub('', text)

def remove_bracketed(text):
    """Remove ``[bracketed]`` and ``(parenthesised)`` spans from ``text``.

    Square brackets are handled first, then round ones. Both matches are
    non-greedy and stay on one line; an opening bracket without a closing
    partner is left untouched.

    Args:
        text: Transcript text.

    Returns:
        The text without the matched spans (surrounding whitespace is kept).
    """
    # Raw strings: '\[' and '\(' in a normal literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)  # remove [stuff that looks like this]
    text = re.sub(r'\(.*?\)', '', text)  # remove (stuff that looks like this)
    return text

def remove_useless(text):
    """Drop speaker labels at the start of a line and subtitle credits.

    Two substitutions are applied in order:

    1. A label of the form ``Word:`` or ``Word word:`` that directly follows a
       line break is removed together with the whitespace character after the
       colon. The line break itself is kept so the previous line's last word
       is not glued to the next line's first word.
    2. ``subtitle by xxxx`` / ``subtitles by xxxx`` is removed.

    Matching is case-sensitive.

    Args:
        text: Transcript text.

    Returns:
        The text with the matched spans removed.
    """
    # The optional second word must be a regex group; escaping the parentheses
    # (as '\(' and '\)') makes them literal characters, so plain "Word: " labels
    # would never match. [ \t] keeps the label on a single line.
    text = re.sub(r'\n\w+(?:[ \t]\w+)?:\s', '\n', text)
    text = re.sub(r'subtitles? by \w+', '', text)  # remove subtitle(s) by xxxx
    return text

In [None]:
# general functions for text pre-processing
def remove_punc(text, chars):
    """Return ``text`` with every occurrence of each item in ``chars`` removed.

    Args:
        text: String to clean.
        chars: Either a string, whose individual characters are removed, or
            any other iterable of substrings to delete (processed in order).

    Returns:
        The cleaned string.
    """
    if isinstance(chars, str):
        # One pass over the text instead of one full scan per character.
        return text.translate(str.maketrans('', '', chars))
    cleaned = text
    for fragment in chars:
        cleaned = cleaned.replace(fragment, '')
    return cleaned

def remove_num(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    kept = [symbol for symbol in text if not symbol.isdigit()]
    return ''.join(kept)

def remove_stopw(text, word_list):
    """Tokenise ``text`` and drop every token that appears in ``word_list``.

    Args:
        text: Text to filter.
        word_list: Iterable of tokens to discard (compared exactly,
            case-sensitively).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the lookup once: membership tests on a list are linear, and this
    # check runs for every token of every transcript.
    blocked = frozenset(word_list)
    return ' '.join(token for token in word_tokenize(text) if token not in blocked)

def lemmatize(text):
    """Lemmatise each space-separated word and drop very short results.

    Args:
        text: Text whose words are separated by single spaces.

    Returns:
        The lemmas longer than two characters, joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # Lemmatise each word once; the result feeds both the length filter and
    # the output instead of being computed twice.
    lemmas = (lemmatizer.lemmatize(word) for word in text.split(' '))
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 2)

In [None]:
def manual_lemmatizer(text):
    """Map a few hand-picked word forms to their base form.

    Only whole words are replaced. Plain substring replacement would also
    rewrite unrelated words that merely contain one of the forms
    (e.g. 'dragon' -> 'drago', 'gone' -> 'goe', 'gotten' -> 'getten').
    Matching is case-sensitive.

    Args:
        text: Text to normalise.

    Returns:
        The text with the listed forms replaced.
    """
    replacements = {
        'got': 'get',
        'gon': 'go',
        'said': 'say',
        'fucking': 'fuck',
        'went': 'go',
        'finding': 'find',
    }
    # \b anchors restrict every alternative to a complete word.
    pattern = r'\b(?:' + '|'.join(map(re.escape, replacements)) + r')\b'
    return re.sub(pattern, lambda match: replacements[match.group(0)], text)

In [None]:
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

## Modifying & applying removal lists 

In [None]:
# Work on a copy so the cleaning steps below leave `df` untouched.
clean_df = df.copy()

### Remove everything in Brackets, Music notes

In [None]:
# Create the cleaned column from the raw transcript with [..] and (..) spans stripped.
clean_df['full_transcript_clean'] = clean_df['full_transcript'].apply(remove_bracketed)

In [None]:
## TODO: Remove ♪ from specific comedians:
# [Bo Burnham]

In [None]:
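# NOTE: when enabling this step, read from 'full_transcript_clean' (not 'full_transcript'),
# otherwise the bracket removal applied above is overwritten:
# clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_music)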

### Lowercase, remove useless regex matches, numbers, stopwords and punctuation
Including specific scraps format

In [None]:
# Show NLTK's English stop-word list for reference.
stopwords.words('english')

In [None]:
# Lower-case the cleaned transcripts; the removal lists used below are lower-case.
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].str.lower()

In [None]:
# Apply remove_useless to every cleaned transcript.
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_useless)

In [None]:
# Extra words removed on top of the NLTK stop words.
# NOTE(review): NLTK's word_tokenize splits contractions (e.g. "do" + "n't"),
# so the three contraction entries may never match a token — confirm.
words_to_remove = [
    'thank',
    'cheering',
    'recorded',
    'applause',
    'laughter',
    'laughing',
    'murmuring',
    'chatter',
    'aired',
    'filmed',
    'ladies',
    'gentlemen',
    "that's",
    "i'm",
    "don't",
]
# Other candidates for removal: 'netflix special', 'full transcript'

In [None]:
# Custom words first, followed by NLTK's English stop words.
stopwords_plus = [*words_to_remove, *stopwords.words('english')]

In [None]:
# Drop digit characters from every cleaned transcript.
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_num)

In [None]:
# Tokenise each transcript and drop every token listed in stopwords_plus.
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_stopw, args=(stopwords_plus,))

In [None]:
# ASCII punctuation plus the typographic quotes, ellipsis, music note and
# pilcrow characters, all of which are deleted from the transcripts.
extra_symbols = '“”‘’…♪¶'
punc_added = string.punctuation + extra_symbols

clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(
    remove_punc, chars=punc_added
)

In [None]:
# Lemmatise with WordNet first, then apply the hand-written replacements.
clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(lemmatize).apply(manual_lemmatizer)

### Inspect the cleaned transcripts (numbers, stopwords and common comedy words removed), then vectorise and model topics

In [None]:
# Preview the first rows of the cleaned frame.
clean_df.head()

In [None]:
# Rows whose raw transcript contains the literal substring 'cazzo'.
# na=False makes missing transcripts non-matching; with `str.find(...) != -1`
# a NaN result compares unequal to -1 and the row would be selected.
clean_df[clean_df['full_transcript'].str.contains('cazzo', regex=False, na=False)]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Bag-of-words over the cleaned transcripts.
t = clean_df['full_transcript_clean'].astype(str)

# Keep terms present in 30%-80% of the documents, capped at 20 features.
vectorizer = CountVectorizer(min_df=0.3, max_df=0.8, max_features=20)
X = vectorizer.fit_transform(t)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
bow_df = pd.DataFrame(X.toarray(), columns=get_names())
bow_df

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# TfidfVectorizer is used below but no earlier cell imports it (NameError on a fresh kernel).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer().fit(clean_df['full_transcript_clean'])

data_vectorized = vectorizer.transform(clean_df['full_transcript_clean'])

# Fixed seed so the fitted topics are reproducible between runs.
lda_model = LatentDirichletAllocation(n_components=3, random_state=1).fit(data_vectorized)

def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <index>:`` line is printed,
    followed by a list of ``(term, weight)`` pairs sorted by descending weight.

    Args:
        model: Fitted topic model exposing ``components_`` (one row per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        n_top_words: Number of terms to show per topic (default 10).

    Returns:
        None; the result is written to standard output.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name when it is not available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    # Resolve the vocabulary once instead of rebuilding it for every term.
    feature_names = get_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        top_indices = topic.argsort()[::-1][:n_top_words]
        # str()/float() keep the printed pairs free of NumPy scalar wrappers.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])
        

# Print the top-weighted terms of each fitted LDA topic.
print_topics(lda_model, vectorizer)

In [None]:
# Tokenise every cleaned transcript.
# NOTE(review): `.astype(str)` stores the text repr of each token list
# (e.g. "['a', 'b']") rather than the list itself — confirm this is intended.
token_df = clean_df['full_transcript_clean'].apply(tokenize).astype(str)
token_df

In [None]:
# Token frequencies over the whole corpus.
# `token_df` holds the string repr of each token list, so splitting it on
# whitespace counts fragments such as "'word'," and "['word',". Tokenise the
# cleaned text directly and count the real tokens instead.
clean_df['full_transcript_clean'].apply(tokenize).explode().value_counts()

In [None]:
# frequent_words = ['like']
# clean_df['full_transcript_clean'] = clean_df['full_transcript_clean'].apply(remove_stopw, args=(frequent_words,))

In [None]:
# Concatenate every cleaned transcript into one space-separated corpus string.
corpus_parts = clean_df['full_transcript_clean'].tolist()
full_transcripts = ' '.join(corpus_parts)

In [None]:
# Length of the combined corpus in characters.
len(full_transcripts)

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` on a new 30x20-inch matplotlib figure with the axes hidden."""
    canvas_size = (30, 20)
    plt.figure(figsize=canvas_size)
    # The word cloud is rendered as an image; ticks and frame add nothing.
    plt.imshow(wordcloud)
    plt.axis("off")

In [None]:
# Word cloud over the combined corpus: fixed seed for a stable layout,
# single words only (no collocations), wordcloud's built-in stop words removed.
cloud_settings = dict(
    width=3000,
    height=2000,
    random_state=1,
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
word_cloud = WordCloud(**cloud_settings).generate(full_transcripts)

In [None]:
# Render the generated word cloud.
plot_cloud(word_cloud)