In [1]:
import re    # for preprocessing
import pandas as pd    # for data handling
from time import time    # to time our operations
from collections import defaultdict    # for word frequency

import spacy    # for preprocessing

import logging    # set up logging to monitor gensim
# Define the format of the log message: log level, timestamp, then the actual log message
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)

In [23]:
# Load the Simpsons dialogue dataset and display its (rows, columns) shape
df = pd.read_csv('data/simpsons_dataset.csv')
df.shape

(158314, 2)

In [24]:
# Preview the first rows: one character name and one spoken line per row
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [25]:
# Number of dialogue lines and the raw array of spoken lines (nulls not yet removed)
len(df['spoken_words'].values), df['spoken_words'].values

(158314,
 array(["No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it.",
        "Where's Mr. Bergstrom?",
        "I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?",
        ..., 'Psy-cho-so-ma-tic.', 'Does that mean you were crazy?',
        'No, that means she was faking it.'], dtype=object))

In [26]:
# Check how many values are null in each column
print(df.isnull().sum())

# Drop every row that has a null in any column, then renumber the index from 0
df = df.dropna().reset_index(drop=True)

print(df.shape)

raw_character_text    17814
spoken_words          26459
dtype: int64
(131853, 2)


In [6]:
# In spaCy, a pipeline refers to a series of processing components that are applied to a text document sequentially. 
# Each component in the pipeline performs a specific task, such as tokenization, part-of-speech tagging, syntactic parsing, named entity recognition, and so on. 
# The output of one component serves as the input to the next component in the pipeline.
# When you load a spaCy model, it comes with a default pipeline of processing components that are applied to the text. 
# However, you can customize the pipeline by adding, removing, or modifying the components according to your specific requirements.
# The pipeline in spaCy is designed to be efficient and allows for fast processing of large volumes of text. 
# It takes advantage of the processing capabilities of spaCy's underlying machine learning models and linguistic data structures.
# You can access the current pipeline components of a loaded spaCy model using the nlp.pipe_names attribute. 
# Similarly, you can add or remove components in the pipeline using the nlp.add_pipe() and nlp.remove_pipe() methods, respectively.
# 

In [6]:
# spaCy is a popular Python library used for natural language processing (NLP) tasks.
# It provides a lemmatization component that converts words to their base or dictionary form, known as lemmas.
# The spaCy lemmatizer is built on linguistic rules and utilizes contextual information to determine the appropriate lemma for a given word.
# Here's an example of how to use spaCy's lemmatizer
import spacy
nlp = spacy.load(name='en_core_web_sm')      # Load the small English language model
txt = "I am running along Tancheon which is a small and narrow river for about two hours everyday, praying Christ Jesus to work inside me and help recover from my illness."

# Run the pipeline on the text; the resulting Doc holds the individual tokens
doc = nlp(txt)

# Print each token next to its lemma
for token in doc:
    print(token.text, token.lemma_)

I I
am be
running run
along along
Tancheon Tancheon
which which
is be
a a
small small
and and
narrow narrow
river river
for for
about about
two two
hours hour
everyday everyday
, ,
praying pray
Christ Christ
Jesus Jesus
to to
work work
inside inside
me I
and and
help help
recover recover
from from
my my
illness illness
. .


In [8]:
# nlp() returns a spaCy Doc object; printing it shows the original text
print(type(doc))
print(doc)

<class 'spacy.tokens.doc.Doc'>
I am running along Tancheon which is a small and narrow river for about two hours everyday, praying Christ Jesus to work inside me and help recover from my illness.


In [27]:
# Cleaning
# Each line of dialogue is lemmatized and stripped of stopwords and non-alphabetic characters
# Load the large English language model
nlp = spacy.load(name='en_core_web_lg', disable=['parser', 'ner'])     # disable named entity recognition and the syntactic parser for speed

def cleaning(doc):
    """Return the lemmatized, stopword-free text of ``doc``, or ``None``.

    ``doc`` must be a spaCy ``Doc`` (any iterable of tokens exposing
    ``lemma_`` and ``is_stop`` behaves the same way). Every token that is
    not a stopword contributes its lemma; the lemmas are joined with a
    single space.

    Word2Vec learns a word's vector from its surrounding context words, so
    a line with only one or two remaining lemmas adds very little to
    training. Such lines produce ``None``.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to provide useful context for Word2Vec
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [28]:
# Inspect the components of the currently loaded pipeline using nlp.pipe_names.
# The commented-out lines below are illustrations only and are not executed:
# they show how a component such as "sentencizer" could be added with nlp.add_pipe(),
# and how a component such as "ner" could be removed with nlp.remove_pipe().

# Print the current pipeline components
print(nlp.pipe_names)

# Example: add a new component called sentencizer before tagger
# nlp.add_pipe("sentencizer", before='tagger')

# Example: remove a component from the pipeline
# nlp.remove_pipe("ner")

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']


In [29]:
# Replace every run of characters other than letters and apostrophes with a space, then lowercase.
# Built as a list rather than a one-shot generator: a generator would be exhausted after the first
# pass, so re-running the cell that consumes it would silently produce an empty corpus.
brief_cleaning = [re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in df['spoken_words']]

In [11]:
# nlp.pipe() is spaCy's efficient way to run a language model over a large number of texts.
# It takes an iterable of texts, processes them as a stream in batches, and yields one Doc per text
# in the original order. Multiprocessing is opt-in through the n_process argument.

import spacy

# Load the small model under its own name. Rebinding `nlp` here would replace the
# 'en_core_web_lg' pipeline (parser and NER disabled) that the cleaning step relies on
# whenever the notebook is run from top to bottom.
nlp_demo = spacy.load("en_core_web_sm")

# Define a list of texts to process
texts = ["This is the first text.", "Here comes the second text.", "And finally, the third text."]

# Process the texts using pipe(); list() consumes the generator it returns
docs = list(nlp_demo.pipe(texts))
print(docs)
print()

# Access the processed documents
for doc in docs:
    # Show the text, then each token with its part-of-speech tag and lemma
    print(doc.text)
    for token in doc:
        print(token.text, token.pos_, token.lemma_)
    print()

[This is the first text., Here comes the second text., And finally, the third text.]

This is the first text.
This PRON this
is AUX be
the DET the
first ADJ first
text NOUN text
. PUNCT .

Here comes the second text.
Here ADV here
comes VERB come
the DET the
second ADJ second
text NOUN text
. PUNCT .

And finally, the third text.
And CCONJ and
finally ADV finally
, PUNCT ,
the DET the
third ADJ third
text NOUN text
. PUNCT .



In [30]:
# Use spaCy's nlp.pipe() method to process the lines in batches, which speeds up the cleaning
t = time()

# cleaning() returns None for lines left with two or fewer lemmas, so txt can contain None entries
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print(f'Time to clean up everything : {round((time() - t) / 60, 3)} mins')

Time to clean up everything : 1.566 mins


In [31]:
# Put the results in a DataFrame, then drop the missing entries (None from cleaning) and duplicate lines
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85952, 1)

In [32]:
# Preview the cleaned lines; the index has gaps where rows were dropped
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


##### Bigrams

In [34]:
# We are using Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
from gensim.models.phrases import Phrases, Phraser

In [36]:
# Phrases() takes a list of lists of words as input, so split each cleaned line on whitespace
sent = [row.split() for row in df_clean['clean']]

# Sanity check: one token list per cleaned line, plus a sample sentence
print(len(sent), len(df_clean['clean']))
print(sent[1000])

85952 85952
['hey', 'bartle', 'eeboobely', 'care', 'steak', 'rooney']


In [38]:
# Collect the phrase (bigram) statistics from the list of sentences.
# min_count=30 ignores words and bigrams counted fewer than 30 times;
# progress_per=10000 logs a progress line every 10000 sentences.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 20:19:15: collecting all words and their counts
INFO - 20:19:15: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 20:19:15: PROGRESS: at sentence #10000, processed 63557 words and 52676 word types
INFO - 20:19:15: PROGRESS: at sentence #20000, processed 130938 words and 99581 word types
INFO - 20:19:15: PROGRESS: at sentence #30000, processed 192957 words and 138111 word types
INFO - 20:19:15: PROGRESS: at sentence #40000, processed 249830 words and 172098 word types
INFO - 20:19:15: PROGRESS: at sentence #50000, processed 311271 words and 207908 word types
INFO - 20:19:15: PROGRESS: at sentence #60000, processed 373578 words and 242908 word types
INFO - 20:19:15: PROGRESS: at sentence #70000, processed 436426 words and 277808 word types
INFO - 20:19:15: PROGRESS: at sentence #80000, processed 497900 words and 310884 word types
INFO - 20:19:15: collected 329570 token types (unigram + bigrams) from a corpus of 537081 words and 85952 sentences
INFO - 20:19:15: m

In [39]:
# Phraser() cuts down the memory consumption of Phrases() by discarding model state
# that is not strictly needed for the bigram detection task
bigram = Phraser(phrases)

INFO - 20:22:53: exporting phrases from Phrases<329570 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 20:22:54: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<127 phrases, min_count=30, threshold=10.0> from Phrases<329570 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.68s', 'datetime': '2023-06-01T20:22:54.590879', 'gensim': '4.3.1', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [43]:
# Transform the corpus based on the bigrams detected
sentences = bigram[sent]

# The transformed corpus has one entry per input sentence; show a sample
print(len(sentences))
print(sentences[1000])

85952
['hey', 'bartle', 'eeboobely', 'care', 'steak', 'rooney']


In [44]:
# Most Frequent Words
# Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and the addition of bigrams.
word_freq = defaultdict(int)

# The loop variable is deliberately not named `sent`: that name holds the tokenized corpus
# that the bigram transform reads, and rebinding it here would leave it pointing at a single
# sentence, corrupting any re-run of the transform cell.
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1

# Number of distinct tokens (unigrams and detected bigrams)
print(len(word_freq))

29676


In [47]:
# Display the raw token -> count mapping
word_freq

defaultdict(int,
            {'actually': 422,
             'little': 2101,
             'disease': 45,
             'magazine': 122,
             'news': 249,
             'show': 214,
             'natural': 77,
             'think': 3593,
             'know': 4821,
             'sure': 1198,
             'like': 5598,
             'talk': 937,
             'touch': 192,
             'lesson': 162,
             'plan': 302,
             'teach': 324,
             'life': 1222,
             'worth': 141,
             'live': 768,
             'poll': 20,
             'open': 420,
             'end': 464,
             'recess': 10,
             'case': 215,
             'decide': 134,
             'thought': 120,
             'final': 105,
             'statement': 21,
             'martin': 120,
             'victory': 30,
             'party': 421,
             'slide': 48,
             'mr': 797,
             'bergstrom': 17,
             'hey': 3620,
             'move': 165,
     

In [51]:
# The ten most frequent tokens, in descending order of count
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [65]:
import operator
# First ten (token, count) pairs when sorted by token in ascending string order
word_freq_by_key = sorted(word_freq.items(), key=operator.itemgetter(0), reverse=False)[:10]
print(word_freq_by_key)

[("'", 2779), ("'_n", 53), ("'_til", 82), ("'_tis", 35), ("'bout", 168), ("'cause", 470), ("'cuz", 7), ("a'hind", 1), ("a'ight", 2), ("a'twain", 1)]


In [63]:
# The ten (token, count) pairs with the highest counts (uses the `operator` module imported in the previous cell)
word_freq_by_value = sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)[:10]
print(word_freq_by_value)

[('oh', 6453), ('like', 5598), ('know', 4821), ('get', 4213), ('hey', 3620), ('think', 3593), ('come', 3583), ('right', 3411), ('look', 3382), ('want', 3181)]
