In [1]:
import re    # for preprocessing
import pandas as pd    # for data handling
from time import time    # to time our operations
from collections import defaultdict    # for word frequency

import spacy    # for preprocessing

import logging    # setting up the loggings to mointor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('data/simpsons_dataset.csv')
df.shape

(158314, 2)

In [4]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
# Check how many lines are null
print(df.isnull().sum())

# remove null values
df = df.dropna().reset_index(drop=True)

print(df.shape)

raw_character_text    17814
spoken_words          26459
dtype: int64
(131853, 2)


In [6]:
# In spaCy, a pipeline refers to a series of processing components that are applied to a text document sequentially. 
# Each component in the pipeline performs a specific task, such as tokenization, part-of-speech tagging, syntactic parsing, named entity recognition, and so on. 
# The output of one component serves as the input to the next component in the pipeline.
# When you load a spaCy model, it comes with a default pipeline of processing components that are applied to the text. 
# However, you can customize the pipeline by adding, removing, or modifying the components according to your specific requirements.
# The pipeline in spaCy is designed to be efficient and allows for fast processing of large volumes of text. 
# It takes advantage of the processing capabilities of spaCy's underlying machine learning models and linguistic data structures.
# You can access the current pipeline components of a loaded spaCy model using the nlp.pipe_names attribute. 
# Similarly, you can add or modify components in the pipeline using the nlp.add_pipe() or nlp.remove_pipe() methods respectively.
# 

In [15]:
# cleaning
# We are lemmatizing and removing the stopwords and non-alphabetic characters from each line of dialogues
# Load the English Language Model
nlp = spacy.load(name='en_core_web_lg')     # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmaitizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representations of a target word,
    # if a sentence is only one or two words long, the benefit for training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [16]:
# Print the current pipeline components
print(nlp.pipe_names)

# Add a new component called sentencizer before tagger
nlp.add_pipe("sentencizer", before='tagger')

# Remove a component from the pipeline
nlp.remove_pipe("lemmatizer")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
['tok2vec', 'sentencizer', 'tagger', 'parser', 'attribute_ruler', 'ner']


In [7]:
# Remove non-alphabetic characters
brief_cleaning = (re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in df['spoken_words'])

In [8]:
# Take advantage of spaCy.pipe() attribute to speed up the cleaning process
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print(f'Time to clean up everything : {round(time() - t) / 60, 2} mins')

Time to clean up everything : (1.5, 2) mins


In [10]:
# Put the results in a DataFrame to remove missing values and duplicates:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85955, 1)

In [11]:
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide
