# Data preprocessing steps:

## 1. Process Rick and Morty subtitles
## 2. Clean data by:
### (I.) lemmatise words
### (II.) remove stopwords
### (III.) clean nonalphabetic words

In [100]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from unidecode import unidecode

import spacy  # For preprocessing
import srt

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

### Enlarge the pandas column width for a better display

In [101]:
# Show up to 800 characters per cell so long dialogue lines are not truncated.
# The fully qualified option name is used: abbreviated names are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 800)
# Do not wrap wide frames across several stacked blocks when printing.
pd.set_option('display.expand_frame_repr', False)

### Obtain subtitle files

In [79]:
import os


def find_subtitle_files(root, extension='.srt', exclude='ipynb_checkpoints'):
    """Return the sorted paths of all subtitle files found beneath ``root``.

    Parameters
    ----------
    root : str
        Directory that is walked recursively.
    extension : str
        Only files whose name ends with this suffix are kept.
    exclude : str
        Paths containing this substring are skipped (notebook checkpoint copies).

    Returns
    -------
    list of str
        Each match joined onto its containing directory, in sorted order.
        The list is empty when nothing matches.
    """
    matches = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if filename.endswith(extension) and exclude not in full_path:
                matches.append(full_path)
    # os.walk yields entries in a filesystem-dependent order; sorting makes the
    # result identical across machines and re-runs.
    return sorted(matches)


path = 'data/rick_and_morty'

files = find_subtitle_files(path)

print(len(files))

31


### Open file as a Pandas dataframe

In [142]:
# Load the Simpsons dialogue dataset from CSV into a DataFrame.
df = pd.read_csv("data/simpsons_dataset.csv")

In [144]:
# Show the column labels of the loaded frame.
df.columns

Index(['raw_character_text', 'spoken_words'], dtype='object')

In [143]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(158314, 2)

In [124]:
# Ten characters with the most non-null `spoken_words` entries, busiest first.
(
    df.groupby(['raw_character_text'])['spoken_words']
    .count()
    .reset_index()
    .sort_values(by='spoken_words', ascending=False)
    .head(10)
)

Unnamed: 0,raw_character_text,spoken_words
2879,Homer Simpson,27850
3879,Marge Simpson,13172
648,Bart Simpson,12995
3567,Lisa Simpson,10756
944,C. Montgomery Burns,3077
4066,Moe Szyslak,2808
5506,Seymour Skinner,2385
4315,Ned Flanders,2056
2599,Grampa Simpson,1802
1193,Chief Wiggum,1790


In [105]:
# Preview the first rows of the raw dialogue data.
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,"The polls will be open from now until the end of recess. Now, just in case any of you have decided to put any thought into this, we'll have our final statements. Martin?"


#### **cleaning**: _Lemmatizes and removes stopwords_
#### **cleaning**: _Removes sentences left with 2 or fewer words after stopword removal (Word2Vec uses context words to learn the vector representation of a target word)_
#### **clean_nonalphabetic**: _Removes words with nonalphabetic characters_

## Lemmatisation example: "dog" and "dogs" are reduced to the same lemma, "dog"

In [136]:
# Load the English pipeline by its full package name: the 'en' shortcut link is
# a spaCy 2.x convenience that spaCy 3 no longer accepts.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (what
# `python -m spacy download en` installs) - confirm for this environment.
# NER and the dependency parser are disabled because the cleaning step only
# reads each token's lemma and stopword flag.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatise a parsed document and drop its stopwords.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``is_stop``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces and stripped of
        surrounding whitespace, or ``None`` when two or fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Sentences this short are discarded by returning None.
    if len(lemmas) <= 2:
        return None

    sentence = ' '.join(lemmas)
    return sentence.strip()

def clean_nonalphabetic(row):
    """Replace non-letter runs with a space and lowercase the result.

    ``row`` is converted with ``str()`` first. Every run of characters other
    than ASCII letters and apostrophes becomes a single space, and the
    substituted text is then lowercased.
    """
    text = str(row)
    letters_only = re.sub("[^A-Za-z']+", ' ', text)
    return letters_only.lower()

### Start the preprocessing procedure, using spaCy as the NLP pipeline to apply a series of algorithms such as:
#### tokenisation, lemmatisation and custom preprocessing functions

In [145]:
t = time()

# Lazily normalise each dialogue line; clean_nonalphabetic already applies
# str() to its argument, so no extra conversion is needed here.
cleaning_data = (clean_nonalphabetic(row) for row in df['spoken_words'])
# Stream the texts through spaCy in batches. The former `n_threads` argument is
# omitted: it is deprecated since spaCy 2.1 and rejected by spaCy 3.
txt = [cleaning(doc) for doc in nlp.pipe(cleaning_data, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.99 mins


#### Prepare a new dataframe with cleansed dialogs

In [139]:
# Build the cleaned frame in one chain: wrap the sentences, drop the entries
# that cleaning() returned as None, then remove repeated sentences.
df_clean = (
    pd.DataFrame({"spoken_words": txt})
    .dropna()
    .drop_duplicates()
)

### Save dataframe

In [140]:
# (rows, columns) left after dropping missing and duplicate sentences.
df_clean.shape

(86834, 1)

In [None]:
# Preview the first rows of the cleaned sentences.
df_clean.head()

In [141]:
# Persist the cleaned sentences; index=False keeps the row index out of the CSV.
df_clean.to_csv("data/processed_simpsons_dataset.csv", index=False)