In [37]:
import re
from operator import itemgetter
from pathlib import Path
from time import time

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Download the screenplay page with requests.
url = "https://imsdb.com/scripts/Godfather.html"
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of silently parsing
# an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(markup = html_content, features = "html.parser")

# Collect every <b> element inside the nested <pre> tags of the second <table>;
# these hold the title lines, scene headings and speaker names.
all_bs = soup.body.find_all('table')[1].tr.pre.pre.find_all('b')

In [3]:
all_bs

[<b>	THE GODFATHER
 </b>,
 <b>	_____________
 </b>,
 <b>	MARIO PUZO
 </b>,
 <b>	FRANCIS FORD COPPOLA
 </b>,
 <b>THIRD DRAFT				PARAMOUNT PICTURES
 </b>,
 <b>	INT DAY: DON'S OFFICE (SUMMER 1945)
 </b>,
 <b>				  THE GODFATHER
 </b>,
 <b>				BONASERA
 </b>,
 <b>				BONASERA
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				BONASERA
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>	EXT DAY: MALL (SUMMER 1945)
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				SONNY
 </b>,
 <b>				DON CORLEONE
 </b>,
 <b>				SONNY
 </b>,
 <b>				SANDRA
 </b>,
 <b>	EXT DAY: MALL ENTRANCE (SUMMER 1945)
 </b>,
 <b>				CLEMENZA
 </b>,
 <b>				PAULIE
 </b>,
 <b>				CLEMENZA
 </b

In [4]:
# Pair each bold heading with the node that immediately follows it in the markup.
# That node is usually the text under the heading, but it can also be another
# tag (e.g. when two <b> elements are adjacent); it is stored unchanged here.
raw_character_text_list = [bold_tag.text for bold_tag in all_bs]
spoken_words_list = [bold_tag.next_sibling for bold_tag in all_bs]

In [5]:
# Assemble the scraped headings and the text that follows them into a two-column table.
gf_df = pd.DataFrame(
    data=dict(
        raw_character_text=raw_character_text_list,
        spoken_words=spoken_words_list,
    )
)

# Quick sanity check: row count, dtypes and non-null counts.
gf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_character_text  1191 non-null   object
 1   spoken_words        1191 non-null   object
dtypes: object(2)
memory usage: 18.7+ KB


In [6]:
gf_df.head()

Unnamed: 0,raw_character_text,spoken_words
0,\tTHE GODFATHER\r\n,[\t_____________\r\n]
1,\t_____________\r\n,\r\n\tScreenplay\r\n\r\n\tby\r\n\r\n
2,\tMARIO PUZO\r\n,\r\n\tand\r\n\r\n
3,\tFRANCIS FORD COPPOLA\r\n,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r...
4,THIRD DRAFT\t\t\t\tPARAMOUNT PICTURES\r\n,\t\t\t\t\t1 Gulf and Western Plaza\r\nMarch 29...


In [7]:
gf_df.loc[8, 'spoken_words']

'\t\tI raised my daughter in the American\r\n\t\tfashion; I gave her freedom, but\r\n\t\ttaught her never to dishonor her\r\n\t\tfamily.  She found a boy friend,\r\n\t\tnot an Italian.  She went to the\r\n\t\tmovies with him, stayed out late.\r\n\t\tTwo months ago he took her for a\r\n\t\tdrive, with another boy friend.\r\n\t\tThey made her drink whiskey and\r\n\t\tthen they tried to take advantage\r\n\t\tof her.  She resisted; she kept her\r\n\t\thonor.  So they beat her like an\r\n\t\tanimal.  When I went to the hospital\r\n\t\ther nose was broken, her jaw was\r\n\t\tshattered and held together by\r\n\t\twire, and she could not even weep\r\n\t\tbecause of the pain.\r\n\r\n\tHe can barely speak; he is weeping now.\r\n\r\n'

In [8]:
def remove_non_alphabetic_characters(row):
    """Flatten layout characters in a single value and lower-case it.

    Despite the name, only newlines, carriage returns, tabs, underscores and
    semicolons are affected: each run of them is replaced by one space. Every
    other character (digits, punctuation, existing spaces) is kept.

    Args:
        row: The value to clean. It is converted with ``str()`` first, so
            non-string objects are accepted.

    Returns:
        The lower-cased text. It is not stripped, so leading or trailing
        spaces may remain.
    """
    as_text = str(row)
    flattened = re.sub(pattern="[\n\r\t_;]+", repl=" ", string=as_text)
    return flattened.lower()

In [9]:
# Clean both text columns with a column-wise map rather than writing cell by
# cell through .loc inside an iterrows() loop: that pattern is slow and modifies
# the frame while it is being iterated.
for column in ('raw_character_text', 'spoken_words'):
    gf_df[column] = gf_df[column].map(remove_non_alphabetic_characters)

In [10]:
gf_df.head()

Unnamed: 0,raw_character_text,spoken_words
0,the godfather,<b> </b>
1,,screenplay by
2,mario puzo,and
3,francis ford coppola,
4,third draft paramount pictures,"1 gulf and western plaza march 29, 1971 new y..."


In [11]:
gf_df.loc[8, 'spoken_words']

' i raised my daughter in the american fashion  i gave her freedom, but taught her never to dishonor her family.  she found a boy friend, not an italian.  she went to the movies with him, stayed out late. two months ago he took her for a drive, with another boy friend. they made her drink whiskey and then they tried to take advantage of her.  she resisted  she kept her honor.  so they beat her like an animal.  when i went to the hospital her nose was broken, her jaw was shattered and held together by wire, and she could not even weep because of the pain. he can barely speak  he is weeping now. '

In [12]:
# Rank speakers by how many headings carry their name and keep the 15 most frequent.
speaker_line_counts = gf_df['raw_character_text'].value_counts()
gf_main_chracters = speaker_line_counts.index[:15].to_list()
print(gf_main_chracters)

[' michael ', ' don corleone ', ' sonny ', ' hagen ', ' kay ', ' clemenza ', ' sollozzo ', ' connie ', ' fabrizzio ', ' bonasera ', ' carlo ', ' tessio ', ' fredo ', ' paulie ', ' mama ']


In [13]:
# Container for the dialogue rows spoken by the most frequent characters.
gf_main_characters_df = pd.DataFrame()

In [14]:
# Select the main characters' rows with one boolean mask. Growing a DataFrame
# cell by cell through .loc enlargement is quadratic and leaves a column-less
# frame when nothing matches; the mask gives the same rows, keeps the original
# index labels and always carries both columns.
is_main_character = gf_df['raw_character_text'].isin(gf_main_chracters)
gf_main_characters_df = gf_df.loc[is_main_character, ['raw_character_text', 'spoken_words']].copy()

In [15]:
# Renumber the rows 0..n-1, discarding the index labels inherited from gf_df.
gf_main_characters_df = gf_main_characters_df.reset_index(drop=True)

In [16]:
gf_main_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 738 entries, 0 to 737
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_character_text  738 non-null    object
 1   spoken_words        738 non-null    object
dtypes: object(2)
memory usage: 11.7+ KB


In [17]:
gf_main_characters_df['raw_character_text'].value_counts()

 michael          204
 don corleone     103
 sonny             94
 hagen             85
 kay               57
 clemenza          42
 sollozzo          33
 connie            19
 fabrizzio         18
 bonasera          17
 carlo             16
 fredo             14
 tessio            14
 paulie            11
 mama              11
Name: raw_character_text, dtype: int64

In [18]:
gf_main_characters_df.head()

Unnamed: 0,raw_character_text,spoken_words
0,bonasera,"america has made my fortune. as he speaks, th..."
1,bonasera,i raised my daughter in the american fashion ...
2,bonasera,i went to the police like a good american. t...
3,don corleone,"bonasera, we know each other for years, but t..."
4,bonasera,what do you want of me? i'll give you anythi...


In [19]:
gf_main_characters_df['spoken_words'].values

array([' america has made my fortune. as he speaks, the view imperceptibly begins to loosen. ',
       ' i raised my daughter in the american fashion  i gave her freedom, but taught her never to dishonor her family.  she found a boy friend, not an italian.  she went to the movies with him, stayed out late. two months ago he took her for a drive, with another boy friend. they made her drink whiskey and then they tried to take advantage of her.  she resisted  she kept her honor.  so they beat her like an animal.  when i went to the hospital her nose was broken, her jaw was shattered and held together by wire, and she could not even weep because of the pain. he can barely speak  he is weeping now. ',
       " i went to the police like a good american.  these two boys were arrested and brought to trial.  the judge sentenced them to three years in prison, and suspended the sentence.  suspended sentence! they went free that very day.  i stood in the courtroom like a fool, and those bastards,

In [20]:
# Count the missing values in each column
null_counts = gf_main_characters_df.isna().sum()
print(null_counts)

raw_character_text    0
spoken_words          0
dtype: int64


In [23]:
# Load the large English spaCy model used to lemmatize each line of dialogue and
# flag stop words. The dependency parser and named-entity recognizer are
# disabled; the cleaning step only reads lemmas and stop-word flags.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def cleaning(doc):
    """Reduce a parsed line to the lemmas of its non-stop-word tokens.

    Args:
        doc: An iterable of tokens exposing ``lemma_`` and ``is_stop``
            (e.g. a spaCy ``Doc``).

    Returns:
        The kept lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [24]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']


In [30]:
# Replace every run of characters other than ASCII letters and apostrophes with
# a single space, then lower-case. This is a list rather than a generator so it
# survives more than one pass: a generator would be exhausted after the first
# nlp.pipe() run and silently produce no documents if that cell were re-run.
brief_cleaning = [re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in gf_main_characters_df['spoken_words']]

In [31]:
# Stream the texts through spaCy in batches with nlp.pipe() and time the whole pass.
t = time()

txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=100)))

elapsed_minutes = round((time() - t) / 60, 3)
print(f"Time to clean up everything : {elapsed_minutes} mins")

Time to clean up everything : 0.024 mins


In [34]:
# Put the cleaned lines in a DataFrame, then drop the entries for which
# cleaning() returned None and any exact duplicates.
# The chain starts from the freshly built frame: the earlier version read from
# `df_clean`, a name not defined anywhere above, which raises NameError on a
# fresh kernel.
gf_df_clean = (
    pd.DataFrame({'clean' : txt})
    .dropna()
    .drop_duplicates()
)
gf_df_clean.shape

(631, 1)

In [35]:
# Create the output folder first; to_csv() does not create missing directories.
output_path = Path('data/gf_df_clean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
gf_df_clean.to_csv(output_path, index=False)

In [36]:
gf_df_clean.head()

Unnamed: 0,clean
0,america fortune speak view imperceptibly beg...
1,raise daughter american fashion give freedom...
2,go police like good american boy arrest brin...
3,bonasera know year time come help remember t...
4,want want ask
