In [1]:
# To download a .txt file from a URL in Python, you can use the `requests` library.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [3]:
# This function takes the URL of the .txt file and the desired path as input.
# It sends a GET request to the URL using `requests.get(ulr)`. If the response status code is 200, it indicates a successful request.
# It saves the response content to the specified file paht using a binary write mode(`wb`).
# Finally, it prints a success message or a failure message based on the result.
# Make sure to replace the `url` and `save_path` variable with the actual URL of the .txt you want to download and the desired save path on your system.

def download_txt_file(url, save_path):
    response = requests.get(url)
    
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print("File downloaded successfully!")
        
    else:
        print("Failed to download the file.")

In [4]:
url = "http://www.awesomefilm.com/script/THEGODFATHER.txt"
save_path = "data/theGodFather.txt"

download_txt_file(url, save_path)

File downloaded successfully!


In [37]:
# To parse an HTML web page, you can use various programming languages and libraries, such as Python with BeautifulSoup or lxml, JavaScript with Cheerio.
# Send a request to the web page and retrieve its HTML content
url = "https://imsdb.com/scripts/Godfather.html"
response = requests.get(url)
html_content = response.content

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Use BeautifulSoup's methods to extract the desired information from the HTML structure.
# For example, you can find elements by their tags, classes, IDs, or other attributes.
# Find the <pre> tag
scripts = soup.find_all("b")

In [38]:
# Extract the text content of the <br> tag.
raw_character_text_list = []
spoken_words_list = []

for index, s in enumerate(scripts):
    raw_character_text_list.append(s.text)
    spoken_words_list.append(s.next_sibling)   

In [39]:
godfather_df = pd.DataFrame({"raw_character_text" : raw_character_text_list, "spoken_words" : spoken_words_list})

godfather_df

Unnamed: 0,raw_character_text,spoken_words
0,\tTHE GODFATHER\r\n,[\t_____________\r\n]
1,\t_____________\r\n,\r\n\tScreenplay\r\n\r\n\tby\r\n\r\n
2,\tMARIO PUZO\r\n,\r\n\tand\r\n\r\n
3,\tFRANCIS FORD COPPOLA\r\n,\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r...
4,THIRD DRAFT\t\t\t\tPARAMOUNT PICTURES\r\n,\t\t\t\t\t1 Gulf and Western Plaza\r\nMarch 29...
...,...,...
1188,\t\t\t\tCLEMENZA\r\n,\t\tDon Corleone...\r\n\r\n\tThe smile fades f...
1189,\tINT DAY: CHURCH (1955)\r\n,\r\n\tKAY wears a shawl over her hand. She dr...
1190,\t\t\t\t\tTHE END\r\n,\r\n\r\n\r\n\r\n
1191,Writers,:


In [40]:
godfather_df.loc[8, 'spoken_words']

'\t\tI raised my daughter in the American\r\n\t\tfashion; I gave her freedom, but\r\n\t\ttaught her never to dishonor her\r\n\t\tfamily.  She found a boy friend,\r\n\t\tnot an Italian.  She went to the\r\n\t\tmovies with him, stayed out late.\r\n\t\tTwo months ago he took her for a\r\n\t\tdrive, with another boy friend.\r\n\t\tThey made her drink whiskey and\r\n\t\tthen they tried to take advantage\r\n\t\tof her.  She resisted; she kept her\r\n\t\thonor.  So they beat her like an\r\n\t\tanimal.  When I went to the hospital\r\n\t\ther nose was broken, her jaw was\r\n\t\tshattered and held together by\r\n\t\twire, and she could not even weep\r\n\t\tbecause of the pain.\r\n\r\n\tHe can barely speak; he is weeping now.\r\n\r\n'

In [41]:
def remove_non_alphabetic_characters(row):
    cleaning_text = re.sub("[\n\r\t_]+", " ", str(row)).lower()
    
    return cleaning_text

In [42]:
for i, row in godfather_df.iterrows():
    print(i, row)
    godfather_df.loc[i, 'raw_character_text'] = remove_non_alphabetic_characters(row['raw_character_text'])
    godfather_df.loc[i, 'spoken_words'] = remove_non_alphabetic_characters(row['spoken_words'])

0 raw_character_text      \tTHE GODFATHER\r\n
spoken_words          [\t_____________\r\n]
Name: 0, dtype: object
1 raw_character_text                     \t_____________\r\n
spoken_words          \r\n\tScreenplay\r\n\r\n\tby\r\n\r\n
Name: 1, dtype: object
2 raw_character_text     \tMARIO PUZO\r\n
spoken_words          \r\n\tand\r\n\r\n
Name: 2, dtype: object
3 raw_character_text                           \tFRANCIS FORD COPPOLA\r\n
spoken_words          \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r...
Name: 3, dtype: object
4 raw_character_text            THIRD DRAFT\t\t\t\tPARAMOUNT PICTURES\r\n
spoken_words          \t\t\t\t\t1 Gulf and Western Plaza\r\nMarch 29...
Name: 4, dtype: object
5 raw_character_text            \tINT DAY: DON'S OFFICE (SUMMER 1945)\r\n
spoken_words          \r\n\tThe PARAMOUNT Logo is presented austerel...
Name: 5, dtype: object
6 raw_character_text                          \t\t\t\t  THE GODFATHER\r\n
spoken_words          \r\n\tWhile this remains, we hear: "

In [43]:
godfather_df

Unnamed: 0,raw_character_text,spoken_words
0,the godfather,<b> </b>
1,,screenplay by
2,mario puzo,and
3,francis ford coppola,
4,third draft paramount pictures,"1 gulf and western plaza march 29, 1971 new y..."
...,...,...
1188,clemenza,don corleone... the smile fades from kay's fa...
1189,int day: church (1955),kay wears a shawl over her hand. she drops m...
1190,the end,
1191,writers,:


In [44]:
godfather_df.loc[8, "spoken_words"]

' i raised my daughter in the american fashion; i gave her freedom, but taught her never to dishonor her family.  she found a boy friend, not an italian.  she went to the movies with him, stayed out late. two months ago he took her for a drive, with another boy friend. they made her drink whiskey and then they tried to take advantage of her.  she resisted; she kept her honor.  so they beat her like an animal.  when i went to the hospital her nose was broken, her jaw was shattered and held together by wire, and she could not even weep because of the pain. he can barely speak; he is weeping now. '

In [54]:
godfather_characters = godfather_df['raw_character_text'].value_counts().to_frame("count").index[:15].values.tolist()
print(godfather_characters)

[' michael ', ' don corleone ', ' sonny ', ' hagen ', ' kay ', ' clemenza ', ' sollozzo ', ' connie ', ' fabrizzio ', ' bonasera ', ' carlo ', ' tessio ', ' fredo ', ' paulie ', ' mama ']


In [73]:
godfather_main_characters_df = pd.DataFrame()

In [74]:
for i, row in godfather_df.iterrows():
    if row["raw_character_text"] in godfather_characters:
        godfather_main_characters_df.loc[i, "raw_character_text"] = row["raw_character_text"]
        godfather_main_characters_df.loc[i, "spoken_words"] = row["spoken_words"]

In [75]:
godfather_main_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 738 entries, 7 to 1188
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   raw_character_text  738 non-null    object
 1   spoken_words        738 non-null    object
dtypes: object(2)
memory usage: 33.5+ KB


In [76]:
godfather_main_characters_df["raw_character_text"].value_counts()

 michael          204
 don corleone     103
 sonny             94
 hagen             85
 kay               57
 clemenza          42
 sollozzo          33
 connie            19
 fabrizzio         18
 bonasera          17
 carlo             16
 fredo             14
 tessio            14
 paulie            11
 mama              11
Name: raw_character_text, dtype: int64

In [77]:
godfather_main_characters_df = godfather_main_characters_df.reset_index(inplace=False, drop=True)

In [78]:
godfather_main_characters_df.head()

Unnamed: 0,raw_character_text,spoken_words
0,bonasera,"america has made my fortune. as he speaks, th..."
1,bonasera,i raised my daughter in the american fashion;...
2,bonasera,i went to the police like a good american. t...
3,don corleone,"bonasera, we know each other for years, but t..."
4,bonasera,what do you want of me? i'll give you anythi...
