In [13]:
import requests
from bs4 import BeautifulSoup
import pickle
import pandas as pd

In [4]:
## scraping url

def url_to_transcript(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text
    

##urls for this website
urls = [
    'https://www.artofmanliness.com/we-shall-fight-on-the-beaches-by-winston-churchill/',
    'https://www.artofmanliness.com/inaugrual-address-of-john-f-kennedy/',
    'https://www.artofmanliness.com/first-inaugural-address-of-franklin-d-roosevelt/',
    'https://www.artofmanliness.com/remarks-at-the-brandenburg-gate-by-ronald-regan/',
    'https://www.artofmanliness.com/the-meaning-of-july-fourth-for-the-negro-by-frederick-douglas/',
    'https://www.artofmanliness.com/i-have-a-dream-by-dr-martin-luther-king/', 
    'https://www.artofmanliness.com/duties-of-american-citizenship-by-theodore-roosevelt/',
    'https://www.artofmanliness.com/quit-india-speech-by-ghandi/'
]

orators = ['Churchill', 'jfk', 'fdr', 'reagon', 'fredrick','mlk', 'teddy','ghandi' ]

In [5]:
transcripts = [url_to_transcript(u) for u in urls]

https://www.artofmanliness.com/we-shall-fight-on-the-beaches-by-winston-churchill/
https://www.artofmanliness.com/inaugrual-address-of-john-f-kennedy/
https://www.artofmanliness.com/first-inaugural-address-of-franklin-d-roosevelt/
https://www.artofmanliness.com/remarks-at-the-brandenburg-gate-by-ronald-regan/
https://www.artofmanliness.com/the-meaning-of-july-fourth-for-the-negro-by-frederick-douglas/
https://www.artofmanliness.com/i-have-a-dream-by-dr-martin-luther-king/
https://www.artofmanliness.com/duties-of-american-citizenship-by-theodore-roosevelt/
https://www.artofmanliness.com/quit-india-speech-by-ghandi/


In [6]:
# pickle files for later use

#Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(orators):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)


mkdir: transcripts: File exists


In [7]:
# combine speakers/orators into one list

# load data into dictionary and map to allspeakers
data = {}

for i, orators in enumerate(orators):
    with open("transcripts/" + orators + ".txt", "r") as file:
        data[orators] = pickle.load(file)

In [69]:
data['Churchill']

[u'June 4, 1940',
 u'House of Commons',
 u'From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the second week of May, only a rapid retreat to Amiens and the south could have saved the British and French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was not immediately realized. The French High Command hoped they would be able to close the gap, and the Armies of the north were under their orders. Moreover, a retirement of this kind would have involved almost certainly the destruction of the fine Belgian Army of over 20 divisions and the abandonment of the whole of Belgium. Therefore, when the force and scope of the German penetration were realized and when a new French Generalissimo, General Weygand, assumed command in place of General Gamelin, an effort was made by the French and British Armies in Belgium to keep on holding the right hand of the Belgians and to give their own right hand to a newly cre

## Cleaning Data

In [10]:
# We are going to change this to key: speaker, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [11]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [14]:
#making a corpus here - a collection of texts, and they are all put together neatly in a pandas dataframe 
data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['speech']
data_df = data_df.sort_index()
data_df

Unnamed: 0,speech
Churchill,"June 4, 1940 House of Commons From the moment ..."
fdr,"March 4, 1933 President Hoover, Mr. Chief Just..."
fredrick,"July 5, 1852 Fellow Citizens, I am not wanting..."
ghandi,"August 8, 1942 Before you discuss the resoluti..."
jfk,"January 20, 1961 Vice President Johnson, Mr. S..."
mlk,"August 23, 1968 I am happy to join with you to..."
reagon,"June 12, 1987 Thank you. Thank you, very much...."
teddy,"Buffalo, New York, January 26, 1883 Of course..."


In [17]:
## checking data frame speeches
##data_df.speech.loc['fdr']

In [18]:
import re
import string

In [19]:
def clean_text_round1(text):
    text = text.lower() # 'Make text lowercase
    text = re.sub('\[.*?\]', '', text) # remove text in square brackets
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\w*\d\w*', '', text) #remove words containing numbers

    return text

round1 = lambda x: clean_text_round1(x)

In [22]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.speech.apply(round1))
data_clean

Unnamed: 0,speech
Churchill,june house of commons from the moment that t...
fdr,march president hoover mr chief justice my f...
fredrick,july fellow citizens i am not wanting in res...
ghandi,august before you discuss the resolution let...
jfk,january vice president johnson mr speaker mr...
mlk,august i am happy to join with you today in ...
reagon,june thank you thank you very much chancello...
teddy,buffalo new york january of course in one s...


In [23]:
# Apply a second round of cleaning
def clean_text_round2(text):
    #Get rid of some additional punctuation and non-sensical text that was missed the first time around.
    text = re.sub('[‘’“”…]', '', text) # 
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [25]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.speech.apply(round2))
data_clean

Unnamed: 0,speech
Churchill,june house of commons from the moment that t...
fdr,march president hoover mr chief justice my f...
fredrick,july fellow citizens i am not wanting in res...
ghandi,august before you discuss the resolution let...
jfk,january vice president johnson mr speaker mr...
mlk,august i am happy to join with you today in ...
reagon,june thank you thank you very much chancello...
teddy,buffalo new york january of course in one s...


## Organizing The Data

In [26]:
data_df

Unnamed: 0,speech
Churchill,"June 4, 1940 House of Commons From the moment ..."
fdr,"March 4, 1933 President Hoover, Mr. Chief Just..."
fredrick,"July 5, 1852 Fellow Citizens, I am not wanting..."
ghandi,"August 8, 1942 Before you discuss the resoluti..."
jfk,"January 20, 1961 Vice President Johnson, Mr. S..."
mlk,"August 23, 1968 I am happy to join with you to..."
reagon,"June 12, 1987 Thank you. Thank you, very much...."
teddy,"Buffalo, New York, January 26, 1883 Of course..."


In [27]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

## Document-Term Matrix

The text must be tokenized, meaning broken down into smaller pieces. The most common tokenization technique is to break down text into words. We can do this using scikit-learn's CountVectorizer, where every row will represent a different document and every column will represent a different word.

In addition, with CountVectorizer, we can remove stop words. Stop words are common words that add no additional meaning to text such as 'a', 'the', etc.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.speech)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abandon,abandonment,abbeville,abdicated,abiding,ability,able,abler,abolish,abolitionists,...,yelling,yes,yesterday,yielded,yielding,yoke,york,young,youth,zion
Churchill,0,1,1,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,2,1,0
fdr,0,1,0,1,0,1,0,0,0,0,...,0,2,0,0,0,0,0,1,0,0
fredrick,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,0,2
ghandi,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
jfk,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
mlk,0,0,0,0,0,0,8,0,0,0,...,0,1,0,0,0,0,2,0,0,0
reagon,0,0,0,0,1,0,1,0,0,0,...,0,4,0,0,0,0,0,4,1,0
teddy,1,0,0,0,0,1,5,1,0,0,...,1,0,0,1,1,0,6,6,1,0


In [32]:
# pickle data matrix for later use
data_dtm.to_pickle("dtm.pkl")

In [33]:
data_clean.to_pickle('data_clean.pk1')
pickle.dump(cv, open("cv.pkl", "wb"))