# Web Scraping & Regular Expression & Pandas!!!

## Looking at A LOT of text, and breaking it up

### MLK Jr. Speech - which words he used the most, with a count for each word

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Source page that hosts the speech text.
url = 'http://www.analytictech.com/mb021/mlk.htm'

# Fetch the page. The timeout keeps this cell from hanging forever if the
# server never answers, and raise_for_status() stops here on a 4xx/5xx reply
# instead of letting an error page be parsed as if it were the speech.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the returned HTML with Python's built-in parser.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Preview only the start of the parsed document; printing the whole page
# floods the notebook output and buries the cells that follow.
print(str(soup)[:1000])

In [5]:
# Collect every <p> element found in the parsed page.
mlkjr_speech = soup.find_all('p')

In [8]:
# Take the text of each paragraph and trim the whitespace at both ends.
speech_combined = [paragraph.text.strip() for paragraph in mlkjr_speech]
print(speech_combined)

# strip() only tidies the ends of each paragraph, so the output is cleaner
# but not finished yet: the punctuation (commas, etc.) and the \r\n line
# breaks inside the text are still there.

['I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation.', 'Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity.', 'But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination.', 'One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity.', 'One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland.', "We all have come to this h

In [9]:
# We're going to use the join function:
# it glues the list of paragraph strings into one string, putting a single
# space between consecutive paragraphs.

string_speech = ' '.join(speech_combined)
string_speech

'I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation. Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity. But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination. One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity. One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland. We all have come to this hallowed spot to 

In [None]:
# At this point, we're just cleaning data.
# str.replace returns a new string; this cell only displays that result and
# leaves string_speech itself unchanged.

string_speech.replace('\r\n', ' ')

In [13]:
# Store the version in which every \r\n line break is swapped for one space.
a_lil_clean = string_speech.replace('\r\n', ' ')

In [None]:
# NOTE(review): r"\'" is a literal backslash followed by an apostrophe. The \'
# seen in the displayed string earlier looks like quoting added by the
# notebook's repr rather than real backslashes in the data, so this pattern
# probably matches nothing — confirm.
# Either way the result is not assigned, so a_lil_clean is left unchanged.
a_lil_clean.replace(r"\'", "'")

In [16]:
# Substring check: the contraction should appear with a plain apostrophe.
print("we're" in a_lil_clean)  # Should be True

True


In [17]:
# Substring check: the possessive should appear with a plain apostrophe.
print("God's" in a_lil_clean)  # Should be True


True


In [18]:
# print() shows the text itself, without the surrounding quotes and escape
# sequences that appear when a string is displayed as a bare expression.
print(a_lil_clean)


I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation. Five score years ago a great American in whose symbolic shadow we stand today signed the Emancipation Proclamation. This momentous decree came as a great beckoning light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity. But one hundred years later the Negro is still not free. One hundred years later the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later the Negro is still languishing in the comers of American society and finds himself in exile in his own land. We all have come to this hallowed spot to remind America of the fierce urgency 

In [19]:
import re

In [20]:

# Step 1: Lowercase everything
# (so that e.g. "The" and "the" end up being counted as the same word)
cleaned = a_lil_clean.lower()
cleaned

'i am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation. five score years ago a great american in whose symbolic shadow we stand today signed the emancipation proclamation. this momentous decree came as a great beckoning light of hope to millions of negro slaves who had been seared in the flames of withering injustice. it came as a joyous daybreak to end the long night of their captivity. but one hundred years later the negro is still not free. one hundred years later the life of the negro is still sadly crippled by the manacles of segregation and the chains of discrimination. one hundred years later the negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. one hundred years later the negro is still languishing in the comers of american society and finds himself in exile in his own land. we all have come to this hallowed spot to remind america of the fierce urgency

In [21]:
# Step 2: Remove punctuation, but KEEP apostrophes so contractions and
# possessives ("we're", "god's") survive as single words.
#
# Hyphens and dashes (-, en dash, em dash) are turned into spaces first.
# Simply deleting them would glue the neighbouring words together
# ("self-evident" -> "selfevident"), creating bogus entries in the word counts.
# Extra spaces are harmless when the text is later broken up with split().
cleaned = re.sub(r"[-\u2013\u2014]+", ' ', cleaned)

# Then delete anything that isn't a word character (letter, digit or
# underscore), whitespace, or an apostrophe.
cleaned = re.sub(r"[^\w\s']+", '', cleaned)
cleaned

"i am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation five score years ago a great american in whose symbolic shadow we stand today signed the emancipation proclamation this momentous decree came as a great beckoning light of hope to millions of negro slaves who had been seared in the flames of withering injustice it came as a joyous daybreak to end the long night of their captivity but one hundred years later the negro is still not free one hundred years later the life of the negro is still sadly crippled by the manacles of segregation and the chains of discrimination one hundred years later the negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity one hundred years later the negro is still languishing in the comers of american society and finds himself in exile in his own land we all have come to this hallowed spot to remind america of the fierce urgency of now 

In [22]:
# split() with no arguments breaks the text on any run of whitespace,
# producing a list with one entry per word.
cleaned_words = cleaned.split()

In [None]:
# Show just the first few words as a sanity check; displaying the whole list
# produces a very long output.
cleaned_words[:20]

In [24]:
import pandas as pd

In [25]:
# This looks at every single value, groups them, then counts them.
# DataFrame.value_counts() returns a Series (the counts, indexed by word and
# sorted from most to least frequent), so despite its name `df` is a Series here.

df = pd.DataFrame(cleaned_words).value_counts()
df

0      
the        54
of         49
to         29
and        27
a          20
           ..
jews        1
joyous      1
judged      1
land        1
lookout     1
Name: count, Length: 323, dtype: int64

In [28]:
# Build the word-frequency table in one chain:
#   1. wrap the cleaned words in a one-column DataFrame named 'word'
#   2. value_counts() tallies each distinct word (a Series indexed by word)
#   3. reset_index(name='count') turns that Series back into a DataFrame
#   4. set_axis() spells out the final column names for clarity
df = (
    pd.DataFrame(cleaned_words, columns=['word'])
    .value_counts()
    .reset_index(name='count')
    .set_axis(['word', 'count'], axis=1)
)

# Print the first 10 rows to see the most common words
print(df.head(10))

      word  count
0      the     54
1       of     49
2       to     29
3      and     27
4        a     20
5       in     17
6       be     16
7     will     16
8  freedom     13
9       we     13


In [29]:
# NOTE: this is an absolute path on one specific machine; anyone else running
# the notebook has to point EXPORT_PATH at a folder that exists for them.
EXPORT_PATH = r'C:\Users\jrwie\OneDrive\Desktop\Data Stuffs\Analyst_Builder\Python\Exports\MLKjr_speech_scraped.csv'

# index=False leaves out the automatic 0..n row index, which carries no
# information and would otherwise show up as an unnamed first column in the CSV.
df.to_csv(EXPORT_PATH, index=False)