# PREPROCESSING


In [0]:
#Import libs
import pickle
import nltk
nltk.download('punkt')

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f2/64/a1df4440483df47381bbbf6a03119ef66515cf2e1a766d9369811575454b/pyspark-2.4.1.tar.gz (215.7MB)
[K    100% |████████████████████████████████| 215.7MB 97kB/s 
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K    100% |████████████████████████████████| 204kB 29.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/47/9b/57/7984bf19763749a13eece44c3174adb6ae4bc95b920375ff50
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.1


In [0]:
with open('speeches.pkl', 'rb') as file:
    data = pickle.load(file)

FileNotFoundError: ignored

In [0]:
data

# 6A and 6B


Using NLTK's Punkt sentence tokenizer, we are using their "pretrained unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences." This satisfies the nature of the part (a) 

In [0]:
'''Helper functions along with the transform() function which returns a list of 
tuples sorted by year. Each tuple contains the SOU year, Sentence Count, and 
Mean Sentence Length'''

'''sentencing_data() returns a list of sentences for a given body of text. 
Iterating through each row in the original data file, we produce a list of 
sentences for each SOU'''

def sentencing_data(str):
  d = str.replace('\r\n\r\n', ' ') #cleans str
  sent_list = nltk.tokenize.sent_tokenize(d)
  return sent_list

def SentLen(str):
    spl = str.split(' ')  #finding the number of words using split()
    return len(spl)
  
def avgSentLen(list):
    sum = 0
    for i in range(0, len(list)):
      sum += SentLen(list[i])
    return sum/len(list)

def totalWords(list):
  sum = 0
  for i in range(0, len(list)):
    sum += SentLen(list[i])
  return sum

def transform(data):
    tup_List = []
    for i in range(0, len(data)):
        sentData = sentencing_data(data[i][1])
        year = int(data[i][2])
        numSent = len(sentData)
        pres = data[i][0] #find name of President
        tup_List.append((year, pres, numSent, round(avgSentLen(sentData),2), totalWords(sentData)))

    return sorted(tup_List, key=lambda tup: tup[0])

In [0]:
a = transform(data)
print(a)


In [0]:
#Decided to go the Pandas df route
df = pd.DataFrame(a, columns=['Year', 'President', 'Sentences Count', 'Avg Length', 'Total Words'])
print(df)

In [0]:
x = np.array(df['Year'])
Sent_Count= np.array(df['Sentences Count'])
avg_length= np.array(df['Avg Length'])


def rplot(X,Y,col):
    M = np.vstack([X,np.ones(len(X))]).T
    m, b = np.linalg.lstsq(M,Y)[0]
    
    plt.figure()
    plt.plot(X, Y,'+', c = '0.5')
    F = m*X+b
    plt.plot(X, F,'b', label = col)
    plt.title(col)
    
    plt.show()
    print('Slope: ',round(m,4) , '  Intercept: ', round(b,4)) # print slope and intercept below plot
    
rplot(x,Sent_Count,"Sentences Count")
rplot(x,avg_length,"Avg Length")
    

From the plots, we observe that the number of sentences in SOU addresses have increased over time (m = .7855). In addition, the average sentence length has decreased over time (m = -.1192). An intuitive explanation for this trend is that Presidents (and their speech-writers) have incorporated brevity into the speeches as the years have gone by. Instead of possibly having many run-on sentences that have more than 30 words, we see many presidents after 1950 havng a mean sentence length under 25 words. 

# **6C**

In [0]:
df1 = df[:123]
df2 = df[123:]


In [0]:
yr_pre1912 = np.array(df1['Year'])
tw_pre1912 = np.array(df1['Total Words'])

yr_post1912 = np.array(df2['Year'])
tw_post1912 = np.array(df2['Total Words'])

In [0]:
rplot(yr_pre1912, tw_pre1912,"Pre-1912")
rplot(yr_post1912, tw_post1912,"Post-1912")
    

Prior to 1912, we see an increase in total words (i.e. an increase of 141 words per unit increase in year via our regression).

After 1912, we see no significant change in increase of total words.

In 1913, Woodrow Wilson brought back in-person delivery. Due to time constraints and speech-writers' preferences, this may explain the lack of substantial increase in words.

(Source: https://history.house.gov/Institution/SOTU/List/)

# 6D

To find which President has the longest sentences on average, we must consider every SOU that the President delivered. Thus, we will use a dictionary to store the *total average sentence length* , i.e. the aggregate total words divided by the aggregate number of sentences

In [0]:
d = {}

for i in df['President'].unique():
    totSent = 0
    totWords = 0
    for j in df[df['President']==i].index:
        totWords += df['Total Words'][j]
        totSent += df['Sentences Count'][j]
    totavgSent = round((totWords/totSent), 2)
    d[i] = totavgSent

d

In [0]:
max_pres = max(d, key=d.get)
min_pres = min(d, key=d.get)

print("The President with the longest sentences on average:   ", max_pres)
print("The President with the shortest sentences on average:   ", min_pres)

In [0]:
#converts dict into list of lists, sorted by avg sent length

temp = []
dlist = []
for key, value in d.items():
    temp = [key,value]
    dlist.append(temp)

from operator import itemgetter
final = sorted(dlist, key=itemgetter(1))
final



In [0]:
import math
#25th percentile

print("The median president is:  ", final[20][0])
q1 = math.ceil(.25 * len(final))
q2 = math.ceil(.75 * len(final))
print("The 25th percentile is:  ", final[q1-1][0])
print("The 75th percentile is:  ", final[q2-1][0])

**Median**: Benjamin Harrison

The 25th percentile is:   Franklin D. Roosevelt

The 75th percentile is:   Zachary Taylor


In [0]:
# sentencing_data defined above in cell with all functions
#intialize 
max_sent = 44  # based off longest avg sent length from Madison
min_sent = 15
smin = ''
smax = ''
for i in range(0, len(data)):
    sentData = sentencing_data(data[i][1])
    for j in range(0, len(sentData)):
        if SentLen(sentData[j]) < min_sent and SentLen(sentData[j]) > 2 :   #We set this as 2 because there are many names with abbreviations which count as sentences
            min_sent = SentLen(sentData[j])
            smin = sentData[j]
        if SentLen(sentData[j]) > max_sent:
            max_sent = SentLen(sentData[j])
            smax = sentData[j]


print('The longest sentence:   ',smax)

print('The shortest sentence:  ', smin)

                

In [0]:
def clean_and_split(s):
  # encode to UTF-8, convert to lowercase and translate all hyphens and
  # punctuation to whitespace
  s = s.encode('utf-8').lower().replace('-',' ').translate(None, string.punctuation)
  # replace \r\n
  s = re.sub('(\r\n)+',' ', s)
  # replace whitespace substrings with one whitespace and remove
  # leading/trailing whitespaces
  s = re.sub(' +',' ',s.strip())
  return s.split(' ')
