In [8]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from modules.cleaners import clean_text
import string
pd.options.mode.chained_assignment = None

In [9]:
#create dataframe with year, president and party columns
df = pd.read_csv('US presidents.csv', delimiter=';')
#rename column for easier reading
df.rename(columns={'Years (after inauguration)': 'year'}, inplace=True)

In [10]:
#make a series with the filenames
path = 'speeches/'
files = pd.Series(sorted(os.listdir(path))).map(lambda x: x[:-4])

In [11]:
#method that returns dict with key: year (int) value: speech (str)
def get_text_dict():
    txtdict = {}

    for file in files:
        file_ = path + file + '.txt'
        f = open(file_, encoding='utf8', errors='ignore')
        filetxt = f.read()
        txtdict[int(file)] = filetxt

    return txtdict

In [12]:
text = get_text_dict()

In [13]:
#add series to df with speeches from dict values
speech_list = text.values()
sp = pd.Series(speech_list)
df['speech'] = sp

In [14]:
#check how dataframe looks
dataTypeSeries = df.dtypes
print(dataTypeSeries)

year          int64
President    object
Party        object
speech       object
dtype: object


In [15]:
def clean_text(speech):
    tokens = word_tokenize(speech)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    return words

In [16]:
#runs cleaning method from modules.cleaner on each speech string
#Running this takes very long time, so a subset of the first 5  has been selected
pd.options.mode.chained_assignment = None
df2 = df[:5]
#method cleant_text takes str as arg, fals if df object
df2['speech'] = df2['speech'].astype(str)
df2['speech'] = df2['speech'].map(lambda x: clean_text(x))
#df['speech'] = df.apply(lambda x: clean_text(x['speech']), axis=1)
print(df2)

   year           President       Party  \
0  1900    William McKinley  Republican   
1  1901  Theodore Roosevelt  Republican   
2  1902  Theodore Roosevelt  Republican   
3  1903  Theodore Roosevelt  Republican   
4  1904  Theodore Roosevelt  Republican   

                                              speech  
0  [dictvalues, senate, house, representatives, n...  
1  [dictvalues, senate, house, representatives, n...  
2  [dictvalues, senate, house, representatives, n...  
3  [dictvalues, senate, house, representatives, n...  
4  [dictvalues, senate, house, representatives, n...  


In [24]:
pd.options.display.max_colwidth = 3000
df2.iloc[0,3]

['dictvalues',
 'senate',
 'house',
 'representatives',
 'nat',
 'outgoing',
 'old',
 'incoming',
 'new',
 'century',
 'begin',
 'last',
 'session',
 'fiftysixth',
 'congress',
 'evidences',
 'every',
 'hand',
 'individual',
 'national',
 'prosperity',
 'proof',
 'growing',
 'strength',
 'increasing',
 'power',
 'good',
 'republican',
 'institutions',
 'countrymen',
 'join',
 'felicitation',
 'american',
 'liberty',
 'firmly',
 'established',
 'ever',
 'love',
 'determination',
 'preserve',
 'universal',
 'former',
 'period',
 'history',
 'nthe',
 'republic',
 'never',
 'strong',
 'never',
 'strongly',
 'entrenched',
 'hearts',
 'people',
 'constitution',
 'amendments',
 'exists',
 'left',
 'hands',
 'authors',
 'additions',
 'made',
 'proclaim',
 'larger',
 'freedom',
 'extended',
 'citizenship',
 'popular',
 'government',
 'demonstrated',
 'one',
 'hundred',
 'twentyfour',
 'years',
 'trial',
 'stability',
 'security',
 'efficiency',
 'best',
 'instrument',
 'national',
 'development