# Table of Contents
 <p>

In [None]:
### Part 1 - Scraping The Federalist Papers ###

import requests
from bs4 import BeautifulSoup
import os
import time

# The paper text files are written into the current directory here and are
# listed from the current directory again later, so switch to the corpus
# folder first. The path is relative to wherever the session was started.
os.chdir("Downloads/Project1/documents")

# Download Federalist No. 1-85, saving one text file per paper.
for number in range(1, 86):
    print(number)  # progress indicator
    # Page names and output files use a zero-padded two-digit number.
    ID = "%02d" % number
    r = requests.get(
        "http://avalon.law.yale.edu/18th_century/fed%s.asp" % ID,
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of saving an error page as the
    # text of the paper.
    r.raise_for_status()
    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed; the extracted text may differ slightly between machines.
    s = BeautifulSoup(r.text)
    ps = s.findAll("p")
    with open(ID + ".txt", "w") as f:
        # Keep only paragraph text, one paragraph per line.
        for p in ps:
            t = p.text
            # Presumably the footnote back-link label on the page.
            t = t.replace(" Return to the Text", "")
            # Presumably a mis-decoded character left over from the page.
            t = t.replace("Ã\x95", "")
            f.write(t + "\n")
    # Pause between requests so the server is not hit in a tight loop.
    time.sleep(30)


# Collect the author attribution of each paper from the table on the
# congress.gov page: skip the first row and take the third cell's text
# from every remaining row.
author = []
page = requests.get("https://www.congress.gov/resources/display/content/The+Federalist+Papers")
soup = BeautifulSoup(page.text)
table = soup.find("table", {"class": "confluenceTable"})
author.extend(
    tr.findAll("td")[2].text
    for tr in table.findAll("tr")[1:]
)

In [None]:
### Part 2 - Mimicking the three Federalist authors ###


import nltk
import numpy

# Fetch the Punkt tokenizer data (used by nltk.tokenize.word_tokenize below).
nltk.download('punkt')

def draw_word(distrn):
    """Draw one sample from a frequency distribution, weighted by count.

    Args:
        distrn: Mapping from sample to count (for example an
            ``nltk.FreqDist``).

    Returns:
        One key of *distrn*, chosen with probability proportional to its
        count. The key object itself is returned, so strings come back as
        plain ``str`` and tuple keys are supported.

    Raises:
        ValueError: If *distrn* is empty.
    """
    words = list(distrn)
    if not words:
        raise ValueError("cannot draw from an empty distribution")
    freqs = list(distrn.values())
    total = sum(freqs)
    probs = [freq / total for freq in freqs]
    # Sample a position rather than the keys themselves: handing the keys to
    # numpy would coerce them into an array (turning str into numpy.str_ and
    # tuple keys into a 2-D array that choice() rejects).
    index = numpy.random.choice(len(words), p=probs)
    return words[index]

def generate_with_trigrams(text, word=None, num=100):
    """Print pseudo-random text imitating *text* with a trigram model.

    The text is tokenized with NLTK and every new word is drawn from the
    frequency distribution of the words that followed the previous two
    words in *text*. The words are printed separated by spaces and the
    output is terminated with a newline; nothing is returned.

    Args:
        text: Source text to imitate.
        word: Optional seed. ``None`` draws the first word from the unigram
            frequencies and the second from the bigram frequencies. A single
            word is used as the first word and the second is drawn from the
            bigram frequencies. If the seed holds two or more
            whitespace-separated words, the first two are the starting pair.
        num: Maximum number of words to print. Fewer are printed when the
            current word pair has no recorded successor in *text*.

    Raises:
        ValueError: If the seed is empty or whitespace only, or if the
            first word has no successor in *text*.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    cfdist = nltk.ConditionalFreqDist(
        ((w0, w1), w2) for w0, w1, w2 in nltk.trigrams(tokens)
    )

    seed = [] if word is None else word.split()
    if word is not None and not seed:
        raise ValueError("seed must contain at least one word")

    if len(seed) >= 2:
        prev, word = seed[:2]
    else:
        # Zero or one seed word: the second word comes from the bigram model.
        bigram_cfd = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
        prev = seed[0] if seed else draw_word(nltk.FreqDist(tokens))
        followers = bigram_cfd[prev]
        if not followers:
            raise ValueError("%r has no successor in the text" % (prev,))
        word = draw_word(followers)

    print(prev, end=' ')
    for _ in range(1, num):
        print(word, end=' ')
        successors = cfdist[(prev, word)]
        if not successors:
            # Dead end: this pair was never followed by anything in the text
            # (e.g. its last two tokens, or an unseen two-word seed), so stop
            # instead of failing while sampling from an empty distribution.
            break
        prev, word = word, draw_word(successors)
    # Finish the line so consecutive calls do not run together.
    print()


# Concatenate the papers of each author into one long training text.
# Files are taken from the current directory in sorted name order and are
# matched to the author list by position.
docnames = sorted(name for name in os.listdir() if name.endswith(".txt"))
N = len(docnames)

# Papers whose listed author is not exactly one of these three names are
# left out of every text.
pieces = {"Hamilton": [], "Madison": [], "Jay": []}
for i, name in enumerate(docnames):
    with open(name, 'r') as f:
        bucket = pieces.get(author[i])
        if bucket is not None:
            # a trailing space keeps adjacent papers from fusing together
            bucket.append(f.read() + " ")

hamilton = "".join(pieces["Hamilton"])
madison = "".join(pieces["Madison"])
jay = "".join(pieces["Jay"])

# size of each merged text, in characters
len(hamilton)
len(madison)
len(jay)


# one generated sample per author, each seeded with the same first word
generate_with_trigrams(hamilton, "The")
generate_with_trigrams(madison, "The")
generate_with_trigrams(jay, "The")

In [None]:
### Part 3 - Creating data frame of token frequencies ###

import pandas as pd
import re

# Build one token-frequency table per document, in docnames order.
N = len(docnames)
# Strings removed from every document before counting: the salutation, the
# signature, and a stray character sequence (presumably mis-decoded text).
unwanted = ("To the People of the State of New York:", "PUBLIUS", "Ã¥")
tables = []
for name in docnames:
    with open(name, 'r') as f:
        doc = f.read()
    for fragment in unwanted:
        doc = doc.replace(fragment, "")
    # drop all digit runs, then lowercase so counts are case-insensitive
    doc = re.sub("[0-9]+", "", doc).lower()
    tables.append(nltk.FreqDist(nltk.tokenize.word_tokenize(doc)))

# One row per document, one column per token; a token missing from a
# document comes out as NaN.
df = pd.DataFrame(tables)

# fill in zeros
df.fillna(0, inplace=True)

# Divide every row by its total so each row holds relative frequencies.
# Done as a single vectorised operation: this avoids a Python-level loop
# over rows and avoids assigning lists of floats into columns that may still
# be integer-typed (tokens present in every document never get a NaN).
# A document with no tokens yields NaN in its row rather than an error.
df = df.div(df.sum(axis=1), axis=0)

df.iloc[0] # check a row to make sure it worked

# write as csv
df.to_csv("../federalist.csv")












# write authors as well: an "author" header, then one name per line
with open("../authors.csv", "w") as f:
    f.write("author\n")
    f.writelines(a + "\n" for a in author)

# and write tokens: a "token" header, then every column label of df as a
# quoted field, one per line
with open("../tokens.csv", "w") as f:
    f.write("token\n")
    for t in df.columns:
        # Double any embedded quote so a token containing '"' still forms a
        # valid quoted CSV field.
        f.write('"' + t.replace('"', '""') + '"\n')