# Workshop 1
#### Steven Jiang
#### October 11, 2018

# Scrape Last Statements Text

In [None]:
import requests
from bs4 import BeautifulSoup
from nltk import sent_tokenize

# Sentences from every last statement that could be downloaded and parsed
statement_text = []
# Statement links that could not be fetched or parsed, kept for inspection
failed_links = []

base_url = "http://www.tdcj.state.tx.us/death_row/"
# Without a timeout a stalled connection would hang the notebook forever
request_timeout = 30

death_row = requests.get(base_url + "dr_executed_offenders.html",
                         timeout=request_timeout)
soup = BeautifulSoup(death_row.text, "html.parser")
table = soup.find('table')
rows = table.find_all('tr')
# The first row is skipped (presumably the table header -- verify against the page)
for row in rows[1:]:
    cols = row.find_all('td')
    # The third cell carries the relative link to the statement page
    link = cols[2].find('a').get('href')
    # Links containing "no_last_statement" are not followed
    if "no_last_statement" in link:
        continue
    try:
        statement_page = requests.get(base_url + link, timeout=request_timeout)
        # Treat HTTP error pages as failures instead of parsing them as statements
        statement_page.raise_for_status()
        statement_soup = BeautifulSoup(statement_page.text, "html.parser")
        # The statement is taken from the last <p> element on the page
        text = statement_soup.find_all('p')[-1].text
    except (requests.RequestException, IndexError):
        # Best effort: one unreachable or oddly formatted page should not
        # abort the whole scrape, but remember it instead of hiding it.
        failed_links.append(link)
        continue
    if text != "No statement." and len(text) > 0:
        # Tokenizer errors (e.g. missing NLTK data) are deliberately not
        # caught: they would affect every page and should be visible.
        statement_text += sent_tokenize(text)

# Parse Hard West Turn Text

In [26]:
import io

# io.open decodes the file as UTF-8 while reading on both Python 2 and
# Python 3; calling .decode() on the result of f.read() only works on
# Python 2, where read() returns bytes.
with io.open("hard_west_turn.txt", 'r', encoding="utf-8") as f:
    # Lower-case the whole book, then split it into sentences
    hwt_text = sent_tokenize(f.read().strip().lower())

# Save and Load Text

In [27]:
import pickle

# Build the combined corpus before any file is opened, under the name
# `all_text` that the loading and probability cells use.
all_text = statement_text + hwt_text
# Original name of the combined list, kept as an alias
all_txt = all_text

# Write each sentence list to its own pickle so the scrape and parse steps
# do not have to be repeated.
with open('last_statements_sents.pkl', 'wb') as ls:
    pickle.dump(statement_text, ls)
with open('hwt_text.pkl', 'wb') as hwt:
    pickle.dump(hwt_text, hwt)
with open('all_text.pkl', 'wb') as all_txt_file:
    pickle.dump(all_text, all_txt_file)

In [10]:
import pickle
# Restore the three sentence lists from the pickle files written by the
# save cell. NOTE: pickle.load can execute arbitrary code, so only load
# files that were produced locally.
with open('last_statements_sents.pkl', 'rb') as ls:
    statement_text = pickle.load(ls)
with open('hwt_text.pkl', 'rb') as hwt:
    hwt_text = pickle.load(hwt)
# Combined corpus: last-statement sentences followed by the book's sentences
with open('all_text.pkl', 'rb') as all_txt_file:
    all_text = pickle.load(all_txt_file)

# Generate N-Grams

In [11]:
import numpy as np
import sys
import math

# Get counts of individual words
def gen_unigram_counts(unigram_count, word_list, i):
    """Record one occurrence of word_list[i] in unigram_count, in place.

    unigram_count maps a word to the number of times it has been recorded;
    a word recorded for the first time starts at 1. Returns None.
    """
    word = word_list[i]
    # dict.get replaces `word in unigram_count.keys()`, which on Python 2
    # builds a list of every key and scans it on each call.
    unigram_count[word] = unigram_count.get(word, 0) + 1

# Get counts of bigrams
def gen_bigram_counts(bigram_count, word_list, i):
    """Record one occurrence of the bigram ending at word_list[i], in place.

    Keys are (word, previous_word) tuples -- the current word comes first.
    For the first word (i == 0) the previous word is the boundary symbol "#".
    Returns None.
    """
    # Edge case for first word: there is no predecessor, so use "#"
    previous_word = "#" if i == 0 else word_list[i-1]
    bigram_key = (word_list[i], previous_word)
    # dict.get replaces `key in bigram_count.keys()`, which on Python 2
    # builds a list of every key and scans it on each call.
    bigram_count[bigram_key] = bigram_count.get(bigram_key, 0) + 1

# Get counts of trigrams
def gen_trigram_counts(trigram_count, word_list, i):
    """Record one occurrence of the trigram ending at word_list[i], in place.

    Keys are (word, previous_word, word_before_previous) tuples -- the
    current word comes first. Positions before the start of the list are
    filled with the boundary symbol "#". Returns None.
    """
    # Edge cases for first and second word
    if i == 0:
        context = ("#", "#")
    elif i == 1:
        context = (word_list[0], "#")
    else:
        context = (word_list[i-1], word_list[i-2])
    trigram_key = (word_list[i],) + context
    # dict.get replaces `key in trigram_count.keys()`, which on Python 2
    # builds a list of every key and scans it on each call.
    trigram_count[trigram_key] = trigram_count.get(trigram_key, 0) + 1

# Generate bigram and trigram probability dictionaries
def gen_probs(raw_text):
    """Estimate trigram probabilities from an iterable of sentence strings.

    Each sentence is split on whitespace and padded with the boundary
    symbol "#": two in front (start of sentence) and one behind (stop).

    Returns a dict whose keys are
    (word, previous_word, word_before_previous) tuples and whose values are
    count(key) / count((previous_word, word_before_previous)) as floats.
    Sentences that contain no words are skipped, so they neither raise nor
    add weight to the sentence-start context.
    """
    bigram_count = {}
    trigram_count = {}
    start_context = ("#", "#")
    for sent in raw_text:
        word_list = sent.split()
        if not word_list:
            # Nothing to count; indexing word_list[-1] would fail here
            continue
        # One sentence start: this is the denominator for first words
        bigram_count[start_context] = bigram_count.get(start_context, 0) + 1
        # Padding turns the first-word, second-word and stop-symbol cases
        # into ordinary positions of the same loop.
        padded = ["#", "#"] + word_list + ["#"]
        for i in range(2, len(padded)):
            bigram_key = (padded[i], padded[i-1])
            trigram_key = (padded[i], padded[i-1], padded[i-2])
            bigram_count[bigram_key] = bigram_count.get(bigram_key, 0) + 1
            trigram_count[trigram_key] = trigram_count.get(trigram_key, 0) + 1
    # Calculate probabilities: divide each trigram count by the count of
    # its two-word context. Iterating the dict directly works on both
    # Python 2 and Python 3 (iterkeys() exists only on Python 2).
    trigram_prob = {}
    for key in trigram_count:
        context = (key[1], key[2])
        trigram_prob[key] = trigram_count[key] / float(bigram_count[context])
    return trigram_prob

# Generate a sentence with the trigram model
def gen_sent_trigram(trigram_prob, max_words=20):
    """Sample one sentence from a trigram probability dictionary.

    trigram_prob maps (word, previous_word, word_before_previous) to the
    probability of `word` given the two preceding words, with "#" marking
    both the start and the end of a sentence.

    Sampling starts from the context ("#", "#") and continues until the
    stop symbol "#" is drawn. Returns the sentence with every word followed
    by a single space, or None when no stop symbol was drawn within
    max_words draws or when the current context has no continuation in
    trigram_prob.
    """
    # Begin with the start symbol
    sent = "#"
    # Keep track of last two words: start_key is the most recent one,
    # prev_key the one before it
    start_key = "#"
    prev_key = "#"
    # Randomly choose words until we reach a stop symbol
    num_words = 0
    while True:
        num_words += 1
        if num_words > max_words:
            return None
        # Collect every word seen after the current two-word context.
        # Iterating the dict directly works on Python 2 and Python 3.
        choices = []
        probs = []
        for key in trigram_prob:
            if (key[1], key[2]) == (start_key, prev_key):
                choices.append(key[0])
                probs.append(trigram_prob[key])
        if not choices:
            # np.random.choice raises on an empty list; report "no sentence"
            return None
        prev_key = start_key
        start_key = np.random.choice(choices, p=probs)
        sent = sent + " " + start_key
        if start_key == "#":
            break

    # Drop the leading "# " and the trailing "#"
    return sent[2:-1]

# Function for generating sentences
def gen_sents(prob, n):
    """Make n generation attempts with the trigram model `prob`.

    Returns a list of the attempts that produced a non-empty sentence, in
    the order they were generated; the list can hold fewer than n items.
    """
    attempts = (gen_sent_trigram(prob) for _ in range(n))
    return [candidate for candidate in attempts if candidate]


# Save and Load Probabilities

In [11]:
import pickle

# Build one trigram model per corpus and write each to its own pickle file.

# Model of the last statements only
ls_trigram = gen_probs(statement_text)
with open('ls_trigram.pkl', 'wb') as ls:
    pickle.dump(ls_trigram, ls)

# Model of A Hard West Turn only
hwt_trigram = gen_probs(hwt_text)
with open('hwt_trigram.pkl', 'wb') as hwt:
    pickle.dump(hwt_trigram, hwt)

# Model of both corpora together
all_trigram = gen_probs(all_text)
with open('all_trigram.pkl', 'wb') as all_tri:
    pickle.dump(all_trigram, all_tri)

In [12]:
import pickle
# Restore the three trigram models from their pickle files.
# NOTE: pickle.load can execute arbitrary code, so only load files that
# were produced locally.
with open('ls_trigram.pkl', 'rb') as ls:
    ls_trigram = pickle.load(ls)
with open('hwt_trigram.pkl', 'rb') as hwt:
    hwt_trigram = pickle.load(hwt)
with open('all_trigram.pkl', 'rb') as all_tri:
    all_trigram = pickle.load(all_tri)

# Generate Sentences Using Trigram Probabilities

In [13]:
# Make 15 generation attempts per model. gen_sents keeps only the attempts
# that produced a sentence, so each list may hold fewer than 15 items.
ls_sentences = gen_sents(ls_trigram, 15)
hwt_sentences = gen_sents(hwt_trigram, 15)
all_sentences = gen_sents(all_trigram, 15)

In [14]:
from time import sleep 
import random

def print_sents(sentences, char_delay=0.25, line_delay=1):
    """Write sentences to stdout one character at a time, like typing.

    After each character the function sleeps for a random time between 0
    and char_delay seconds; after each sentence it writes a newline and
    sleeps for a random time between 0 and line_delay seconds.
    Returns None.
    """
    for sent in sentences:
        for c in sent:
            sys.stdout.write(c)
            # Flush so each character appears immediately even when stdout
            # is buffered
            sys.stdout.flush()
            sleep(random.uniform(0, char_delay))
        # An explicit newline works on Python 2 and 3; a bare `print` is a
        # no-op expression on Python 3.
        sys.stdout.write("\n")
        sys.stdout.flush()
        sleep(random.uniform(0, line_delay))

# Print Sentences

In [15]:
# Type out the sentences generated from the last-statements model
print_sents(ls_sentences)

June 25, 2008. 
Kathy, y'all take and y'all look after Aleda and make witnesses say what they have allowed me to be. 
Yes sir. 
I hold nothing against no one else was. 
If I could ask for. 
Be strong, brother. 
Get in church and get in church and get that for myself. 
All of you all, all of my famous legendary brother, Matt Turner, "Y'all kiss my black ass." 
Please understand. 
To my loved ones and my family, grandson, friends. 
Amen. 
When I kill one or pop one, ya'll want to thank you for being here. 


In [16]:
# Type out the sentences generated from the Hard West Turn model
print_sents(hwt_sentences)

when a bullet is the only part of the man. 
victims were found in different areas. 
recently. 
the man of course knew about. 
in sociology, in sociology. 
the man knew what he knew and had. 
there are many risk factors do not like being left­ handed. 
41 this man may never have dreamed. 
of the outdoor west entrance steps, placing them on their own sex, as well as firearms. 
it is illegal for him to decide to leave the library. 
the city was supposed to have launched over a very serious mental illness. 


In [17]:
# Type out the sentences generated from the combined-corpus model
print_sents(all_sentences)

the two­page letter was handwritten. 
You know that I am sorry that I am ready to begin with. 
the man thought to himself a good deal. 
this man was given to thinking of events of national importance. 
this includes taking away guns. 
many counties have laws that protect homosexuals from violence and discrimination. 
the man had many thoughts, few of them are over 200 metres tall, which is a list of friends. 
additionally, additionally, additionally, additionally, additionally, additionally, additionally. 
in the shooting, while six others received wounds requiring hospitalization. 
Lubbock County officials believe I could think of anything else. 
they may be voted upon by the courts. 
by crashing their planes, by crashing their planes, they would kill themselves as a nine. 
It's all good, it's been a nice person. 
I would like to tell you. 
Hector, you too. 


# Motivation

## Last Statements

The Last Statements corpus is a collection of recorded last statements from prisoners on Death Row. The data is publicly available on the website of the Texas Department of Criminal Justice. While I originally had some ethical considerations when deciding whether or not to use this data, I convinced myself that my motivation for doing so was moral. I decided not to use a deep learning approach for text generation because I wanted the output to be intelligible and mirror the style of the text in the corpus. As a result, I decided to take an n-grams approach. The main motivation behind using this corpus is to analyze the general sentiment of these last statements and, perhaps, gain insight into the thoughts, concerns, and qualms going through these people's minds. 

## A Hard West Turn

A Hard West Turn is a computationally generated book by Nick Montfort that is directly based on incidents of violence in recent American history. Using a computationally generated book as a corpus for computational text generation adds a layer of abstraction. I think it raises an interesting question regarding the ownership of the work. 

## Combining Last Statements and A Hard West Turn

I'm somewhat hesitant to make assumptions about how these two corpora are related. However, my motivation for combining these texts was to juxtapose the emotions and thoughts of those affected by violence in America, from both sides of the equation. I was hoping to create a poem that displayed a mixture of emotions; I think the resulting poem succeeded in doing just that.
