# Hist 3368 - Week 10: Measuring Distinctiveness with Log Likelihood 

# For teaching purposes

#### Demonstrating word vectors

In [11]:
# A one-document "corpus": the tongue twister as a single string inside a list,
# which is the shape CountVectorizer.fit_transform expects (one string per document).
mystring = [
    "Peter Piper picked a peck of pickled peppers. "
    "A peck of pickled peppers Peter Piper picked. "
    "If Peter Piper picked a peck of pickled peppers, "
    "Where's the peck of pickled peppers Peter Piper picked?"
]

In [22]:
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Count every word in the toy corpus (no stop-word removal, so nothing is dropped for being common).
vectorizer = CountVectorizer(stop_words=None)

# fit_transform learns the vocabulary and returns one row of counts per document.
vectors = vectorizer.fit_transform(mystring)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name only on installations that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert to plain strings so the vocabulary prints as a simple list of words.
print([str(name) for name in feature_names])
print(vectors.toarray())

['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']
[[1 4 4 4 4 4 4 4 1 1]]


## Summary of Notebook

#### Import Software

In [None]:
#import some software
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable tokenizer that splits text with NLTK's word_tokenize and lemmatizes each token.

    An instance can be passed as the ``tokenizer=`` argument of CountVectorizer.
    """

    def __init__(self):
        # One WordNet lemmatizer is created per tokenizer and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the text in ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))
import adjustText
import matplotlib
import matplotlib.pyplot as plt

#### Load some Data

In [None]:
cd /scratch/group/history/hist_3368-jguldi

In [None]:
# File to analyse, read from the current working directory.
# Swap in "eighties_data.csv" to work with the alternative data set.
congress_csv = "congress1967-2010.csv"
congress = pd.read_csv(congress_csv)

#### Cleaning

In [None]:
# clean up congress
# Keep only the 1968 rows, then clean a copy so `congress` still holds the original speech text.
congress = congress[congress['year'] == 1968]
clean_congress = congress.copy()

# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern as a
# literal string by default, which would silently turn every step below into a no-op.
# Raw strings (r'...') keep Python from interpreting \w, \s, \d and \b as string escapes.
clean_congress['speech'] = (
    clean_congress['speech']
    .str.replace(r'[^\w\s]', '', regex=True)       # remove punctuation
    .str.replace(r'\d+', '', regex=True)           # remove digits
    .str.replace(r'(\b\w{1}\b)', '', regex=True)   # remove one-character words
)

In [None]:
# First five rows of the filtered, uncleaned data
congress.head(5)

In [None]:
# First five rows after cleaning, for comparison with the preview of `congress`
clean_congress.head(5)

#### Format Data Around Units of Interest With One String per Unit

In [None]:
# format the data around our research into speakers
top_speakers = clean_congress.groupby('speaker').agg({'speech': ' '.join, 'word_count': 'sum'}).sort_values('word_count', ascending = False)[:10]
top_speakers = top_speakers[top_speakers.index != 'The PRESIDING OFFICER']

In [None]:
# Display the per-speaker table: index = speaker, columns = joined speech text and summed word_count
top_speakers

#### Make Word Vectors -- One for Each Unit of Interest

In [None]:
# Configure the word counter, then count terms once per speaker.
vectorizer = CountVectorizer(
    analyzer="word",         # count word-level tokens
    lowercase=True,          # fold case before counting
    stop_words='english',    # use scikit-learn's built-in English stop-word list
    ngram_range=(1, 2),      # count single words and two-word phrases
    max_features=10000,      # cap the vocabulary at 10,000 terms
    # tokenizer=LemmaTokenizer()   # optional: uncomment to lemmatize tokens
)

# One document per speaker -> one row of counts per speaker.
speaker_texts = top_speakers['speech']
vectorized = vectorizer.fit_transform(speaker_texts)
vectorized

#### Make the Word Vectors Readable

In [3]:
# Vocabulary in column order.  get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    all_words = np.asarray(vectorizer.get_feature_names_out())
else:
    all_words = np.array(vectorizer.get_feature_names())

# Speaker labels in row order (the index of top_speakers).
speaker_names = list(top_speakers.index)

# Turn the sparse count matrix into a labelled table: one row per speaker, one column per term.
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
vectors_dataframe = pd.DataFrame(
    vectorized.toarray(),
    columns=all_words,
    index=speaker_names,
)
vectors_dataframe

NameError: name 'np' is not defined

#### Compute some baseline numbers about the data

In [None]:
# Column sums: how many times each term occurs across all speakers.
word_totals = vectors_dataframe.sum(axis=0)

# Row sums: how many counted terms each speaker produced.
speaker_words_total = vectors_dataframe.sum(axis=1)

# Grand total of every count in the table.
total_corpus_words = word_totals.sum()

#### Measure distinctiveness with log likelihood

In [None]:
## one list of (LL score, word) tuples per speaker, in the same order as speaker_names
speakers_loglikelihood = []

# word_totals is labelled by word and speaker_words_total by speaker name, so looking
# them up with an integer position through [] relies on a deprecated pandas fallback.
# Plain NumPy arrays make the positional lookup explicit.
word_totals_by_position = word_totals.to_numpy()
speaker_totals_by_position = speaker_words_total.to_numpy()

## loop through every speaker in speaker_names
for speaker_id, speaker in enumerate(speaker_names):
    loglikely = []
    # loop through the column positions of the terms stored for this speaker
    for word_id in vectorized[speaker_id].indices:

        # 2x2 contingency table for (this word vs. other words) x (this speaker vs. other speakers)
        a = vectors_dataframe.iat[speaker_id, word_id]      # word in speaker
        b = word_totals_by_position[word_id] - a            # word in remaining speakers
        c = speaker_totals_by_position[speaker_id] - a      # not word in speaker
        d = total_corpus_words - a - b - c                  # not word in remaining speakers

        # expected counts of the word if it were spread evenly across the corpus
        E1 = (a + c) * (a + b) / total_corpus_words         # expected in this speaker
        E2 = (b + d) * (a + b) / total_corpus_words         # expected in the remaining speakers

        LL = 2 * (a * np.log(a / E1))  # the log likelihood equation
        if b > 0:  # skip the second term when no one else uses the word (log(0) is undefined)
            LL += 2 * b * np.log(b / E2)

        loglikely.append((LL, all_words[word_id]))  # record (score, word) for this speaker

    # once all of one speaker's words are scored, rank them from highest to lowest LL
    loglikely = sorted(loglikely, reverse=True)
    speakers_loglikelihood.append(loglikely)  # add on another speaker


In [None]:
# Report header
print("The 20 most distinctive words of each speaker. The words are listed from high to low ranking")
print("-------------------------------------------\n")

# One line per speaker: the name followed by that speaker's 20 highest-scoring words.
for position, speaker in enumerate(speaker_names):
    top_scored = speakers_loglikelihood[position][:20]
    distinct_words = [label for _score, label in top_scored]  # keep the word, drop the score
    print(f"{speaker}: {distinct_words}")
    print("\n-----------------------------\n")

## Visualizing the distinctiveness of the language of each speaker

Note that we're going to make a big plot.

***This might take a minute or two to run.***

In [None]:
cd ~/digital-history

In [None]:
%matplotlib inline
from adjustText import adjust_text

# change the figure's size here
plt.figure(figsize=(10,10), dpi = 500)

# style
plt.style.use('seaborn-darkgrid') # this gives us a grid with a dark background.  you can play with this to change the style.
  
# create a color palette
palette = plt.get_cmap('hsv') # this tells matplotlib what colors to use.  you can play with this to change the colors.

# start a counter at 0
num = 0

# create an empty list
texts = []

# this is the for loop that creates multiple plots.  
for i, speaker in enumerate(speaker_names):
        num += 14 # num tells the plot to choose a different color for each speaker
        distinct_words = speakers_loglikelihood[i][:20] # plot the top twenty words by LL-score
        for word in distinct_words: # for each word-per-speaker instance, plot the ll_score on the y axis
            ll_score = word[0] # find just the ll-score from speakers_loglikelihood
            word_label = word[1] # find just the keyword name from speakers_loglikelihood
            plt.scatter( # draw a dot for each word
                    speaker, # with speaker on the x axis
                    ll_score, # and ll_score on the y axis
                    color=palette(num), # using a different color for each speaker
                    linewidth=1, 
                    edgecolors = 'b',
                    s = 55, # size
                    alpha=0.3, # make the dots slightly transparent
                    label=speaker) # label each dot with the name of the word
            texts.append(plt.text(speaker, ll_score, word_label)) # save these coordinates to be used in labeling

# Add legend
plt.xticks(rotation=90)
 
# Add titles
plt.title("Figure 1: Highest Log-Likelihood Scored Words per Speaker", fontsize=30, fontweight=0, color='Red')
plt.xlabel("Speaker")
plt.ylabel("Distinctiveness of Words, Measured by LL Score")

# Code to help with overlapping labels -- may take a minute to run
adjust_text(texts, force_points=0.2, force_text=0.2,
            expand_points=(1, 1), expand_text=(1, 1),
            arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

# save it
fig1 = plt.gcf()
plt.show()
plt.draw()
fig1.savefig('LL-fig1.jpg', dpi=500)

### A 2-D comparison of two speakers

#### Create a dataframe from just two speakers

***The x coordinate will be how distinctive each word is for Javits; the y coordinate will be how distinctive each word is for Long.  Change the speakers in question by changing `n` in `speakers_loglikelihood[n]`.***

In [None]:
# create a new dataframe of the scores and words from both Javits and Long

def ll_scores_frame(scored_words, score_column):
    """Turn one speaker's list of (LL score, word) tuples into a two-column DataFrame.

    scored_words : list of (score, word) tuples, as stored in speakers_loglikelihood[n]
    score_column : name to give the score column (e.g. 'x_llscore')

    Returns a DataFrame with columns ['word_label', score_column].  Rows are in the
    reverse of the input order, matching the frame the earlier row-by-row version
    of this cell produced.  The frame is built in one step instead of appending
    and re-sorting for every word.
    """
    frame = pd.DataFrame(scored_words[::-1], columns=[score_column, 'word_label'])
    return frame[['word_label', score_column]]

# get all the words from JAVITS (change the 0 to pick a different speaker for the x axis)
xcoords = ll_scores_frame(speakers_loglikelihood[0], 'x_llscore')

# get all the words from LONG (change the 1 to pick a different speaker for the y axis)
ycoords = ll_scores_frame(speakers_loglikelihood[1], 'y_llscore')

# keep only the words that both speakers use, with one score column per speaker
coords = pd.merge(xcoords, ycoords, on='word_label')
coords = coords.dropna(axis=0, how='any')  # drop rows with any NA's

In [None]:
#!pip install adjustText --user

In [None]:
%matplotlib inline

from adjustText import adjust_text

# change the figure's size here
plt.figure(figsize=(10,10), dpi = 500)

# label each dot with the name of the word -- note that we have to use a "for" loop for this to work; plt.annotate only plots
# one label per iteration!
for i, txt in enumerate(coords['word_label']):
    # draw a dot for each word
    plt.scatter( 
        coords['x_llscore'][i], #x axis
        coords['y_llscore'][i], # y axis
        linewidth=1, 
        s = 55, # dot size
        alpha=0.2)  # dot transparency
    # make a label for each word
    plt.annotate(
        txt, 
        (coords['x_llscore'][i], # one x llscore at a time
         coords['y_llscore'][i]), # one y llscore at a time
        alpha=0.3 # i've made the fonts transparent as well.  you could play with color and size if you wanted to. 
    )

plt.xticks(rotation=90)

# logarithmic axes make big things big and small things small
plt.xscale('log')
plt.yscale('log')  

# Add titles
plt.title("Figure 2: Highest Log-Likelihood Scored Words per Speaker", fontsize=30, fontweight=0, color='Red')
plt.xlabel("How Distinctive Each Word is of Mr. Javits")
plt.ylabel("How Distinctive Each Word is of Mr. Long")


# save it
fig1 = plt.gcf()
plt.show()
plt.draw()
fig1.savefig('LL-fig2.jpg', dpi=500)

# From Data to Analysis

In [None]:
cd /scratch/group/history/hist_3368-jguldi

In [None]:
# `congress` already holds the 1968 rows with their original speech text (the cleaning
# cell filtered it and cleaned only a copy), so reuse it rather than re-reading the
# full CSV from disk.  The year filter is repeated only as a harmless safeguard.
congress = congress[congress['year'] == 1968]

# Keep just the speeches given by the speakers selected in top_speakers.
top_speakers_speeches = congress[congress['speaker'].isin(top_speakers.index)]

#### Setup: Make a KWIC Dictionary for a Single Speaker

In [None]:
import string

# Select this one speaker's speeches.
is_long = top_speakers_speeches['speaker'] == "Mr. LONG of Louisiana"
long_speech_series = top_speakers_speeches[is_long]['speech']

# Join them into a single lower-cased string ...
long_text = ' '.join(long_speech_series).lower()

# ... and delete every ASCII punctuation character.
strip_punctuation = str.maketrans('', '', string.punctuation)
long_speeches = long_text.translate(strip_punctuation)

Add some functions for defining ngrams 

In [None]:
def getNGrams(wordlist, n):
    """Return every run of n consecutive items in wordlist, in order.

    Each n-gram is a slice of wordlist.  When wordlist has fewer than n items
    the result is an empty list.
    """
    window_count = len(wordlist) - (n - 1)
    return [wordlist[start:start + n] for start in range(window_count)]

def nGramsToKWICDict(ngrams):
    """Index n-grams by their middle word (keyword in context).

    ngrams : list of equal-length sequences, e.g. the output of getNGrams.

    Returns a dict mapping each middle item to the list of n-grams that have it
    at position len(ngram) // 2, in their original order.  An empty input gives
    an empty dict instead of raising IndexError.
    """
    # Nothing to index (for example, the text was shorter than one n-gram).
    if len(ngrams) == 0:
        return {}

    # The keyword position is taken from the first n-gram and used for all of them.
    keyindex = len(ngrams[0]) // 2

    kwicdict = {}
    for k in ngrams:
        kwicdict.setdefault(k[keyindex], []).append(k)
    return kwicdict


def prettyPrintKWIC(kwic):
    """Format one n-gram as a keyword-in-context line.

    The words before the middle word are right-justified in a field of 20
    characters per word, the middle word is centred with three spaces of
    padding on each side, and the words after it follow, space-separated.
    """
    column_width = 20
    mid = len(kwic) // 2
    keyword = kwic[mid]

    left_context = ' '.join(kwic[:mid]).rjust(column_width * mid)
    centred_keyword = str(keyword).center(len(keyword) + 6)
    right_context = ' '.join(kwic[mid + 1:])

    return left_context + centred_keyword + right_context



In [None]:
# Number of words in each context window; the keyword is the word at position kwic_window // 2.
kwic_window = 20

# Split the cleaned text on whitespace, slide the window across it,
# and index every window by its middle word.
fullwordlist = long_speeches.split()
ngrams = getNGrams(fullwordlist, kwic_window)
worddict = nGramsToKWICDict(ngrams)

#### Doing Research With Log Likelihood and KWIC

In [None]:
# Peek at the first five (LL score, word) tuples for the speaker at position 1 of speaker_names
speakers_loglikelihood[1][:5]

In [None]:
# output KWIC for target word
# worddict is keyed by single words taken from the lower-cased, punctuation-free text,
# so the target must be one lower-case word.
target = 'gun'

if target in worddict:
    # one formatted context line per occurrence of the target
    kwic_lines = [prettyPrintKWIC(k) for k in worddict[target]]
else:
    kwic_lines = ['Keyword not found in source']

# Put each occurrence on its own line and print it, so the keywords line up in a column
# instead of being shown as one long quoted string.
outstr = '\n'.join(kwic_lines)
print(outstr)