In [None]:
import math
import pandas as pd
import re
from IPython.display import display, HTML
import math
import graphviz

In [None]:
def calculate_keyness(fdist1, fdist2, fthreshold=5, keyness_threshold=6.6, top=100, print_table=True):
    '''create a keyness comparison table from two frequency lists

    Args:
        fdist1            -- mapping of item -> frequency for corpus A
        fdist2            -- mapping of item -> frequency for corpus B (the reference)
        fthreshold        -- minimum frequency an item needs in BOTH corpora to be scored (default: 5)
        keyness_threshold -- only rows whose keyness is strictly greater than this are kept (default: 6.6)
        top               -- maximum number of rows to return/print (default: 100)
        print_table       -- True: print a text table and return None; False: return the data frame

    Returns:
        when print_table is False, a DataFrame with columns item, freq, ref_freq, keyness
        sorted by descending keyness and cut to the first `top` rows; otherwise None.
        If no item passes the thresholds the table is empty (only the header is printed).

    NOTE(review): the filter compares the signed keyness with keyness_threshold, so with a
    positive threshold items that are key for corpus B (negative scores) are dropped --
    confirm that this is intended.
    '''
    columns = ['item', 'freq', 'ref_freq', 'keyness']

    c1size = sum(fdist1.values())
    c2size = sum(fdist2.values())

    kdata = []

    for item, freq in fdist1.items():
        if freq < fthreshold:
            continue

        ref_freq = fdist2.get(item, 0)
        if ref_freq < fthreshold:
            continue

        keyness = log_likelihood(freq, c1size, ref_freq, c2size)

        if keyness > keyness_threshold:
            kdata.append({'item': item, 'freq': freq, 'ref_freq': ref_freq, 'keyness': keyness})

    # Passing the column names explicitly keeps the frame well-formed when kdata is
    # empty; selecting the columns afterwards raised KeyError in that case.
    kdf = pd.DataFrame(kdata, columns=columns)

    kdf = kdf.sort_values('keyness', ascending=False)

    if not print_table:
        return kdf[:top]

    template = "{: <25}{: <10}{: <10}{:0.3f}"

    header = "{: <25}{: <10}{: <10}{}".format('WORD', 'A Freq.', 'B Freq.', 'Keyness')

    print("{}\n{}".format(header, "="*len(header)))

    for item, freq, ref_freq, keyness in kdf[:top].values:
        print(template.format(item, freq, ref_freq, keyness))

In [None]:
def log_likelihood(item_A_freq, corpus_A_size, item_B_freq, corpus_B_size):
    '''calculate the log likelihood score for a comparison between the frequency of two items

    Args:
        item_A_freq   -- observed frequency of the item in corpus A
        corpus_A_size -- total number of tokens in corpus A
        item_B_freq   -- observed frequency of the item in corpus B
        corpus_B_size -- total number of tokens in corpus B

    Returns:
        the G2 statistic, made negative when the relative frequency of the item is
        lower in corpus A than in corpus B (equal relative frequencies count as positive)
    '''
    total_freq = item_A_freq + item_B_freq
    total_size = corpus_A_size + corpus_B_size

    # expected frequencies if the item were spread evenly over both corpora
    E1 = corpus_A_size*total_freq / total_size
    E2 = corpus_B_size*total_freq / total_size

    def _term(observed, expected):
        # an observed frequency of 0 contributes nothing (0*log(0) is taken as 0);
        # calling math.log(0) here used to raise ValueError
        if observed <= 0:
            return 0.0
        return observed*math.log(observed/expected)

    G2 = 2*(_term(item_A_freq, E1) + _term(item_B_freq, E2))

    sign = 1 if (item_A_freq / corpus_A_size) >= (item_B_freq / corpus_B_size) else -1

    return sign*G2

In [None]:
def plot_keyitems(df, num=10, c1='red', c2='blue', corpusA='corpus A', corpusB='corpus B'):
    '''create a horizontal bar plot of top/bottom N items in a keyness table
    
    Args:
        df              -- a data frame with the columns item and keyness, e.g. as returned by
                           calculate_keyness(..., print_table=False); the first num and the
                           last num rows are plotted in the order they appear in df
        num             -- the number of rows to take from the top and from the bottom (default: 10)
        c1/c2           -- bar color for the first num rows (c1) and the last num rows (c2)
        corpusA/corpusB -- labels/names of corpora used in the two annotations
        
    Returns:
        the matplotlib Axes object returned by the pandas plot call
    '''
    # DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
    tb_df = pd.concat([df.head(num), df.tail(num)])
    
    # int(num/10)*5 is 0 for num < 10, which would request a zero-height figure,
    # so fall back to a small minimum height (values for num >= 10 are unchanged)
    yh = max(int(num/10)*5, 3)
    
    colors = [c1]*num + [c2]*num
    
    ax = tb_df.set_index('item')['keyness'].plot(kind='barh', zorder=2,
                                        figsize=(8, yh),
                                        color=colors, alpha=0.5, width=0.75)
    
    # Despine
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    
    # Draw vertical axis lines
    for tick in ax.get_xticks():
        ax.axvline(x=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

    ax.set_xlabel("Keyness", labelpad=20, weight='bold', size=12)

    # Set y-axis label
    ax.set_ylabel("")

    # annotation positions are given in data coordinates (x = keyness, y = bar position)
    ax.annotate(f'Distinctive items\nin {corpusB}', (10,num+num/2), color=c2)
    ax.annotate(f'Distinctive items\nin {corpusA}', (-10,num/2), ha='right', color=c1)

    return ax

In [None]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''split a string into whitespace-separated tokens, optionally normalizing it first
    
    Args:
        text        -- the string to tokenize
        lowercase   -- lowercase the text before splitting (default: False)
        strip_chars -- characters to delete from the text before splitting,
                       e.g. punctuation (default: empty string)
        
    Return:
        A list of tokens
    '''
    normalized = text.lower() if lowercase else text
    
    # translation table whose only effect is deleting every character in strip_chars
    deletion_table = str.maketrans('', '', strip_chars)
    
    return normalized.translate(deletion_table).split()

In [None]:
def get_ngram_tokens(tokens, n=1):
    '''build the list of n-grams found in a list of tokens
    
    Args:
        tokens -- a list of tokens
        n      -- number of consecutive tokens per n-gram
        
    Returns:
        
        a list with one space-joined string per window of n consecutive tokens;
        when n is below 2 or larger than the number of tokens, the tokens list
        itself is returned unchanged
    '''
    total = len(tokens)
    
    if not (2 <= n <= total):
        return tokens
    
    return [" ".join(tokens[start:start+n]) for start in range(total-n+1)]

In [None]:
def make_kwic(kw, text, win=4):
    '''A basic KWIC function for a text
    
    Args:
        kw   -- keyword; a line is produced for every token equal to it
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side (default: 4)
        
    Return:
        list of lines of form [ [left context words], kw, [right context words]];
        both context lists always have exactly win entries, padded with empty
        strings (on the left for the left context, on the right for the right
        context) when the hit is close to the start or end of the text
    '''
    lines = []
    
    for idx, token in enumerate(text):
        if token != kw:
            continue
        
        # clamp the start at 0: a negative start index would wrap around to the
        # end of the text and lose (or garble) the left context of early hits
        left = list(text[max(0, idx-win):idx])
        right = list(text[idx+1 : idx+win+1])
        
        # pad so every line has the same number of context slots
        left = ['']*(win-len(left)) + left
        right = right + ['']*(win-len(right))
        
        lines.append([left, token, right])
        
    return lines

In [None]:
def print_kwic(kwic, win=None):
    '''A basic print function for a KWIC object
    
    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to print on each side; if None, the length
                of the left context of the first line is used
        
    Prints one line per KWIC entry with the left context right-aligned in a field
    of win*10 characters. Nothing is printed for an empty kwic.
    '''
    
    if not kwic:
        return
    
    if win is None:
        win = len(kwic[0][0])
    
    for line in kwic:
        left_words = line[0]
        # slice from an explicit start index: left_words[-win:] would return the
        # whole list for win == 0 instead of no words at all
        left_ctx = left_words[max(0, len(left_words)-win):]
        right_ctx = line[2][:win]
        
        print("{: >{}}  {}  {}".format(' '.join(left_ctx), 
                                      win*10, 
                                      line[1], 
                                      ' '.join(right_ctx)
                                     )
             )

In [None]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments 
    
    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- one positional argument or a list of them, each of form side-pos,
                  e.g. L1, R3, L4 (default: None); L1/R1 are the tokens directly
                  next to the keyword
    
    Returns:
        the same kwic list, sorted in place. The sorts are applied in reverse order
        of the arguments and each sort is stable, so ['R1','L1'] orders the lines by
        R1 and breaks ties by L1. Arguments that do not look like L<n>/R<n> are skipped.
        The caller's order list is not modified.
    '''
    if order is None:
        return kwic
   
    terms = [order] if isinstance(order, str) else list(order)
    
    # iterate over a reversed view instead of calling order.reverse(), which
    # used to flip the caller's list as a side effect
    for sort_term in reversed(terms):
        parsed = re.fullmatch(r'([LR])([1-9]\d*)', sort_term)
        if parsed is None:
            # unusable sort term: ignore it rather than sorting on a wrong column
            continue
        
        distance = int(parsed.group(2))
        
        if parsed.group(1) == 'L':
            # count from the end of the left context, so this works for any
            # window size (the former 3-pos2 was only right for 4 tokens)
            pos1, pos2 = 0, -distance
        else:
            pos1, pos2 = 2, distance-1
        
        kwic.sort(key=lambda l : l[pos1][pos2])
    
    return kwic

In [None]:
def collocates(tokens, kw, win=[4,4]):
    '''return the collocates in a window around a given keyword
    
    Args:
          tokens -- a list of tokens
          kw     -- keyword string to find and get collocates for
          win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]
                    (a side with a value below 1 contributes no tokens; win is only read, never modified)
    
    Returns:
          one flat list holding, for every instance of kw in tokens, the tokens inside
          the left window followed by the tokens inside the right window; windows are
          simply shorter when the keyword is near the start or end of tokens
    '''
    left_span, right_span = win[0], win[1]
    
    context = []
    for pos, token in enumerate(tokens):
        if token != kw:
            continue
        
        if left_span >= 1:
            # clamp the start at 0: a negative start index would wrap around to
            # the end of the list and drop the left context of early hits
            context.extend(tokens[max(0, pos-left_span):pos])
        
        if right_span >= 1:
            context.extend(tokens[pos+1:pos+right_span+1])
        
    return context

In [None]:
def get_colls(texts,kw, win=[4,4]):
    '''create a collocate frequency list for instances of a kw in a collection of texts
    
    Args:
        texts  -- a mapping whose values are tokenized texts (lists of tokens);
                  the keys are not used
        kw     -- keyword string to find and get collocates for
        win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]
    
    Returns:
        a 3-tuple (collocate_list, kw_freq, total_tokens) where
          collocate_list -- list of tuples (collocate, freq_with_kw, coll_total_freq)
          kw_freq        -- frequency of kw over all texts, or None if kw never occurs
          total_tokens   -- total number of tokens over all texts
    '''
    # imported locally because Counter is not among the imports at the top of the file
    from collections import Counter
    
    word_dist = Counter()
    colls = Counter()
    for tokens in texts.values():
        word_dist.update(tokens)
        colls.update(collocates(tokens, kw, win))
    
    collocate_list = [(str(coll), freq, word_dist[coll]) for coll, freq in colls.items()]
    
    return collocate_list, word_dist.get(kw), sum(word_dist.values())

In [None]:
def normalize_freq(freq, size, base=10000):
    '''express a raw frequency as a rate per `base` tokens, e.g. per 10,000 words
    
    Args:
        freq   --  raw frequency of the item
        size   --  number of tokens in the text/corpus the frequency was counted in
        base   --  number of tokens the rate is expressed per (DEFAULT: 10,000)
    
    Returns:
        freq divided by size, multiplied by base
    
    '''
    return (freq/size) * base

In [None]:
#dist1_size = sum(dist1.values())
#dist2_size = sum(dist2.values())
#dist3_size = sum(dist3.values())
#dist4_size = sum(dist4.values())
#dist5_size = sum(dist5.values())

In [None]:
def compare_items(dist1, size1, dist2, size2, dist3, size3, dist4, size4, dist5, size5, items, scaling=10000, dp=15):
    ''' compare the raw and normalized frequency of a list of items across five frequency lists
    
    Args:
        dist1..dist5 -- frequency lists (anything with a dict-style .get, e.g. Counter)
        size1..size5 -- corpus size used to normalize the matching dist
        items        -- list of items to look up; an item missing from a dist counts as 0
        scaling      -- normalization factor, e.g. per 10,000 words (default: 10000)
        dp           -- number of decimal places the normalized values are rounded to (default: 15)
    
    Returns:
    
        list with one tuple per item of form
            (item, freq_dist1, norm_freq_dist1, freq_dist2, norm_freq_dist2, ..., freq_dist5, norm_freq_dist5)
    '''
    corpora = ((dist1, size1), (dist2, size2), (dist3, size3), (dist4, size4), (dist5, size5))
    
    item_comparison = []
    
    for item in items:
        row = [item]
        for dist, size in corpora:
            raw = dist.get(item, 0)
            row.append(raw)
            row.append(round(raw/size*scaling, dp))
        item_comparison.append(tuple(row))
    
    return item_comparison

In [None]:
def compare_item(dist1, dist2,dist3,dist4,dist5, items, scaling=10000, dp=15):
    ''' compare the normalized frequency of a list of items across five frequency lists
    
    Args:
        dist1..dist5 -- frequency lists (mappings of item -> frequency, e.g. Counter);
                        the size of each corpus is taken as the sum of its values
        items        -- list of items to look up; an item missing from a dist counts as 0
        scaling      -- normalization factor, e.g. per 10,000 words (default: 10000)
        dp           -- number of decimal places the normalized values are rounded to (default: 15)
    
    Returns:
    
        list with one tuple per item of form
            (item, norm_freq_dist1, norm_freq_dist2, norm_freq_dist3, norm_freq_dist4, norm_freq_dist5)
    '''
    dists = (dist1, dist2, dist3, dist4, dist5)
    
    # corpus sizes are computed once, before any item is looked up
    sizes = [sum(dist.values()) for dist in dists]
    
    return [
        (item, *(round(dist.get(item, 0)/size*scaling, dp) for dist, size in zip(dists, sizes)))
        for item in items
    ]

In [None]:
def show_keyitems(df, n=20, c1='red', c2='blue', corpusA='corpus A', corpusB='corpus B'):
    '''display the top/bottom n items from a keyness analysis table as two HTML columns
    
    Args:
        df              -- a data frame with the columns item and keyness, e.g. as returned by
                           calculate_keyness(..., print_table=False)
        n               -- the number of rows to take from the top and from the bottom (default: 20)
        c1/c2           -- text color for the top (c1) and bottom (c2) items
        corpusA/corpusB -- headings shown above the top and bottom items
    
    Returns:
        None; the HTML is rendered with IPython's display(). Each item's font size
        in px is 3*log(|keyness|).
    '''
   
    template = '''
        <div style=' float:left; width: 40%; text-align: center'>
        <h3>{}</h3>
        {}</div>
       <div style='width: 40%; padding-left: 20px; float: left; '>
       <h3 style="text-align: center">{}</h3>
        {}</div>
    '''


    idiv = '''
            <div style="font-size: {}px; color: {}; margin-bottom: 2px; float: left; 
            margin: 10px; padding: 2px; background-color: #f7f7f7; border-radius: 6px">
            {}</div>
            '''
    
    def _font_size(kness):
        # use the magnitude for both columns: the top rows can hold non-positive
        # scores when df has few rows, and math.log raises for values <= 0
        magnitude = abs(kness)
        return 3*math.log(magnitude) if magnitude > 0 else 0
    
    def _render(rows, color):
        # one styled <div> per (item, keyness) row
        return '\n'.join([idiv.format(_font_size(kness), color, item) for item, kness in rows])
    
    top = df[['item', 'keyness']].head(n).values
    bottom = df[['item', 'keyness']].tail(n).values

    top_str = _render(top, c1)
    bottom_str = _render(bottom, c2)
    
    display(HTML(
        template.format(corpusA,top_str, corpusB, bottom_str)
    ))

In [None]:
def comparison_plot(comparison_data, label1='corpus 1', label2='corpus 2', label3='corpus 3', label4='corpus 4', label5='corpus 5'):
    ''' draw a grouped barplot of the normalized frequencies of items in five corpora
    
    Args:
        comparison_data --  list of tuples produced by the compare_items() function;
                            element 0 (the item) and elements 2, 4, 6, 8, 10 (the
                            normalized frequencies) of every tuple are used
        label1..label5  --  legend labels for the five corpora (default: corpus 1 .. corpus 5)
        
    Shows the plot and returns nothing. Relies on the names plt and sn being
    available in the namespace (presumably matplotlib.pyplot and seaborn -- they
    are not imported in this part of the file).
    '''
    plt.figure(figsize=(12,6))
    
    # keep the item plus the five normalized-frequency columns
    wide = pd.DataFrame(comparison_data)[[0,2,4,6,8,10]]
    wide.columns = ['item', label1, label2, label3, label4, label5]
    
    # long format: one row per (item, corpus) pair
    long_form = wide.melt(id_vars=['item'], var_name='corpus', value_name='frequency')
    
    sn.barplot(x='item',y='frequency', hue='corpus',data=long_form, palette = 'OrRd')
    plt.show()

In [None]:
def compare_plot(comparison_data, label1='corpus 1', label2='corpus 2', label3='corpus 3', label4='corpus 4', label5='corpus 5'):
    ''' draw a grouped barplot of the normalized frequencies of items in five corpora
    
    Args:
        comparison_data --  list of tuples whose element 0 is the item and whose
                            elements 1..5 are the frequencies to plot, i.e. the layout
                            produced by the compare_item() function
        label1..label5  --  legend labels for the five corpora (default: corpus 1 .. corpus 5)
        
    Shows the plot and returns nothing. Relies on the names plt and sn being
    available in the namespace (presumably matplotlib.pyplot and seaborn -- they
    are not imported in this part of the file).
    '''
    plt.figure(figsize=(12,6))
    
    # keep the item plus the five frequency columns
    wide = pd.DataFrame(comparison_data)[[0,1,2,3,4,5]]
    wide.columns = ['item', label1, label2, label3, label4, label5]
    
    # long format: one row per (item, corpus) pair
    long_form = wide.melt(id_vars=['item'], var_name='corpus', value_name='frequency')
    
    sn.barplot(x='item',y='frequency', hue='corpus',data=long_form, palette = 'OrRd')
    plt.show()

In [None]:
def plot_collocates(kw, collocate_list, num=20, show_freq=False, title=None, threshold=1):
    ''' Build a graph linking a keyword to its most frequent collocates
    
    Args:
        kw              -- keyword; its upper-cased form is the node every edge starts from
        collocate_list  -- Counter object of collocate frequencies
        num             -- how many of the most frequent collocates to consider [default=20]
        show_freq       -- label each edge with the collocate frequency True/False [default=False]
        title           -- string to use as a title for the plot [default=None]
        threshold       -- minimum frequency a collocate needs to get an edge [default=1]
        
    Returns:
        the graphviz.Graph object (neato engine); the pen width of every edge is
        the base-2 logarithm of the collocate frequency
    '''
    cG = graphviz.Graph(engine='neato')
    cG.attr('graph', overlap='scalexy', size="6,6")
    
    if title:
        cG.attr('graph', label=title, labelloc='t', fontsize='20')
    
    for item, freq in collocate_list.most_common(num):
        if freq < threshold:
            continue
        
        edge_label = str(freq) if show_freq else None
        edge_width = str(math.log(freq,2))
        cG.edge(kw.upper(), item, penwidth=edge_width, label=edge_label)
    
    return cG