In [1]:
import json
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def tokenize(text, lowercase=False, stripchars=''):
    '''split a text into whitespace-separated tokens after optional clean-up

    Args:
        text        --  the string to tokenize
        lowercase   --  if True, lowercase the text before splitting (DEFAULT: False)
        stripchars  --  string of characters to delete from the text (DEFAULT: none)

    Returns:
        list of token strings
    '''
    # A translation table with only a "delete" set removes every listed character.
    cleaned = text.translate(str.maketrans('', '', stripchars))
    return (cleaned.lower() if lowercase else cleaned).split()

In [3]:
def normalize_freq(freq, size, base=10000):
    '''express a raw frequency as a rate per `base` tokens

    Args:
        freq   --  raw count of the item
        size   --  total number of tokens in the text/corpus
        base   --  the "per N tokens" unit (DEFAULT: 10,000)

    Returns:
        freq divided by size, multiplied by base
    '''
    proportion = freq / size
    return proportion * base

In [4]:
def make_kwic(kw, texts, win=4):
    '''Build keyword-in-context (KWIC) lines for a keyword across several texts

    Args:
        kw    -- token to look for (compared against lowercased tokens)
        texts -- iterable of text strings making up the corpus
        win   -- number of context tokens to keep on each side (DEFAULT: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]];
        contexts shorter than win are padded with empty strings (left context
        padded at the front, right context at the end)

    NOTE(review): reads a module-level name `chars_to_strip` that is not defined
    in the visible part of the notebook -- confirm it exists before this is called.
    '''
    tokenized = [tokenize(text, lowercase=True, stripchars=chars_to_strip)
                 for text in texts]

    lines = []
    for doc in tokenized:
        for pos, word in enumerate(doc):
            if word != kw:
                continue

            left = doc[max(0, pos - win):pos]
            right = doc[pos + 1:pos + win + 1]

            # Pad so every line has exactly win slots on each side.
            left = [''] * (win - len(left)) + left
            right = right + [''] * (win - len(right))

            lines.append([left, word, right])

    return lines

In [5]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments

    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- one positional term or a list of them, of form side-pos,
                  e.g. L1, R3, L4 (default: None). L1 is the token immediately
                  left of the keyword, R1 the token immediately right of it.

    Returns:
        the same kwic list, sorted in place. Terms are applied in reverse with a
        stable sort, so the first term is the primary key, i.e. ['R1','L1']
        sorts by R1 and breaks ties by L1.

    Terms that are malformed or point outside the context window are skipped.
    The caller's `order` list is left untouched.
    '''
    if order is None or not kwic:
        return kwic

    # Accept a bare string; copy anything else so the caller's list is never reordered.
    terms = [order] if isinstance(order, str) else list(order)

    # Window sizes are taken from the first line.
    left_width = len(kwic[0][0])
    right_width = len(kwic[0][2])

    for sort_term in reversed(terms):
        parsed = re.fullmatch(r'([LR])(\d+)', sort_term)
        if parsed is None:
            continue

        side, distance = parsed.group(1), int(parsed.group(2))
        width = left_width if side == 'L' else right_width
        if not 1 <= distance <= width:
            continue

        if side == 'L':
            # Left context is stored in reading order, so L1 is the last element.
            column, offset = 0, left_width - distance
        else:
            column, offset = 2, distance - 1

        kwic.sort(key=lambda line, c=column, o=offset: line[c][o])

    return kwic

In [6]:
def get_ngram_tokens(tokens, n=1):
    '''turn a token list into a list of overlapping n-grams

    Args:
        tokens -- a list of token strings
        n      -- how many consecutive tokens make up one n-gram (DEFAULT: 1)

    Returns:
        list of n-gram strings, each made of n tokens joined by a single space;
        the input list itself is returned unchanged when n is below 2 or larger
        than the number of tokens
    '''
    total = len(tokens)

    if n < 2 or n > total:
        return tokens

    return [" ".join(tokens[start:start + n]) for start in range(total - n + 1)]

In [7]:
def print_kwic(kwic, win=None):
    '''A basic print function for a KWIC object

    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to show on each side; if None, use the
                length of the first line's left context

    Prints one numbered line per entry: right-aligned left context, keyword,
    right context. The left column is one character wider than the longest left
    context actually displayed. Nothing is printed for an empty list.

    Backslash escape sequences embedded in the context text are interpreted
    before printing; non-ASCII characters are printed unchanged.
    '''

    if not kwic:
        return

    if win is None:
        win = len(kwic[0][0])

    def _render(words):
        # Interpret literal backslash escapes without corrupting non-ASCII text:
        # Latin-1 bytes survive the unicode-escape decode as-is, and anything
        # outside Latin-1 is written as an escape that the decode restores.
        text = ' '.join(words)
        try:
            return text.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        except UnicodeDecodeError:
            # e.g. a token ending in a lone backslash: show the raw text instead of failing
            return text

    rows = []
    for line in kwic:
        # Guard win == 0: a [-0:] slice would return the whole left context.
        left_words = line[0][-win:] if win > 0 else []
        right_words = line[2][:win] if win > 0 else []
        rows.append((_render(left_words), line[1], _render(right_words)))

    # Width is based on what is displayed, so a small win does not over-pad.
    max_left = max(len(left) for left, _, _ in rows) + 1

    for lnum, (left, kw, right) in enumerate(rows, 1):
        print("{:>2}.{:>{}.{}}  {}  {}".format(lnum, left, max_left, max_left, kw, right))

In [8]:
def compare_items(dist1, dist2, items, scaling=10000, dp=2):
    ''' given two Counter objects with common keys compare the frequency and relative frequency of list of items

    Args:
        dist1    -- Counter frequency list object
        dist2    -- Counter frequency list object
        items    -- list of string items to look up in dist1 and dist2
                    (an item missing from a distribution counts as frequency 0)
        scaling  -- normalization factor, e.g. per 10,000 words (default: 10000)
        dp       -- number of decimal places the relative frequencies are rounded to (default: 2)

    Returns:

        list of tuples of form
            (item, item_freq_dist1, norm_item_freq_dist1, item_freq_dist2, norm_item_freq_dist2)

    An empty distribution yields a relative frequency of 0.0 for every item
    instead of raising ZeroDivisionError.
    '''
    dist1_size = sum(dist1.values())
    dist2_size = sum(dist2.values())

    def _relative(freq, size):
        # Nothing can occur in an empty distribution, so report a rate of zero.
        if not size:
            return 0.0
        return round(freq / size * scaling, dp)

    item_comparison = []

    for item in items:
        d1_freq = dist1.get(item, 0)
        d2_freq = dist2.get(item, 0)

        item_comparison.append((item,
                                d1_freq, _relative(d1_freq, dist1_size),
                                d2_freq, _relative(d2_freq, dist2_size)))

    return item_comparison
        

In [9]:
def comparison_plot(comparison_data, label1='corpus 1', label2='corpus 2'):
    ''' draw grouped bars of the relative frequencies of items in two corpora

    Args:
        comparison_data --  list of 5-tuples as produced by the compare_items() function
        label1          --  legend label for first corpus (default: corpus 1)
        label2          --  legend label for second corpus (default: corpus 2)

    Shows the figure and returns nothing.

    NOTE(review): calls `sn.barplot`; `sn` is not imported in the visible part
    of the notebook -- presumably seaborn, confirm it is imported before use.
    '''
    plt.figure(figsize=(12,6))

    # Positions 0, 2 and 4 of each tuple: the item and its two normalized frequencies.
    wide = pd.DataFrame(comparison_data)[[0, 2, 4]]
    wide.columns = ['item', label1, label2]

    # One row per (item, corpus) pair so the corpus can drive the bar colour.
    tidy = wide.melt(id_vars=['item'])
    tidy.columns = ['item', 'corpus', 'frequency']

    sn.barplot(data=tidy, x='item', y='frequency', hue='corpus')
    plt.show()

In [10]:
def make_localized_rhyme(story_title, story_text, spaces):
    '''Count repeats and rhymes of every token within a window of its neighbours

    Args:
        story_title -- title copied into every output row
        story_text  -- the text to analyse; it is lowercased and stripped of
                       punctuation before tokenizing
        spaces      -- number of neighbouring tokens considered on each side

    Returns:
        list with one row per token, of form
            [title, token, chunk, word_freq, rhyme_freq, rhyme_unique_freq]
        where chunk is the list of neighbouring tokens (the token itself
        excluded), word_freq is how often the token recurs in chunk, rhyme_freq
        is how many chunk tokens rhyme with it, and rhyme_unique_freq is the
        number of distinct rhyming tokens.

    NOTE(review): relies on the `pronouncing` package being imported elsewhere;
    it is not imported in the visible part of the notebook.
    '''
    chars_to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*'

    tokens = tokenize(story_text.lower(), stripchars=chars_to_strip)
    total = len(tokens)

    # Rhyme lookups depend only on the token, so do each one once and keep the
    # result as a set for fast membership tests.
    rhyme_sets = {}
    localized = []

    for index, token in enumerate(tokens):
        # Clamp the window to the bounds of the text.
        start = max(0, index - spaces)
        stop = min(total, index + spaces + 1)
        chunk = tokens[start:index] + tokens[index + 1:stop]

        if token not in rhyme_sets:
            rhyme_sets[token] = set(pronouncing.rhymes(token))
        rhyming_words = rhyme_sets[token]

        word_freq = chunk.count(token)
        rhyme = [word for word in chunk if word in rhyming_words]
        rhyme_freq = len(rhyme)
        rhyme_unique_freq = len(set(rhyme))

        localized.append([story_title, token, chunk, word_freq, rhyme_freq, rhyme_unique_freq])

    return localized

In [11]:
def make_localized_dataframe(localized):
    '''Convert rows of [title, token, chunk, word_freq, rhyme_freq, rhyme_unique_freq] into a DataFrame

    Args:
        localized -- list of six-element rows; element 2 (the chunk) is a list of strings

    Returns:
        DataFrame with columns Title, Token, Chunk, Word Frequency,
        Rhyme Frequency and Unique Rhyme Frequency; each chunk is flattened
        into a single ", "-separated string
    '''
    def column(position):
        # Pull one field out of every row.
        return [row[position] for row in localized]

    return pd.DataFrame({
        "Title": column(0),
        "Token": column(1),
        "Chunk": [", ".join(chunk) for chunk in column(2)],
        "Word Frequency": column(3),
        "Rhyme Frequency": column(4),
        "Unique Rhyme Frequency": column(5),
    })

### Bootstrap Functions:

In [15]:
# Bootstrap estimate of the difference between two sample means

def bootstrap_mean(data1, data2, n_iterations=10000, seed=None):
    '''Bootstrap the difference in means (data1 - data2) with a 95% percentile interval

    Args:
        data1         -- 1-D sequence of numbers for the first group
        data2         -- 1-D sequence of numbers for the second group
        n_iterations  -- number of bootstrap resamples (DEFAULT: 10,000)
        seed          -- optional seed for a dedicated random generator so the
                         result is reproducible; when None the global
                         numpy.random state is used, so an earlier
                         np.random.seed() call still applies (DEFAULT: None)

    Returns:
        (mean_difference, (lower_bound, upper_bound)), all rounded to 4 decimal
        places; the bounds are the 2.5th and 97.5th percentiles of the
        bootstrap differences

    Raises:
        ValueError if n_iterations is smaller than 1
    '''
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Both the legacy module and a Generator expose choice(a, size=, replace=).
    rng = np.random if seed is None else np.random.default_rng(seed)

    bootstrap_diff = []

    for _ in range(n_iterations):
        # Resample each group with replacement, keeping its original size
        sample1 = rng.choice(data1, size=len(data1), replace=True)
        sample2 = rng.choice(data2, size=len(data2), replace=True)

        bootstrap_diff.append(np.mean(sample1) - np.mean(sample2))

    # 95% percentile confidence interval
    lower_bound = np.percentile(bootstrap_diff, 2.5)
    upper_bound = np.percentile(bootstrap_diff, 97.5)

    mean_diff_result = round(np.mean(bootstrap_diff), 4)
    return mean_diff_result, (round(lower_bound, 4), round(upper_bound, 4))

In [14]:
# Function to plot the bootstrap distribution of the difference in means

import matplotlib.pyplot as plt
from scipy.stats import norm

def plot_bootstrap_diff(data1, data2, n_iterations=10000, seed=None):
    '''Plot the bootstrap distribution of the difference in means (data1 - data2)

    Args:
        data1         -- 1-D sequence of numbers for the first group
        data2         -- 1-D sequence of numbers for the second group
        n_iterations  -- number of bootstrap resamples (DEFAULT: 10,000)
        seed          -- optional seed for a dedicated random generator so the
                         figure is reproducible; when None the global
                         numpy.random state is used (DEFAULT: None)

    Shows a density histogram of the bootstrap differences with a fitted normal
    curve, the mean, and the 2.5th / 97.5th percentile bounds. Returns nothing.

    Raises:
        ValueError if n_iterations is smaller than 1
    '''
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Both the legacy module and a Generator expose choice(a, size=, replace=).
    rng = np.random if seed is None else np.random.default_rng(seed)

    bootstrap_diff = []

    for _ in range(n_iterations):
        # Resample each group with replacement, keeping its original size
        sample1 = rng.choice(data1, size=len(data1), replace=True)
        sample2 = rng.choice(data2, size=len(data2), replace=True)

        bootstrap_diff.append(np.mean(sample1) - np.mean(sample2))

    # Summary statistics of the bootstrap distribution
    mean = np.mean(bootstrap_diff)
    std = np.std(bootstrap_diff)
    lower_bound = np.percentile(bootstrap_diff, 2.5)
    upper_bound = np.percentile(bootstrap_diff, 97.5)

    # Create the histogram
    plt.figure(figsize=(10, 6))
    _, bins, _ = plt.hist(bootstrap_diff, bins=30, density=True, color='skyblue', edgecolor='black', alpha=0.7, label='Bootstrap Differences')

    # Overlay the normal curve; a zero spread has no meaningful density to draw
    if std > 0:
        x = np.linspace(bins[0], bins[-1], 1000)
        plt.plot(x, norm.pdf(x, mean, std), color='red', linewidth=2, label='Normal Distribution')

    # Add mean line
    plt.axvline(mean, color='darkgreen', linestyle='dashed', linewidth=1.5, label='Mean Difference')

    # Add confidence interval lines
    plt.axvline(lower_bound, color='purple', linestyle='dashed', linewidth=1.5, label='Lower CI (2.5%)')
    plt.axvline(upper_bound, color='orange', linestyle='dashed', linewidth=1.5, label='Upper CI (97.5%)')

    # Add labels, title, and legend
    plt.title('Bootstrap Distribution of Mean Differences', fontsize=14)
    plt.xlabel('Mean Difference', fontsize=12)
    plt.ylabel('Density', fontsize=12)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()
