## String processing with Python

Using a text corpus found on the cds-language GitHub repo or a corpus of your own found on a site such as Kaggle, write a Python script which calculates collocates for a specific keyword.

The script should take a directory of text files, a keyword, and a window size (number of words) as input parameters, and write its results to an output file called out/{filename}.csv
These parameters can be defined in the script itself
Find out how often each word collocates with the target across the corpus
Use this to calculate mutual information between the target word and all collocates across the corpus
Save the result as a single file consisting of four columns: keyword, collocate, raw_frequency, MI

__Importing libraries__

In [1]:
import csv
import os
import re
import string
import sys
from collections import Counter
from pathlib import Path

sys.path.append(os.path.join(".."))  # enabling communication with home directory

import numpy as np
import pandas as pd

__Defining tokenizer function__

In [2]:
# A token is a maximal run of ASCII letters (either case) and apostrophes.
# Compiled once at definition time instead of on every call.
_TOKEN_PATTERN = re.compile(r"[a-zA-Z']+")


def tokenize(input_string):
    """Split a string into word tokens.

    Every character that is not an ASCII letter or an apostrophe acts as a
    separator. Matching the tokens directly (rather than splitting on the
    separators) means no empty-string tokens are produced when the text
    starts or ends with a separator, or when the input is empty.

    Args:
        input_string: The text to tokenize.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    return _TOKEN_PATTERN.findall(input_string)

__Defining collocate function__

In [3]:
def collocates(path, keyword, window_size):
    """Compute collocates of a keyword, with mutual information, over a corpus.

    Every ``*.txt`` file directly inside ``path`` is read as UTF-8,
    lower-cased and tokenized. For each occurrence of the keyword, the
    tokens up to ``window_size`` positions before and after it are counted
    as collocates (the keyword occurrence at the centre of the window is
    excluded).

    For each collocate, with u = number of keyword occurrences, v = number
    of occurrences of the collocate in the whole corpus and N = total number
    of tokens in the corpus:

        O11 = times the collocate appears inside a keyword window
        R1  = O11 + O12 = u
        C1  = O11 + O21 = v
        E11 = R1 * C1 / N
        MI  = ln(O11 / E11)

    Args:
        path: Directory containing the ``.txt`` files of the corpus.
        keyword: Target word. It is matched case-insensitively because the
            corpus text is lower-cased before tokenizing.
        window_size: Number of tokens to include on each side of the keyword.

    Returns:
        A pandas DataFrame with the columns ``keyword``, ``collocate``,
        ``raw_frequency`` and ``MI``, sorted by ``MI`` in descending order.
        The frame is empty if the keyword never occurs (or no files match).
    """
    columns = ["keyword", "collocate", "raw_frequency", "MI"]
    # The corpus is lower-cased below, so the target must be as well.
    target = keyword.lower()

    corpus_counts = Counter()     # token -> occurrences in the whole corpus (v)
    collocate_counts = Counter()  # token -> occurrences inside keyword windows (O11)
    keyword_count = 0             # occurrences of the keyword in the corpus (u)

    for filename in Path(path).glob("*.txt"):
        with open(filename, "r", encoding="utf-8") as file:
            text = file.read()

        # Drop empty strings, which a split-based tokenizer may emit at the
        # boundaries of the text; they are not words.
        tokens = [token for token in tokenize(text.lower()) if token]
        corpus_counts.update(tokens)

        for index, token in enumerate(tokens):
            if token != target:
                continue
            keyword_count += 1
            window_start = max(0, index - window_size)
            window_end = index + window_size
            # Count both sides of the window, skipping the keyword itself.
            collocate_counts.update(tokens[window_start:index])
            collocate_counts.update(tokens[index + 1:window_end + 1])

    N = sum(corpus_counts.values())

    rows = []
    for collocate, O11 in collocate_counts.items():
        R1 = keyword_count             # O11 + O12
        C1 = corpus_counts[collocate]  # O11 + O21
        E11 = R1 * C1 / N
        rows.append({"keyword": keyword,
                     "collocate": collocate,
                     "raw_frequency": O11,
                     "MI": np.log(O11 / E11)})

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and was quadratic when called once per row.
    data = pd.DataFrame(rows, columns=columns)
    return data.sort_values("MI", ascending=False)

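O11 = u & v = how often the collocate occurs within the keyword's context windows (the KWIC lines) <br>
O12 = u & !v = total number of u’s (keyword occurrences) - O11 <br>
O21 = !u & v = total number of v’s (collocate occurrences in the corpus) - O11 <br>
R1 = O11 + O12 <br>
C1 = O11 + O21 <br>
E11 = R1 * C1 / N, where N is the total number of tokens in the corpus <br>
MI = log(O11 / E11)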

In [4]:
# Location of the corpus, relative to this notebook
path = os.path.join("..", "data", "100_english_novels", "corpus")

# Collocates of "sunshine" using a window of 2 tokens on each side
collocates_df = collocates(path, keyword="sunshine", window_size=2)

# Write the result to disk without the DataFrame index
collocates_df.to_csv("Collocates.csv", index=False)

In [5]:
# Show the resulting DataFrame as the cell output
collocates_df

Unnamed: 0,keyword,collocate,raw_frequency,MI
92,sunshine,lapt,1,9.926476
726,sunshine,grateless,1,9.926476
176,sunshine,slurs,1,9.233329
13,sunshine,wheriver,1,9.233329
718,sunshine,mayonaise,1,9.233329
...,...,...,...,...
275,sunshine,were,2,-0.163138
110,sunshine,not,4,-0.184615
633,sunshine,up,1,-0.526721
410,sunshine,you,4,-0.572201
