In [None]:
import pandas as pd
import csv
import os
import io
import zipfile
import random

## Read data

In [None]:
# Corpus location. Exactly one `_dir` assignment is active; the commented
# lines are the alternative COHA layouts.
#_dir = "/resources/corpora/COHA/text/"
_dir = "/resources/corpora/COHA/CCOHA/tagged/"
#_dir = "/resources/corpora/COHA/ALL/"

# Directory entries in lexicographic order.
files = os.listdir(_dir)
files.sort()
files

In [None]:
#z = zipfile.ZipFile(os.path.join(_dir, 'text_1810s_kso.zip'))
#zinfos = z.infolist()

#zip_file    = zipfile.ZipFile(os.path.join(_dir, 'text_1810s_kso.zip'))
#items_file  = zip_file.open(zip_file, 'r')
#items_file  = io.TextIOWrapper(io.BytesIO(items_file.read()))

In [None]:
# Load every file in `files` into one long table with the columns
# token / lemma / pos / decade.
#
# The last path component of `_dir` selects the reader:
#   "ALL"    -> one tab-separated text file per entry
#   "tagged" -> one zip archive per entry, each member a tab-separated file
corpus_layout = os.path.basename(os.path.normpath(_dir))

df_list = []
for f in files:
    # Second "_"-separated field, extension stripped, last character dropped
    # (presumably a trailing "s" as in "1810s" -> "1810"; verify against the
    # actual file names).
    year = f.split("_")[1].split(".")[0][:-1]

    if corpus_layout == "ALL":
        df_decade = pd.read_csv(
            os.path.join(_dir, f),
            sep='\t',
            quotechar='"',
            quoting=csv.QUOTE_NONE,
            header=None,
            usecols=[2, 3, 4],
            encoding="latin",
        )
        # This branch previously read the file and then dropped it, leaving
        # df_list empty. NOTE(review): columns 2-4 are assumed to be
        # token / lemma / pos in this layout - confirm.
        df_decade.columns = ['token', 'lemma', 'pos']
        df_decade["decade"] = year
        df_list.append(df_decade)
    elif corpus_layout == "tagged":
        # Context managers close the archive and each member handle.
        with zipfile.ZipFile(os.path.join(_dir, f)) as z:
            for zinfo in z.infolist():
                with z.open(zinfo.filename) as member:
                    df_decade = pd.read_csv(
                        member,
                        sep='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_NONE,
                        header=None,
                        usecols=[0, 1, 2],
                        encoding="utf-8",
                    )
                df_decade.columns = ['token', 'lemma', 'pos']
                df_decade["decade"] = year
                df_list.append(df_decade)
    else:
        # Fail with a clear message instead of an empty concat further down.
        raise ValueError(
            "Unsupported corpus layout %r (expected 'ALL' or 'tagged')" % corpus_layout
        )

# ignore_index gives the combined frame one unique 0..n-1 row index.
df = pd.concat(df_list, ignore_index=True)

## Cleanup

In [None]:
# Drop every row whose POS tag contains the literal "<sub>" marker
# (missing tags count as "no marker"), then renumber the rows 0..n-1.
has_sub_marker = df['pos'].str.contains('<sub>', na=False)
df = df.loc[~has_sub_marker].reset_index(drop=True)
df

## Get number of tokens per decade

In [None]:
# Number of rows (tokens) per decade, as a one-column frame indexed by decade.
decade_token_counts = df["decade"].value_counts().to_frame(name='tokencount')
decade_token_counts

In [None]:
# Persist the per-decade counts; the index (the decade) is written as the
# first column.
decade_token_counts.to_csv(path_or_buf='coha_year_token_count.csv', index=True)

## Get adjacent N-N

Keep only sequences of exactly two adjacent nn1 tokens (nn1-nn1) and discard longer runs such as nn1-nn1-nn1. Also discard pairs whose following token is tagged vhd: the older data contains many "'d" forms, and the verb preceding them is often mis-tagged as a noun.

In [None]:
# Find the first token of every noun-noun pair: an nn1 directly followed by
# an nn1, where the pair is not part of a longer nn1 run and is not followed
# by a token whose tag contains "vhd".
pos_tags = df["pos"]

is_nn1 = pos_tags.str.match("^nn1$", na=False)
next_is_nn1 = pos_tags.shift(-1).str.match("^nn1$", na=False)

# Without this check the last two tokens of an nn1-nn1-nn1 run were still
# selected: only the token *after* the pair was inspected.
prev_is_nn1 = pos_tags.shift(1).str.match("^nn1$", na=False)

# na=True: a missing tag two positions ahead (e.g. at the very end of the
# table) disqualifies the pair, as the previous astype("bool") on NaN did.
after_is_nn1 = pos_tags.shift(-2).str.match("^nn1$", na=True)
after_is_vhd = pos_tags.shift(-2).str.contains("vhd", na=True)

pair_start = is_nn1 & next_is_nn1 & ~prev_is_nn1 & ~(after_is_nn1 | after_is_vhd)

index = pd.Index(df[pair_start].index)   # label of the first noun of each pair
index_next = index + 1                   # label of the second noun
index_full = index.union(index_next)
nn = df.loc[index_full]
nn

In [None]:
# Show the last ten rows of the noun-noun table.
nn.iloc[-10:]

In [None]:
# Collect, for every noun-noun pair, the rows from `window` tokens before the
# first noun up to `window` tokens after the second noun
# (offsets -window .. window+1 relative to the first noun).
window = 10

# Keep only pairs whose complete window lies inside df; otherwise
# df.loc[...] raises KeyError for the labels that fall off either end.
# Assumes df has a 0..len(df)-1 row index (the cleanup step resets it).
last_label = len(df) - 1
anchors = index[(index - window >= 0) & (index + (window + 1) <= last_label)]

window_end = anchors + (window + 1)
window_begin = anchors - window

index_window = anchors
for offset in range(-window, window + 2):
    index_window = index_window.union(anchors + offset)

# .copy() makes df_windowed an independent frame, so columns added later do
# not touch df.
df_windowed = df.loc[index_window].copy()
df_windowed.head(60)

In [None]:
# Mark every row as inside a window ("i"), then mark the first row of each
# window as its beginning ("b").
df_windowed["span"] = "i"
# One .loc call with row labels and column name writes into df_windowed
# itself. The chained form df_windowed["span"][...] = ... assigns to an
# intermediate object and has no effect under pandas copy-on-write.
df_windowed.loc[window_begin, "span"] = "b"
df_windowed.head(60)

In [None]:
# Build {decade: [window, ...]}. Each window is a list of "lemma_pos" strings
# for the 2*window+2 consecutive rows starting at a row marked "b".
# The decade key is taken from the last row of the window.
windows = {}
span_length = (window * 2) + 2
for start, row in df_windowed.iterrows():
    if row["span"] != "b":
        continue
    window_rows = [df_windowed.loc[label] for label in range(start, start + span_length)]
    window_content = [str(r.lemma) + "_" + str(r.pos) for r in window_rows]
    windows.setdefault(window_rows[-1].decade, []).append(window_content)

In [None]:
# Remove randomly a left or right context token from every window.
# This replicates 5-grams.
#
# Seed once, before the loops. Re-seeding for every window resets the
# generator each time, so randint returns the same value on every call and
# the same side is always removed.
# NOTE: `windows` is rebound at the end, so re-running this cell trims the
# already-trimmed windows again.
random.seed(1991)
windows_fivegrams = {}
for d in windows:
    for w in windows[d]:
        n = random.randint(0, 1)
        # 0 -> drop the leftmost token, 1 -> drop the rightmost token
        trimmed = w[1:] if n == 0 else w[:-1]
        windows_fivegrams.setdefault(d, []).append(trimmed)
windows = windows_fivegrams

In [None]:
# One record per window: the space-joined tokens, the decade as "year", and
# a count of 1 for both match_count and volume_count.
tmp_list = [
    {'ngram': " ".join(w), 'year': d, 'match_count': 1, 'volume_count': 1}
    for d, decade_windows in windows.items()
    for w in decade_windows
]
df_ngrams = pd.DataFrame(tmp_list)
df_ngrams.head(60)

In [None]:
# Collapse identical (ngram, year) rows and sum their match counts.
# The result is assigned back: the groupby was previously computed and
# thrown away, so df_ngrams (and the exported file) stayed unaggregated.
# as_index=False keeps ngram/year as ordinary columns, so the column order
# ngram, year, match_count, volume_count is unchanged. Rows come out sorted
# by (ngram, year).
# NOTE(review): volume_count keeps its per-row placeholder value because no
# document identifier is available in df_ngrams to count volumes - confirm
# that this is acceptable for consumers of the file.
df_ngrams = (
    df_ngrams
    .groupby(['ngram', 'year'], as_index=False)
    .agg(match_count=('match_count', 'sum'), volume_count=('volume_count', 'first'))
)
df_ngrams.head(60)

In [None]:
# Write the ngram table as a header-less, tab-separated file.
# The output directory is created first; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('coha_compounds', exist_ok=True)
#df_ngrams.to_csv('coha_compounds/coha_fivegrams.tsv', index = False, header=False, sep="\t")
df_ngrams.to_csv('coha_compounds/coha_twelvegrams.tsv', index = False, header=False, sep="\t")