This notebook calculates summary statistics on all corpora in the `adverbly_adjectives` directory.

It assumes POS information is in the `pos` directory, and that hapax legomena lists are in the `hapax_legomena` directory.

COCA is special since it's so massive. COCA POS files are handled specially and expected to be in a directory named `COCA/POS`.

Summary statistics calculated include the number of words, as well as the number and relative frequencies of adverbly adjective pairs and hapax legomena.

In [1]:
import os, re
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
from zipfile import ZipFile

In [2]:
coca_genre_count = defaultdict(int)

# numbers from download version counts in coca_wordcount.xlsx

coca_genre_count['Academic'] = 103274418
coca_genre_count['News'] = 99507332
coca_genre_count['Fiction'] = 103422167
coca_genre_count['Magazine'] = 108315219
coca_genre_count['Spoken'] = 107967025

for genre in coca_genre_count.keys():
    print(genre, coca_genre_count[genre])
    
    
# LEGACY CODE WHERE TOKENS WERE CONTINUED - DISREGARD!

# # required to translate the shortened versions of genre names to something that looks better in a summary sheet
# genre_dict = {
#     'spok' : 'Spoken',
#     'mag' : 'Magazine',
#     'acad' : 'Academic',
#     'news' : 'News',
#     'fic' : 'Fiction'
# }

# pos_path = 'COCA/POS/'
# zipfiles = [pos_path+f for f in os.listdir(pos_path) if f.endswith('.zip')]

# def deleteDir(dir):
#     try:
#         for f in os.listdir(dir):
#             os.remove(dir+f)
#         os.rmdir(dir)
#     except:
#         pass

# tmp_dir = '__tmp/'

# deleteDir(tmp_dir)

# # extract each zip file to a temporary directory and count the number of lines (representing num of words for that genre)
# for f in zipfiles:
#     os.makedirs(tmp_dir)
    
#     genre = genre_dict[f.split('/')[-1].split('.zip')[0]]
#     with ZipFile(f, 'r') as zip_f:
#         zip_f.extractall(tmp_dir)
    
#     lines = 0
    
#     #  iterate over all files/texts in current genre
#     for f in os.listdir(tmp_dir):
#         with open(tmp_dir+f, 'r', encoding='latin-1') as fo:
            
#             # count lines in current file
#             for l in fo:
#                 if not l.startswith('##'):
#                     lines += 1
    
#     coca_genre_count[genre] = lines
#     deleteDir(tmp_dir)

Spoken 107967025
News 99507332
Magazine 108315219
Fiction 103422167
Academic 103274418


In [3]:
total_word_counts = [['corpus', 'num_words']]
genre_word_counts = [['corpus', 'genre', 'num_words']]

path = 'pos/'
suffix = '_pos.csv'

# appends previously calculated COCA counts to the new array
coca_all = 0
for g in coca_genre_count:
    genre_word_counts.append(['COCA', g, coca_genre_count[g]])
    coca_all += coca_genre_count[g]
total_word_counts.append(['COCA', coca_all])

# iterates over all POS files to get word counts
files = [ path+f for f in os.listdir(path) if f.endswith(suffix)]

for f in files:
    corpusname = f.split(path)[1].split(suffix)[0]
    print(corpusname)
    df = pd.read_csv(f)
    
    # throw out any null values (shouldn't be any)
    df.dropna(inplace=True)
    
    # throw out anything that doesn't have at least one letter in it
    df = df[~ df['token'].str.match('[^a-zA-Z]+$')]
    
    # genre counts where possible
    if 'genre' in df.columns:
        grouped = df.groupby('genre')
        for genre,group in grouped:
            genre_word_counts.append([corpusname, str(genre), len(group.index)])
            
    if 'decade' in df.columns:
        grouped = df.groupby('decade')
        for genre,group in grouped:
            genre_word_counts.append([corpusname, str(genre), len(group.index)])
    
    # number of words is just the length of this pruned dataframe, because every row has only one word
    total_word_counts.append([corpusname, len(df.index)])

NYT


In [4]:
total_word_counts

[['corpus', 'num_words'], ['COCA', 522486161], ['NYT', 16689298]]

In [5]:
genre_word_counts

[['corpus', 'genre', 'num_words'],
 ['COCA', 'Spoken', 107967025],
 ['COCA', 'News', 99507332],
 ['COCA', 'Magazine', 108315219],
 ['COCA', 'Fiction', 103422167],
 ['COCA', 'Academic', 103274418],
 ['NYT', '1915', 696],
 ['NYT', '1916', 852],
 ['NYT', '1921', 1060],
 ['NYT', '1923', 331],
 ['NYT', '1924', 41661],
 ['NYT', '1925', 173527],
 ['NYT', '1926', 103028],
 ['NYT', '1927', 88640],
 ['NYT', '1928', 144421],
 ['NYT', '1929', 167730],
 ['NYT', '1930', 150095],
 ['NYT', '1931', 177937],
 ['NYT', '1932', 205237],
 ['NYT', '1933', 196341],
 ['NYT', '1934', 171464],
 ['NYT', '1935', 155712],
 ['NYT', '1936', 175797],
 ['NYT', '1937', 53731],
 ['NYT', '1938', 50193],
 ['NYT', '1939', 131466],
 ['NYT', '1940', 144511],
 ['NYT', '1941', 115224],
 ['NYT', '1942', 103499],
 ['NYT', '1943', 86984],
 ['NYT', '1944', 94849],
 ['NYT', '1945', 59508],
 ['NYT', '1946', 71301],
 ['NYT', '1947', 104409],
 ['NYT', '1948', 110873],
 ['NYT', '1949', 137206],
 ['NYT', '1950', 104079],
 ['NYT', '1951',

In [6]:
total_adv_adj_counts = [['corpus', 'adv_adj_counts']]
genre_adv_adj_counts = [['corpus', 'genre', 'adv_adj_counts']]

# number of adverbly adjectives is just the length of the file containing the list for each corpus / genre, as applicable

basepath = 'adverbly_adjectives'
files = []
for corpus in os.listdir(basepath):
    for f_or_d in os.listdir(basepath+'/'+corpus):
        
        # handle genres
        if os.path.isdir(basepath+'/'+corpus+'/'+f_or_d):
            genres_list = [basepath+'/'+corpus+'/'+f_or_d+'/'+f for f in os.listdir(basepath+'/'+corpus+'/'+f_or_d)]
            corpus_count = 0
            for f in genres_list:
                genre = f.split(f_or_d+'/')[1].split('_pairs.csv')[0]
                df = pd.read_csv(f)
                genre_count = len(df.index)
                genre_adv_adj_counts.append([corpus, str(genre), genre_count])
                
        # handle master lists for each corpus
        else:
            assert os.path.isfile(basepath+'/'+corpus+'/'+f_or_d)
            file = basepath+'/'+corpus+'/'+f_or_d
            df = pd.read_csv(file)
            count = len(df.index)
            total_adv_adj_counts.append([corpus, count])

In [7]:
total_adv_adj_counts

[['corpus', 'adv_adj_counts'],
 ['Cornell', 4287],
 ['Movies', 15761],
 ['sfu_review', 596],
 ['COCA', 192751],
 ['NYT', 45771],
 ['SOCC', 31912],
 ['CORE', 42544]]

In [8]:
genre_adv_adj_counts

[['corpus', 'genre', 'adv_adj_counts'],
 ['sfu_review', 'HOTELS', 89],
 ['sfu_review', 'MOVIES', 95],
 ['sfu_review', 'BOOKS', 53],
 ['sfu_review', 'COOKWARE', 34],
 ['sfu_review', 'PHONES', 31],
 ['sfu_review', 'CARS', 139],
 ['sfu_review', 'COMPUTERS', 82],
 ['sfu_review', 'MUSIC', 112],
 ['COCA', 'Magazine', 75984],
 ['COCA', 'Academic', 63999],
 ['COCA', 'Fiction', 54574],
 ['COCA', 'News', 54082],
 ['COCA', 'Spoken', 34729],
 ['NYT', '1927', 230],
 ['NYT', '1928', 388],
 ['NYT', '1997', 1119],
 ['NYT', '2010', 1478],
 ['NYT', '1965', 444],
 ['NYT', '1942', 386],
 ['NYT', '1976', 346],
 ['NYT', '1989', 856],
 ['NYT', '1993', 1221],
 ['NYT', '2012', 1544],
 ['NYT', '1933', 540],
 ['NYT', '1957', 417],
 ['NYT', '1931', 368],
 ['NYT', '1987', 943],
 ['NYT', '1979', 650],
 ['NYT', '2001', 1136],
 ['NYT', '1958', 538],
 ['NYT', '1983', 765],
 ['NYT', '2013', 1529],
 ['NYT', '1975', 262],
 ['NYT', '1951', 394],
 ['NYT', '1956', 353],
 ['NYT', '2017', 1346],
 ['NYT', '1960', 406],
 ['NYT'

In [9]:
total_hapax_legomena_counts = [['corpus', 'hapax_legomena_counts']]
genre_hapax_legomena_counts = [['corpus', 'genre', 'hapax_legomena_counts']]

# number of adverbly adjectives is just the length of the file containing the list for each corpus / genre, as applicable

basepath = 'hapax_legomena'
files = []
for corpus in os.listdir(basepath):
    for f_or_d in os.listdir(basepath+'/'+corpus):
        if os.path.isdir(basepath+'/'+corpus+'/'+f_or_d):
            genres_list = [basepath+'/'+corpus+'/'+f_or_d+'/'+f for f in os.listdir(basepath+'/'+corpus+'/'+f_or_d)]
            corpus_count = 0
            for f in genres_list:
                genre = f.split(f_or_d+'/')[1].split('_pairs.csv')[0]
                df = pd.read_csv(f)
                genre_count = len(df.index)
                genre_hapax_legomena_counts.append([corpus, str(genre), genre_count])
                
        else:
            assert os.path.isfile(basepath+'/'+corpus+'/'+f_or_d)
            file = basepath+'/'+corpus+'/'+f_or_d
            df = pd.read_csv(file)
            count = len(df.index)
            total_hapax_legomena_counts.append([corpus, count])

In [10]:
total_hapax_legomena_counts

[['corpus', 'hapax_legomena_counts'],
 ['Cornell', 3710],
 ['Movies', 13977],
 ['sfu_review', 550],
 ['COCA', 128845],
 ['NYT', 37118],
 ['SOCC', 22022],
 ['CORE', 30276]]

In [11]:
genre_hapax_legomena_counts

[['corpus', 'genre', 'hapax_legomena_counts'],
 ['sfu_review', 'HOTELS', 86],
 ['sfu_review', 'MOVIES', 93],
 ['sfu_review', 'BOOKS', 52],
 ['sfu_review', 'COOKWARE', 31],
 ['sfu_review', 'PHONES', 28],
 ['sfu_review', 'CARS', 131],
 ['sfu_review', 'COMPUTERS', 79],
 ['sfu_review', 'MUSIC', 102],
 ['COCA', 'Magazine', 54816],
 ['COCA', 'Academic', 45509],
 ['COCA', 'Fiction', 41222],
 ['COCA', 'News', 39213],
 ['COCA', 'Spoken', 23138],
 ['NYT', '1927', 211],
 ['NYT', '1928', 355],
 ['NYT', '1997', 1081],
 ['NYT', '2010', 1410],
 ['NYT', '1965', 434],
 ['NYT', '1942', 378],
 ['NYT', '1976', 337],
 ['NYT', '1989', 833],
 ['NYT', '1993', 1182],
 ['NYT', '2012', 1467],
 ['NYT', '1933', 477],
 ['NYT', '1957', 414],
 ['NYT', '1931', 329],
 ['NYT', '1987', 901],
 ['NYT', '1979', 632],
 ['NYT', '2001', 1089],
 ['NYT', '1958', 523],
 ['NYT', '1983', 745],
 ['NYT', '2013', 1461],
 ['NYT', '1975', 255],
 ['NYT', '1951', 390],
 ['NYT', '1956', 347],
 ['NYT', '2017', 1300],
 ['NYT', '1960', 401],


In [12]:
# converting all arrays to dataframes for quick and easy calculations later

df1 = pd.DataFrame(total_word_counts[1:], columns=total_word_counts[0])
df2 = pd.DataFrame(total_adv_adj_counts[1:], columns=total_adv_adj_counts[0])
df3 = pd.DataFrame(total_hapax_legomena_counts[1:], columns=total_hapax_legomena_counts[0])

In [13]:
# merge all dataframes into one big one

corpus_df = df1.merge(df2, on='corpus').merge(df3, on='corpus')
corpus_df

Unnamed: 0,corpus,num_words,adv_adj_counts,hapax_legomena_counts
0,COCA,522486161,192751,128845
1,NYT,16689298,45771,37118


In [14]:
# calculate relative frequencies

corpus_df['adv_adj_freqmil'] = corpus_df['adv_adj_counts'] / corpus_df['num_words'] * 1000000
corpus_df['hapax_legomena_freqmil'] = corpus_df['hapax_legomena_counts'] / corpus_df['num_words'] * 1000000

In [15]:
corpus_df

Unnamed: 0,corpus,num_words,adv_adj_counts,hapax_legomena_counts,adv_adj_freqmil,hapax_legomena_freqmil
0,COCA,522486161,192751,128845,368.911206,246.599833
1,NYT,16689298,45771,37118,2742.535965,2224.059993


In [16]:
# same process for genres

df1 = pd.DataFrame(genre_word_counts[1:], columns=genre_word_counts[0])
df2 = pd.DataFrame(genre_adv_adj_counts[1:], columns=genre_adv_adj_counts[0])
df3 = pd.DataFrame(genre_hapax_legomena_counts[1:], columns=genre_hapax_legomena_counts[0])

In [17]:
df1

Unnamed: 0,corpus,genre,num_words
0,COCA,Spoken,107967025
1,COCA,News,99507332
2,COCA,Magazine,108315219
3,COCA,Fiction,103422167
4,COCA,Academic,103274418
5,NYT,1915,696
6,NYT,1916,852
7,NYT,1921,1060
8,NYT,1923,331
9,NYT,1924,41661


In [18]:
df2

Unnamed: 0,corpus,genre,adv_adj_counts
0,sfu_review,HOTELS,89
1,sfu_review,MOVIES,95
2,sfu_review,BOOKS,53
3,sfu_review,COOKWARE,34
4,sfu_review,PHONES,31
5,sfu_review,CARS,139
6,sfu_review,COMPUTERS,82
7,sfu_review,MUSIC,112
8,COCA,Magazine,75984
9,COCA,Academic,63999


In [19]:
genre_df = df1.merge(df2, on=['corpus', 'genre']).merge(df3, on=['corpus', 'genre'])#.merge(df4, on=['corpus', 'genre'])
genre_df

Unnamed: 0,corpus,genre,num_words,adv_adj_counts,hapax_legomena_counts
0,COCA,Spoken,107967025,34729,23138
1,COCA,News,99507332,54082,39213
2,COCA,Magazine,108315219,75984,54816
3,COCA,Fiction,103422167,54574,41222
4,COCA,Academic,103274418,63999,45509
5,NYT,1915,696,2,2
6,NYT,1916,852,0,0
7,NYT,1921,1060,1,1
8,NYT,1923,331,0,0
9,NYT,1924,41661,94,93


In [20]:
genre_df['adv_adj_freqmil'] = genre_df['adv_adj_counts'] / genre_df['num_words'] * 1000000
genre_df['hapax_legomena_freqmil'] = genre_df['hapax_legomena_counts'] / genre_df['num_words'] * 1000000

In [21]:
genre_df

Unnamed: 0,corpus,genre,num_words,adv_adj_counts,hapax_legomena_counts,adv_adj_freqmil,hapax_legomena_freqmil
0,COCA,Spoken,107967025,34729,23138,321.663026,214.306174
1,COCA,News,99507332,54082,39213,543.497639,394.071464
2,COCA,Magazine,108315219,75984,54816,701.508068,506.078467
3,COCA,Fiction,103422167,54574,41222,527.681846,398.579929
4,COCA,Academic,103274418,63999,45509,619.698481,440.660920
5,NYT,1915,696,2,2,2873.563218,2873.563218
6,NYT,1916,852,0,0,0.000000,0.000000
7,NYT,1921,1060,1,1,943.396226,943.396226
8,NYT,1923,331,0,0,0.000000,0.000000
9,NYT,1924,41661,94,93,2256.306858,2232.303593


In [22]:
# write results to 2 CSV files

corpus_df.to_csv('corpus_stats.csv', index=None)
genre_df.to_csv('genre_stats.csv', index=None)