This notebook calculates summary statistics on all corpora in the `adverbly_adjectives` directory.

It assumes POS information is in the `pos` directory, and that hapax legomena lists are in the `hapax_legomena` directory.

COCA is special since it's so massive. COCA POS files are handled specially and expected to be in a directory named `COCA/POS`.

Summary statistics calculated include the number of words, as well as the number and relative frequencies of adverbly adjective pairs and hapax legomena.

In [1]:
import os, re
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
from zipfile import ZipFile

In [2]:
# required to translate the shortened versions of genre names to something that looks better in a summary sheet
genre_dict = {
    'spok' : 'Spoken',
    'mag' : 'Magazine',
    'acad' : 'Academic',
    'news' : 'News',
    'fic' : 'Fiction'
}

pos_path = 'COCA/POS/'
zipfiles = [pos_path+f for f in os.listdir(pos_path) if f.endswith('.zip')]

def deleteDir(dir):
    try:
        for f in os.listdir(dir):
            os.remove(dir+f)
        os.rmdir(dir)
    except:
        pass

coca_genre_count = defaultdict(int)

tmp_dir = '__tmp/'

deleteDir(tmp_dir)

# extract each zip file to a temporary directory and count the number of lines (representing num of words for that genre)
for f in zipfiles:
    os.makedirs(tmp_dir)
    
    genre = genre_dict[f.split('/')[-1].split('.zip')[0]]
    with ZipFile(f, 'r') as zip_f:
        zip_f.extractall(tmp_dir)
    
    lines = 0
    
    #  iterate over all files/texts in current genre
    for f in os.listdir(tmp_dir):
        with open(tmp_dir+f, 'r', encoding='latin-1') as fo:
            
            # count lines in current file
            for l in fo:
                if not l.startswith('##'):
                    lines += 1
    
    coca_genre_count[genre] = lines
    
    print(genre, coca_genre_count[genre])
    deleteDir(tmp_dir)

Academic 120484732
News 125553380
Fiction 125360785
Magazine 127689724
Spoken 131059025


In [3]:
total_word_counts = [['corpus', 'num_words']]
genre_word_counts = [['corpus', 'genre', 'num_words']]

path = 'pos/'
suffix = '_pos.csv'

# appends previously calculated COCA counts to the new array
coca_all = 0
for g in coca_genre_count:
    genre_word_counts.append(['COCA', g, coca_genre_count[g]])
    coca_all += coca_genre_count[g]
total_word_counts.append(['COCA', coca_all])

# iterates over all POS files to get word counts
files = [ path+f for f in os.listdir(path) if f.endswith(suffix)]

for f in files:
    corpusname = f.split(path)[1].split(suffix)[0]
    print(corpusname)
    df = pd.read_csv(f)
    
    # throw out any null values (shouldn't be any)
    df.dropna(inplace=True)
    
    # throw out anything that doesn't have at least one letter in it
    df = df[~ df['token'].str.match('[^a-zA-Z]+$')]
    
    # genre counts where possible
    if 'genre' in df.columns:
        grouped = df.groupby('genre')
        for genre,group in grouped:
            genre_word_counts.append([corpusname, genre, len(group.index)])
    
    # number of words is just the length of this pruned dataframe, because every row has only one word
    total_word_counts.append([corpusname, len(df.index)])

sfu_review
Movies
CORE
SOCC
Cornell


In [4]:
total_word_counts

[['corpus', 'num_words'],
 ['COCA', 630147646],
 ['sfu_review', 255180],
 ['Movies', 3957840],
 ['CORE', 51452856],
 ['SOCC', 38046009],
 ['Cornell', 1281957]]

In [5]:
genre_word_counts

[['corpus', 'genre', 'num_words'],
 ['COCA', 'Academic', 120484732],
 ['COCA', 'News', 125553380],
 ['COCA', 'Spoken', 131059025],
 ['COCA', 'Magazine', 127689724],
 ['COCA', 'Fiction', 125360785],
 ['sfu_review', 'BOOKS', 25376],
 ['sfu_review', 'CARS', 47261],
 ['sfu_review', 'COMPUTERS', 40949],
 ['sfu_review', 'COOKWARE', 21119],
 ['sfu_review', 'HOTELS', 32257],
 ['sfu_review', 'MOVIES', 32207],
 ['sfu_review', 'MUSIC', 41813],
 ['sfu_review', 'PHONES', 14198],
 ['CORE', 'How-to / Instructional', 1656046],
 ['CORE', 'Information Description', 10949997],
 ['CORE', 'Information Persuasion', 1214201],
 ['CORE', 'Interactive Discussion', 2958960],
 ['CORE', 'Lyrical', 301283],
 ['CORE', 'Narrative', 17918383],
 ['CORE', 'Opinion', 15536270],
 ['CORE', 'Spoken', 917716]]

In [6]:
total_adv_adj_counts = [['corpus', 'adv_adj_counts']]
genre_adv_adj_counts = [['corpus', 'genre', 'adv_adj_counts']]

# number of adverbly adjectives is just the length of the file containing the list for each corpus / genre, as applicable

basepath = 'adverbly_adjectives'
files = []
for corpus in os.listdir(basepath):
    for f_or_d in os.listdir(basepath+'/'+corpus):
        
        # handle genres
        if os.path.isdir(basepath+'/'+corpus+'/'+f_or_d):
            genres_list = [basepath+'/'+corpus+'/'+f_or_d+'/'+f for f in os.listdir(basepath+'/'+corpus+'/'+f_or_d)]
            corpus_count = 0
            for f in genres_list:
                genre = f.split(f_or_d+'/')[1].split('_pairs.csv')[0]
                df = pd.read_csv(f)
                genre_count = len(df.index)
                genre_adv_adj_counts.append([corpus, genre, genre_count])
                
        # handle master lists for each corpus
        else:
            assert os.path.isfile(basepath+'/'+corpus+'/'+f_or_d)
            file = basepath+'/'+corpus+'/'+f_or_d
            df = pd.read_csv(file)
            count = len(df.index)
            total_adv_adj_counts.append([corpus, count])

In [7]:
total_adv_adj_counts

[['corpus', 'adv_adj_counts'],
 ['Cornell', 4287],
 ['Movies', 15761],
 ['sfu_review', 596],
 ['COCA', 192751],
 ['SOCC', 31912],
 ['CORE', 42544]]

In [8]:
genre_adv_adj_counts

[['corpus', 'genre', 'adv_adj_counts'],
 ['sfu_review', 'HOTELS', 89],
 ['sfu_review', 'MOVIES', 95],
 ['sfu_review', 'BOOKS', 53],
 ['sfu_review', 'COOKWARE', 34],
 ['sfu_review', 'PHONES', 31],
 ['sfu_review', 'CARS', 139],
 ['sfu_review', 'COMPUTERS', 82],
 ['sfu_review', 'MUSIC', 112],
 ['COCA', 'Magazine', 75984],
 ['COCA', 'Academic', 63999],
 ['COCA', 'Fiction', 54574],
 ['COCA', 'News', 54082],
 ['COCA', 'Spoken', 34729],
 ['CORE', 'Interactive_Discussion', 3466],
 ['CORE', 'Information_Persuasion', 1901],
 ['CORE', 'Narrative', 17223],
 ['CORE', 'Opinion', 20581],
 ['CORE', 'Lyrical', 168],
 ['CORE', 'Information_Description', 11401],
 ['CORE', 'How-to__Instructional', 1868],
 ['CORE', 'Spoken', 1457]]

In [9]:
total_hapax_legomena_counts = [['corpus', 'hapax_legomena_counts']]
genre_hapax_legomena_counts = [['corpus', 'genre', 'hapax_legomena_counts']]

# number of adverbly adjectives is just the length of the file containing the list for each corpus / genre, as applicable

basepath = 'hapax_legomena'
files = []
for corpus in os.listdir(basepath):
    for f_or_d in os.listdir(basepath+'/'+corpus):
        if os.path.isdir(basepath+'/'+corpus+'/'+f_or_d):
            genres_list = [basepath+'/'+corpus+'/'+f_or_d+'/'+f for f in os.listdir(basepath+'/'+corpus+'/'+f_or_d)]
            corpus_count = 0
            for f in genres_list:
                genre = f.split(f_or_d+'/')[1].split('_pairs.csv')[0]
                df = pd.read_csv(f)
                genre_count = len(df.index)
                genre_hapax_legomena_counts.append([corpus, genre, genre_count])
                
        else:
            assert os.path.isfile(basepath+'/'+corpus+'/'+f_or_d)
            file = basepath+'/'+corpus+'/'+f_or_d
            df = pd.read_csv(file)
            count = len(df.index)
            total_hapax_legomena_counts.append([corpus, count])

In [10]:
total_hapax_legomena_counts

[['corpus', 'hapax_legomena_counts'],
 ['Cornell', 3710],
 ['Movies', 13977],
 ['sfu_review', 550],
 ['COCA', 128845],
 ['SOCC', 22022],
 ['CORE', 30276]]

In [11]:
genre_hapax_legomena_counts

[['corpus', 'genre', 'hapax_legomena_counts'],
 ['sfu_review', 'HOTELS', 86],
 ['sfu_review', 'MOVIES', 93],
 ['sfu_review', 'BOOKS', 52],
 ['sfu_review', 'COOKWARE', 31],
 ['sfu_review', 'PHONES', 28],
 ['sfu_review', 'CARS', 131],
 ['sfu_review', 'COMPUTERS', 79],
 ['sfu_review', 'MUSIC', 102],
 ['COCA', 'Magazine', 54816],
 ['COCA', 'Academic', 45509],
 ['COCA', 'Fiction', 41222],
 ['COCA', 'News', 39213],
 ['COCA', 'Spoken', 23138],
 ['CORE', 'Interactive_Discussion', 2535],
 ['CORE', 'Information_Persuasion', 1522],
 ['CORE', 'Narrative', 13011],
 ['CORE', 'Opinion', 15686],
 ['CORE', 'Lyrical', 152],
 ['CORE', 'Information_Description', 8812],
 ['CORE', 'How-to__Instructional', 1570],
 ['CORE', 'Spoken', 1236]]

In [12]:
# converting all arrays to dataframes for quick and easy calculations later

df1 = pd.DataFrame(total_word_counts[1:], columns=total_word_counts[0])
df2 = pd.DataFrame(total_adv_adj_counts[1:], columns=total_adv_adj_counts[0])
df3 = pd.DataFrame(total_hapax_legomena_counts[1:], columns=total_hapax_legomena_counts[0])

In [13]:
# merge all dataframes into one big one

corpus_df = df1.merge(df2, on='corpus').merge(df3, on='corpus')
corpus_df

Unnamed: 0,corpus,num_words,adv_adj_counts,hapax_legomena_counts
0,COCA,630147646,192751,128845
1,sfu_review,255180,596,550
2,Movies,3957840,15761,13977
3,CORE,51452856,42544,30276
4,SOCC,38046009,31912,22022
5,Cornell,1281957,4287,3710


In [14]:
# calculate relative frequencies

corpus_df['adv_adj_freqmil'] = corpus_df['adv_adj_counts'] / corpus_df['num_words'] * 1000000
corpus_df['hapax_legomena_freqmil'] = corpus_df['hapax_legomena_counts'] / corpus_df['num_words'] * 1000000

In [15]:
corpus_df

Unnamed: 0,corpus,num_words,adv_adj_counts,hapax_legomena_counts,adv_adj_freqmil,hapax_legomena_freqmil
0,COCA,630147646,192751,128845,305.882282,204.467954
1,sfu_review,255180,596,550,2335.606239,2155.341328
2,Movies,3957840,15761,13977,3982.222626,3531.471712
3,CORE,51452856,42544,30276,826.854004,588.422147
4,SOCC,38046009,31912,22022,838.773917,578.825495
5,Cornell,1281957,4287,3710,3344.105926,2894.012826


In [16]:
# same process for genres

df1 = pd.DataFrame(genre_word_counts[1:], columns=genre_word_counts[0])
df2 = pd.DataFrame(genre_adv_adj_counts[1:], columns=genre_adv_adj_counts[0])
df3 = pd.DataFrame(genre_hapax_legomena_counts[1:], columns=genre_hapax_legomena_counts[0])

In [17]:
genre_df = df1.merge(df2, on=['corpus', 'genre']).merge(df3, on=['corpus', 'genre'])#.merge(df4, on=['corpus', 'genre'])
genre_df

Unnamed: 0,corpus,genre,num_words,adv_adj_counts,hapax_legomena_counts
0,COCA,Academic,120484732,63999,45509
1,COCA,News,125553380,54082,39213
2,COCA,Spoken,131059025,34729,23138
3,COCA,Magazine,127689724,75984,54816
4,COCA,Fiction,125360785,54574,41222
5,sfu_review,BOOKS,25376,53,52
6,sfu_review,CARS,47261,139,131
7,sfu_review,COMPUTERS,40949,82,79
8,sfu_review,COOKWARE,21119,34,31
9,sfu_review,HOTELS,32257,89,86


In [18]:
genre_df['adv_adj_freqmil'] = genre_df['adv_adj_counts'] / genre_df['num_words'] * 1000000
genre_df['hapax_legomena_freqmil'] = genre_df['hapax_legomena_counts'] / genre_df['num_words'] * 1000000

In [19]:
genre_df

Unnamed: 0,corpus,genre,num_words,adv_adj_counts,hapax_legomena_counts,adv_adj_freqmil,hapax_legomena_freqmil
0,COCA,Academic,120484732,63999,45509,531.179336,377.715908
1,COCA,News,125553380,54082,39213,430.749057,312.321341
2,COCA,Spoken,131059025,34729,23138,264.987474,176.546407
3,COCA,Magazine,127689724,75984,54816,595.067462,429.290614
4,COCA,Fiction,125360785,54574,41222,435.3355,328.826913
5,sfu_review,BOOKS,25376,53,52,2088.587642,2049.180328
6,sfu_review,CARS,47261,139,131,2941.114238,2771.841476
7,sfu_review,COMPUTERS,40949,82,79,2002.490903,1929.229041
8,sfu_review,COOKWARE,21119,34,31,1609.924712,1467.872532
9,sfu_review,HOTELS,32257,89,86,2759.09105,2666.087981


In [20]:
# write results to 2 CSV files

corpus_df.to_csv('corpus_stats.csv', index=None)
genre_df.to_csv('genre_stats.csv', index=None)