In [4]:
import csv
import io
import os
import re
import shutil
from collections import defaultdict
from zipfile import ZipFile

import numpy as np
import pandas as pd

In [5]:
# Root output directory; each corpus writes its CSV files into a sub-folder of it.
basepath = 'adverbly_adjectives'
os.makedirs(name=basepath, exist_ok=True)

In [7]:
def _countAdverblyPairs(df, num_tokens):
    """Count the "-ly adverb + adjective" pairs found in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per token, with at least the columns 'text_num', 'token' and
        'pos'. Tokens are expected to be strings (missing values are dropped).
    num_tokens : int
        Denominator used to normalise the counts to a per-million frequency.

    Returns
    -------
    pandas.DataFrame
        Columns 'pair', 'counts' and 'freqmil', most frequent pair first.
        Empty (but with the same columns) when no pair is found.
    """
    columns = ['pair', 'counts', 'freqmil']
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Look one token ahead *within each text*, so a pair never straddles two texts.
    following = df.groupby('text_num')[['token', 'pos']].shift(-1)

    # dropna() throws out the last word of every text (it has no successor) as
    # well as any row with a missing value.
    paired = df.assign(next_pos=following['pos'], next_token=following['token']).dropna()
    if paired.empty:
        return pd.DataFrame(columns=columns)

    first = paired['token']
    second = paired['next_token']
    keep = (
        # Keep only fully alphabetical words
        first.str.isalpha() & second.str.isalpha()
        # Keep only the pairs where the first token ends in ly ...
        & first.str.endswith('ly')
        # ... but throw out 'only'
        & (first != 'only')
        # Adverbly adjectives!
        & (paired['pos'] == 'ADV') & (paired['next_pos'] == 'ADJ')
    )
    hits = paired[keep]
    if hits.empty:
        return pd.DataFrame(columns=columns)

    counts = hits['token'].str.cat(hits['next_token'], sep=' ').value_counts()

    # Name the columns explicitly: what value_counts().reset_index() calls them
    # differs between pandas versions.
    result = pd.DataFrame({'pair': counts.index, 'counts': counts.values})
    result['freqmil'] = (result['counts'] / num_tokens) * 1000000
    return result


def extractAdverblyAdjectives(corpusname, textnums, tokens, pos, genres=pd.DataFrame(), bygenre=True):
    """Extract "-ly adverb + adjective" pairs from a POS-tagged corpus and save their counts.

    Writes `<basepath>/<corpusname>/all_pairs.csv` and, when `bygenre` is true,
    one `<basepath>/<corpusname>/genres/<genre>_pairs.csv` per genre. Every CSV
    has the columns pair, counts, freqmil (count per million tokens, where the
    token total is the number of rows of the corpus or of the genre).

    Parameters
    ----------
    corpusname : str
        Name of the output sub-folder under the module-level `basepath`.
    textnums, tokens, pos : pandas.Series
        Parallel per-token series; they must be named 'text_num', 'token' and
        'pos' because the columns are looked up by those names.
    genres : pandas.Series, optional
        Per-token genre labels, named 'genre'. Required when `bygenre` is true.
    bygenre : bool
        Whether to also write the per-genre breakdown.

    Raises
    ------
    AssertionError
        If the input series do not all have the same shape.
    """
    assert tokens.shape == textnums.shape, "Tokens must be the same length as the text numbers"
    assert textnums.shape == pos.shape, "Parts of speech must be the same length as the text numbers"
    if bygenre:
        assert genres.shape == tokens.shape, "Genres must be the same length as the number of texts"

    print(corpusname)

    corpuspath = os.path.join(basepath, corpusname)
    os.makedirs(corpuspath, exist_ok=True)

    if bygenre:
        df = pd.concat([genres, textnums, tokens, pos], axis=1)
        print(df.head())

        genrespath = os.path.join(corpuspath, 'genres')
        os.makedirs(genrespath, exist_ok=True)

        for genre, genre_group in df.groupby('genre'):
            num_tokens = len(genre_group.index)
            print(genre, num_tokens)

            # Make the genre label safe to use as a file name
            filename = re.sub('/', '', re.sub(' ', '_', str(genre)))

            genre_pairs = _countAdverblyPairs(genre_group, num_tokens)
            genre_pairs.to_csv(os.path.join(genrespath, filename + '_pairs.csv'), index=False)
    else:
        df = pd.concat([textnums, tokens, pos], axis=1)

    num_tokens = len(df.index)
    print(corpusname, num_tokens)

    all_pairs = _countAdverblyPairs(df, num_tokens)
    all_pairs.to_csv(os.path.join(corpuspath, 'all_pairs.csv'), index=False)

In [8]:
path = 'pos/'
suffix = '_pos.csv'

# Paths of all POS-tagged corpus files found directly inside `path`
files = []
for entry in os.listdir(path):
    if entry.endswith(suffix):
        files.append(path + entry)

for f in files:
    # Corpus name = file name without the directory prefix and the suffix
    corpusname = f.split(path)[1].partition(suffix)[0]
    df = pd.read_csv(f)

    # Lower-case every token; the extraction compares against lower-case 'only'
    df['token'] = df['token'].str.lower()

    print(df.head())

    # Only corpora that carry a 'genre' column get the per-genre breakdown
    has_genres = 'genre' in df.columns
    genre_args = (df['genre'],) if has_genres else ()
    extractAdverblyAdjectives(corpusname, df['text_num'], df['token'], df['pos'], *genre_args, bygenre=has_genres)

   Unnamed: 0  text_num  token    pos
0           0         1  magoo   NOUN
1           1         1      '  PUNCT
2           2         1    was   VERB
3           3         1     by    ADP
4           4         1    far    ADV
Cornell
Cornell 1496086


## Special handling for COCA

In [6]:
# Maps the abbreviation used in COCA file names to the genre name used in the output files.
genre_dict = dict(
    spok='Spoken',
    mag='Magazine',
    acad='Academic',
    news='News',
    fic='Fiction',
)

In [7]:
pos_path = 'COCA/POS/'

# Collect the path of every zip archive found directly inside pos_path
zipfiles = []
for archive in os.listdir(pos_path):
    if archive.endswith('.zip'):
        zipfiles.append(pos_path + archive)
print(zipfiles)

['COCA/POS/acad.zip', 'COCA/POS/news.zip', 'COCA/POS/fic.zip', 'COCA/POS/mag.zip', 'COCA/POS/spok.zip']


In [8]:
def deleteDir(dir):
    """Delete the directory `dir` together with everything inside it (best effort).

    Works with or without a trailing slash on `dir` and also removes nested
    sub-directories. A directory that does not exist is silently ignored; any
    other failure is reported on stdout instead of being raised, so clean-up
    never aborts the notebook.

    Parameters
    ----------
    dir : str
        Path of the directory to remove.
    """
    try:
        shutil.rmtree(dir)
    except FileNotFoundError:
        # Nothing to clean up
        pass
    except OSError as err:
        print('Could not delete {}: {}'.format(dir, err))

def _countZipLines(zip_path):
    """Return the number of lines, over all files in the archive `zip_path`, that do not start with '##'.

    The members are streamed straight from the archive (decoded as latin-1),
    so nothing is extracted to disk and no temporary directory has to be
    cleaned up afterwards.
    """
    lines = 0
    with ZipFile(zip_path, 'r') as zip_f:
        for member in zip_f.infolist():
            if member.is_dir():
                continue
            # TextIOWrapper gives the same line splitting as open(..., 'r')
            with io.TextIOWrapper(zip_f.open(member), encoding='latin-1') as fo:
                for l in fo:
                    if not l.startswith('##'):
                        lines += 1
    return lines


# Line count per genre; used below as the token total for the per-million frequencies
genre_count = defaultdict(int)

for zip_path in zipfiles:
    # The archive's base name (e.g. 'acad' for 'acad.zip') is the genre abbreviation
    genre = genre_dict[zip_path.split('/')[-1].split('.zip')[0]]

    genre_count[genre] = _countZipLines(zip_path)

    print(genre, genre_count[genre])
Academic 120484732
News 125553380
Fiction 125360785
Magazine 127689724
Spoken 131059025


In [9]:
advadj_path = 'COCA/all_years/'
corpusname = 'COCA'

advadj_files = [advadj_path+f for f in os.listdir(advadj_path) if f.endswith('.csv')]

# The output folders are the same for every input file, so create them once.
corpuspath = basepath + '/' + corpusname
os.makedirs(corpuspath, exist_ok=True)

genrespath = corpuspath + '/' + 'genres'
os.makedirs(genrespath, exist_ok=True)

for f in advadj_files:
    # A file name with exactly three '_'-separated parts starts with a genre
    # abbreviation; any other file is treated as the all-genres table.
    split = f.split('/')[-1].split('.csv')[0].split('_')
    allGenres = len(split) != 3

    if allGenres:
        num_tokens = sum(genre_count.values())
    else:
        genre = genre_dict[split[0]]
        num_tokens = genre_count[genre]

    # genre_count is a defaultdict(int): a genre that was never counted would
    # silently yield 0 here and turn every frequency into inf.
    assert num_tokens > 0, "No token count available; run the genre counting cell first"

    df = pd.read_csv(f).rename(columns={'freq' : 'counts'})

    df['freqmil'] = (df['counts'] / num_tokens) * 1000000

    df = df.sort_values('counts', ascending=False)

    # Keep only fully alphabetical words: the pattern must cover the whole pair
    # (an unanchored '[a-zA-Z ]*' matches every string). Missing pairs are dropped.
    df = df[df['pair'].str.match(r'[a-zA-Z ]+\Z', na=False)]

    # Throw out 'only'
    df = df[~ df['pair'].str.startswith('only ')]

    # Select the output columns by name so the header always matches the data.
    out_columns = ['pair', 'counts', 'freqmil']

    if allGenres:
        df[out_columns].to_csv(corpuspath + '/' + 'all_pairs.csv', index=False)
    else:
        print(genre)
        df[out_columns].to_csv(genrespath + '/' + genre + '_pairs.csv', index=False)

Fiction
News
Magazine
Academic
Spoken
