In [5]:
import os, re
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
from zipfile import ZipFile

In [6]:
# Root folder under which every generated CSV is written.
basepath = 'adverbly_adjectives'
os.makedirs(name=basepath, exist_ok=True)

In [7]:
def _adverblyPairTable(frame):
    """Tabulate '-ly' adverb + adjective bigrams found in ``frame``.

    ``frame`` must have ``text_num``, ``token`` and ``pos`` columns; any extra
    column (e.g. ``genre``) only takes part in the missing-value filter.
    A bigram is counted when both tokens are purely alphabetical, the first
    token ends in 'ly', is not 'only' and is tagged 'ADV', and the token that
    follows it in the same text is tagged 'ADJ'.

    Returns a DataFrame with columns ``pair``, ``counts`` and ``freqmil``
    (count per million rows of ``frame``), most frequent pair first.
    """
    num_tokens = len(frame.index)

    # Stable sort: texts end up in the order groupby('text_num') would visit
    # them, and the token order inside each text is left untouched.
    ordered = frame.sort_values('text_num', kind='mergesort')

    # Successor of every token inside its own text; the last token of a text
    # has no successor and receives NaN.
    following = ordered.groupby('text_num')[['pos', 'token']].shift(-1)
    ordered = ordered.assign(next_pos=following['pos'].values,
                             next_token=following['token'].values)

    # Throw out the last word of each text (and any row with a missing value)
    ordered = ordered.dropna().reset_index(drop=True)

    if ordered.empty:
        # Nothing to pair up; also avoids .str on a non-string empty column.
        return pd.DataFrame(columns=['pair', 'counts', 'freqmil'])

    first = ordered['token']
    second = ordered['next_token']

    wanted = (
        # Keep only fully alphabetical words
        first.str.isalpha() & second.str.isalpha()
        # Keep only the pairs where the first token ends in ly
        & first.str.endswith('ly')
        # Throw out 'only'
        & (first != 'only')
        # Adverbly adjectives!
        & (ordered['pos'] == 'ADV') & (ordered['next_pos'] == 'ADJ')
    )

    pairs = first[wanted].str.cat(second[wanted], sep=' ')

    # Build the table from the index/values explicitly: the column names that
    # value_counts().reset_index() produces changed in pandas 2.0, so indexing
    # its result by name is version dependent.
    counts = pairs.value_counts()
    table = pd.DataFrame({'pair': counts.index, 'counts': counts.values},
                         columns=['pair', 'counts'])
    table['freqmil'] = (table['counts'] / num_tokens) * 1000000
    return table


def _writeGroupedPairs(frame, column, outpath):
    """Write one ``<label>_pairs.csv`` into ``outpath`` per value of ``frame[column]``."""
    os.makedirs(outpath, exist_ok=True)

    for label, subset in frame.groupby(column):
        print(label, len(subset.index))

        # Spaces become underscores and slashes are dropped so the label is a
        # usable file name.
        filename = re.sub('/', '', re.sub(' ', '_', str(label))) + '_pairs.csv'
        _adverblyPairTable(subset).to_csv(os.path.join(outpath, filename), index=False)


def extractAdverblyAdjectives(corpusname, textnums, tokens, pos, genres=pd.DataFrame(), decades=pd.DataFrame(), bygenre=True, bydecade=False, outdir=None):
    """Count '-ly' adverb + adjective bigrams in a corpus and write them as CSV.

    Files written (columns ``pair``, ``counts``, ``freqmil``):
      * ``<root>/<corpusname>/all_pairs.csv``                  - always
      * ``<root>/<corpusname>/genres/<genre>_pairs.csv``       - if ``bygenre``
      * ``<root>/<corpusname>/decades/<decade>_pairs.csv``     - if ``bydecade``

    Parameters
    ----------
    corpusname : str
        Name of the output sub-directory; also printed as progress output.
    textnums, tokens, pos : pandas.Series
        Equal-length, row-aligned series that must be named ``text_num``,
        ``token`` and ``pos`` (the column names are used for grouping).
        Bigrams never cross a ``text_num`` boundary.
    genres, decades : pandas.Series, optional
        Row-aligned series named ``genre`` / ``decade``; required when the
        matching flag is set.
    bygenre, bydecade : bool
        Whether to additionally write per-genre / per-decade tables.
    outdir : str, optional
        Root output directory. Defaults to the notebook-level ``basepath``.

    Raises
    ------
    AssertionError
        If the series differ in shape.
    """
    assert tokens.shape == textnums.shape, "Tokens must be the same length as the text numbers"
    assert textnums.shape == pos.shape, "Parts of speech must be the same length as the text numbers"
    if bygenre:
        assert genres.shape == tokens.shape, "Genres must be the same length as the number of texts"
    if bydecade:
        assert decades.shape == tokens.shape, "Decades must be the same length as the number of texts"

    print(corpusname)

    root = basepath if outdir is None else outdir
    corpuspath = os.path.join(root, corpusname)
    os.makedirs(corpuspath, exist_ok=True)

    base_columns = [textnums, tokens, pos]

    if bygenre:
        df = pd.concat([genres] + base_columns, axis=1)
        print(df.head())
        _writeGroupedPairs(df, 'genre', os.path.join(corpuspath, 'genres'))
    else:
        df = pd.concat(base_columns, axis=1)

    # Whole-corpus table. When bygenre is set the genre column is part of the
    # frame, so rows with a missing genre are excluded by the NaN filter.
    print(corpusname, len(df.index))
    _adverblyPairTable(df).to_csv(os.path.join(corpuspath, 'all_pairs.csv'), index=False)

    if bydecade:
        df = pd.concat([decades] + base_columns, axis=1)
        _writeGroupedPairs(df, 'decade', os.path.join(corpuspath, 'decades'))

In [9]:
path = 'pos/'
suffix = '_pos.csv'

# Every POS-tagged corpus file: pos/<corpusname>_pos.csv
files = [ path+f for f in os.listdir(path) if f.endswith(suffix)]

for f in files:
    # Corpus name = file name without the directory and the '_pos.csv' suffix.
    corpusname = os.path.basename(f)[:-len(suffix)]
    df = pd.read_csv(f)
    # NOTE(review): read_csv's default NA parsing turns tokens such as 'null'
    # or 'NA' into NaN, and rows with NaN are dropped downstream - confirm
    # that this is acceptable for these corpora.

    df['token'] = df['token'].str.lower()

    print(df.head())

    has_genre = 'genre' in df.columns
    has_decade = 'decade' in df.columns

    # Only hand over the optional columns that exist in this corpus.
    optional = {}
    if has_genre:
        optional['genres'] = df['genre']
    if has_decade:
        optional['decades'] = df['decade']

    # A single call covers every combination. Calling once for genres and a
    # second time for genres + decades recomputed (and rewrote) the genre and
    # whole-corpus tables for corpora that have both columns.
    extractAdverblyAdjectives(corpusname, df['text_num'], df['token'], df['pos'],
                              bygenre=has_genre, bydecade=has_decade, **optional)

   decade  text_num       token    pos
0    1948         0       greed  PROPN
1    1948         0           ,  PUNCT
2    1948         0           a    DET
3    1948         0  despicable    ADJ
4    1948         0     passion   NOUN
NYT
NYT 20575297
1915 795
1916 983
1921 1162
1923 391
1924 48614
1925 202222
1926 119628
1927 102127
1928 167784
1929 193270
1930 173256
1931 215076
1932 250815
1933 239777
1934 213126
1935 194373
1936 224176
1937 69146
1938 64737
1939 169201
1940 185330
1941 149091
1942 134011
1943 111442
1944 121652
1945 76093
1946 90088
1947 134190
1948 140651
1949 174168
1950 129915
1951 123651
1952 171599
1953 162222
1954 141405
1955 121238
1956 101544
1957 114714
1958 139102
1959 136992
1960 115790
1961 115837
1962 105139
1963 100476
1964 135969
1965 120305
1966 98243
1967 104932
1968 145024
1969 163157
1970 107072
1971 154019
1972 123073
1973 117898
1974 96426
1975 82532
1976 101426
1977 109918
1978 123407
1979 192496
1980 10573
1981 187622
1982 215695
1983 211342
1

## Special handling for COCA

In [None]:
# Short genre codes (used as keys throughout the COCA cells) -> display names
# that end up in the output file names.
genre_dict = dict(
    spok='Spoken',
    mag='Magazine',
    acad='Academic',
    news='News',
    fic='Fiction',
)

In [None]:
pos_path = 'COCA/POS/'

# Collect the path of every .zip archive directly inside pos_path.
zipfiles = []
for entry in os.listdir(pos_path):
    if entry.endswith('.zip'):
        zipfiles.append(pos_path + entry)

print(zipfiles)

In [None]:
def deleteDir(dir):
    """Best-effort removal of the directory ``dir`` together with its contents.

    Never raises: a missing directory (or any other filesystem error) is
    ignored, so the function can be called unconditionally before
    re-creating a scratch directory. ``dir`` may be given with or without a
    trailing slash. Returns None.

    (The parameter name shadows the ``dir`` builtin; it is kept so that
    keyword callers keep working.)
    """
    # Local import: shutil is not among the notebook's top-level imports.
    import shutil

    # rmtree handles nested sub-directories and does not depend on string
    # concatenation of dir + entry name, which pointed at sibling paths when
    # ``dir`` had no trailing slash. ignore_errors keeps the never-raise
    # contract without a bare ``except``.
    shutil.rmtree(dir, ignore_errors=True)

# Number of counted lines per genre; read by the COCA post-processing cell
# as the per-genre token total.
genre_count = defaultdict(int)

tmp_dir = '__tmp/'

# Remove leftovers of an interrupted earlier run: os.makedirs below raises
# if the scratch directory already exists.
deleteDir(tmp_dir)

for zip_path in zipfiles:
    # The archive's file stem ('spok', 'mag', ...) selects the genre name.
    genre = genre_dict[zip_path.split('/')[-1].split('.zip')[0]]

    os.makedirs(tmp_dir)
    try:
        with ZipFile(zip_path, 'r') as zip_f:
            zip_f.extractall(tmp_dir)

        lines = 0
        # A separate loop variable: reusing the outer one overwrote the
        # archive path while the extracted files were being read.
        for member in os.listdir(tmp_dir):
            with open(tmp_dir+member, 'r', encoding='latin-1') as fo:
                # Lines starting with '##' are not counted.
                lines += sum(1 for l in fo if not l.startswith('##'))
    finally:
        # Always clear the scratch directory, even when extraction or
        # reading fails, so the next iteration / re-run can recreate it.
        deleteDir(tmp_dir)

    genre_count[genre] = lines

    print(genre, genre_count[genre])

In [None]:
advadj_path = 'COCA/all_years/'
corpusname = 'COCA'

advadj_files = [advadj_path+f for f in os.listdir(advadj_path) if f.endswith('.csv')]

# The output folders do not depend on the file being processed: create once.
corpuspath = basepath + '/' + corpusname
os.makedirs(corpuspath, exist_ok=True)

genrespath = corpuspath + '/' + 'genres'
os.makedirs(genrespath, exist_ok=True)

for f in advadj_files:
    # File names with exactly three '_'-separated parts start with a genre
    # code; any other name is treated as the all-genres table.
    split = f.split('/')[-1].split('.csv')[0].split('_')
    allGenres = len(split) != 3
    if not allGenres:
        genre = genre_dict[split[0]]

    df = pd.read_csv(f).rename(columns={'freq' : 'counts'})

    # Normalise by the line totals gathered in genre_count.
    if allGenres:
        num_tokens = sum(genre_count.values())
    else:
        num_tokens = genre_count[genre]

    df['freqmil'] = (df['counts'] / num_tokens) * 1000000

    df = df.sort_values('counts', ascending=False)

    # Keep only fully alphabetical words.
    # str.match anchors at the start only and '*' also accepts the empty
    # string, so the pattern '[a-zA-Z ]*' matched every row. Require at
    # least one character and letters/spaces through the end of the pair;
    # missing pairs count as non-matching.
    df = df[df['pair'].str.match(r'[a-zA-Z ]+$', na=False)]

    # Throw out 'only'
    df = df[~ df['pair'].str.startswith('only ')]

    if allGenres:
        df.to_csv(corpuspath + '/' + 'all_pairs.csv', header=['pair', 'counts', 'freqmil'], index=None)
    else:
        print(genre)
        df.to_csv(genrespath + '/' + genre + '_pairs.csv', header=['pair', 'counts', 'freqmil'], index=None)