In [20]:
import os, codecs, scipy.stats, csv

In [21]:
# All inputs and outputs live under one project folder; edit PROJECT_DIR to relocate them together.
PROJECT_DIR = "/Users/eunjilee/Desktop/cesta/short_stories_project"

# Metadata spreadsheet (comma separated) handed to get_mdpos as the metadata table.
metadata = PROJECT_DIR + "/metadata_files/metadata_shortstories_pos.csv"
# Directory holding the POS-tagged files that are counted.
sdir = PROJECT_DIR + "/pos_tagged"
# Destination of the tab-separated results table.
ofn = PROJECT_DIR + "/outputs/pos_tagged_mdpos.tsv"
# Directory scanned by poslist_to_wordlist; currently the same folder as sdir.
pos_tag_file = PROJECT_DIR + "/pos_tagged"

In [22]:
# Turn a spreadsheet into a list of lists

def sheet2lol(fn,separator=","):
    """Read a delimited text file into a list of rows (a "list of lists").

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped. Each line is split with a plain ``str.split``, so quoted
    fields that contain the separator are NOT kept together.

    Args:
        fn: path of the file to read.
        separator: string that separates the cells of a line.

    Returns:
        One list of cell strings per line, in file order. An empty file
        gives an empty list.
    """
    # The with-block closes the handle even if reading or decoding fails.
    with codecs.open(fn, encoding="utf-8-sig") as handle:
        text = handle.read()
    return [line.split(separator) for line in text.splitlines()]

In [23]:
# Turn a directory into a list of the files in it that have a given extension
# Only names ending with `extension` (default ".csv") are kept

def dir2files(somedirectory,path=False, extension = '.csv'):
    """List the entries of a directory whose names end with ``extension``.

    Args:
        somedirectory: directory to list.
        path: when False, bare entry names are returned; otherwise each
            name is joined onto ``somedirectory``.
        extension: suffix an entry must end with to be kept.

    Returns:
        The matching entries, in the order ``os.listdir`` produced them.
    """
    entries = os.listdir(somedirectory)
    if path != False:
        entries = [os.path.join(somedirectory, name) for name in entries]
    # The suffix test runs on the final strings, i.e. after any path joining.
    return [entry for entry in entries if entry.endswith(extension)]

In [24]:
# Clean a POS tag: tags ending in "$" (e.g. "PRP$") are shortened
# The `lower` argument is accepted but currently has no effect

def cleanpos(pos, lower=True):
    """Drop the trailing "$" marker from a POS tag.

    For example "PRP$" becomes "PRP" and "WP$" becomes "WP". Tags that do
    not end in "$" are returned unchanged.

    Args:
        pos: the tag string to clean.
        lower: accepted for interface compatibility; it has no effect.

    Returns:
        The tag without its trailing "$".
    """
    # Remove exactly one character. The previous slice ([:-2]) also ate the
    # last letter of the tag, producing "PR" from "PRP$".
    # A tag that is just "$" is kept as is rather than collapsed to "".
    if len(pos) > 1 and pos.endswith("$"):
        pos = pos[:-1]
    return pos

In [25]:
# Turn a tagged file into a list of the POS tags found in one of its columns

def file2cleanpos(filename, separator = "\t", specific_pos = True):
    """Return one column of a delimited file as a list.

    Args:
        filename: path of the file, read through ``sheet2lol``.
        separator: cell separator passed on to ``sheet2lol``.
        specific_pos: when True the cell at index 2 of every row is taken,
            otherwise the cell at index 1.
            (presumably the fine-grained vs. coarse tag column - confirm
            against the tagger output)

    Returns:
        The selected cell of every row, in file order.

    Raises:
        IndexError: if a row has too few cells for the chosen column.
    """
    column = 2 if specific_pos == True else 1
    return [cells[column] for cells in sheet2lol(filename, separator)]

In [26]:
# Get the unique values in a spreadsheet column
# (Actually a list of lists now, but same idea)

def unique_col_values(somedf,col_num,header=True):
    """Collect the distinct values of one column, in first-seen order.

    Args:
        somedf: table as a list of rows (each row a list).
        col_num: index of the column to read.
        header: when True the first row is treated as a header and skipped.

    Returns:
        A list of the distinct values found in that column.
    """
    data_rows = somedf[1:] if header == True else somedf
    distinct = []
    for cell in (record[col_num] for record in data_rows):
        if cell in distinct:
            continue
        distinct.append(cell)
    return distinct

In [27]:
# Turn your corpora into wordcount dictionaries
# This will be a dictionary of dictionaries
# For this to work as is, your metadata table must have filenames but not path names

def makecorpusdicts(somedir,metatable,meta_col_num,fn_col_num):
    """Count tags per sub-corpus, grouping files by a metadata column.

    Every row of ``metatable`` after the header names one file (column
    ``fn_col_num``, relative to ``somedir``) and the sub-corpus it belongs
    to (column ``meta_col_num``). The file's tags are read with
    ``file2cleanpos`` and tallied under that sub-corpus.

    Args:
        somedir: directory that contains the files named in the table.
        metatable: list of rows; the first row is treated as a header.
        meta_col_num: index of the column holding the sub-corpus label.
        fn_col_num: index of the column holding the file name.

    Returns:
        ``{label: {tag: count}}`` with labels in first-seen order. Labels
        are stored as ``str``.
    """
    corp_dict = {}
    for row in metatable[1:]:
        # Use the same str() form for creating and for looking up a label,
        # so a non-string label can no longer miss its own entry.
        label = str(row[meta_col_num])
        tag_counts = corp_dict.setdefault(label, {})
        for tag in file2cleanpos(os.path.join(somedir, row[fn_col_num])):
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return corp_dict

In [28]:
# Get total POS counts for your corpora from the "makecorpusdicts" output
# This would work for any such dictionary of dictionaries

def get_pos_dict(corp_dict):
    """Total the counts of every sub-corpus.

    Args:
        corp_dict: ``{label: {tag: count}}``.

    Returns:
        ``{label: sum of that label's counts}``.
    """
    # dict() is kept so anything the dict constructor accepts still works here.
    return {label: sum(dict(counts).values()) for label, counts in corp_dict.items()}

In [29]:
# Turn a directory of .tsv files into an overall dictionary of POS tag counts

def dir2counts(somedir):
    """Tally the tags of every ``.tsv`` file in a directory.

    Args:
        somedir: directory listed with ``dir2files``; each matching file is
            read with ``file2cleanpos``.

    Returns:
        ``{tag: number of occurrences across all files}``.
    """
    totals = {}
    for tagged_path in dir2files(somedir, path=True, extension = '.tsv'):
        for tag in file2cleanpos(tagged_path):
            totals[tag] = totals.get(tag, 0) + 1
    return totals

In [30]:
# For MDPOS, we want POS rates
# I.e. how often a POS appears given the word count
# This converts a counts dictionary to a rates dictionary

def counts2rates(somecountdict):
    """Convert a counts dictionary into relative frequencies.

    Args:
        somecountdict: ``{key: count}``.

    Returns:
        A new dict mapping each key to ``count / total of all counts``.

    Raises:
        ZeroDivisionError: if there are keys but the counts sum to zero.
    """
    grand_total = sum(somecountdict.values())
    return {key: float(count) / grand_total for key, count in somecountdict.items()}

In [31]:
# Make your dictionaries a term document matrix
# This gives you a good way to A) produce writable output
# And B), go through and get MDW later on
# min_obs refers to total observations across all corpora

def dicts2tdm(dictofcountdicts, min_obs=0, no_numbers=False):
    """Build a term-document matrix from per-corpus count dictionaries.

    Args:
        dictofcountdicts: ``{corpus: {token: count}}``. It is not modified.
        min_obs: rows whose counts summed over all corpora are below this
            value are left out.
        no_numbers: when True, rows whose token satisfies ``is_number`` are
            left out.

    Returns:
        A list of rows. The first row is ``['token_', corpus1, corpus2, ...]``;
        each later row is ``[token, count_in_corpus1, count_in_corpus2, ...]``
        with 0 where a corpus never saw the token. Tokens appear in
        first-seen order.
    """
    # Corpora with no observations are left out. They are filtered into a
    # new dict because deleting keys while looping over the same dict
    # raises RuntimeError, and it would also change the caller's data.
    kept = {name: counts for name, counts in dictofcountdicts.items()
            if sum(counts.values()) != 0}
    corpora = list(kept.keys())
    tdm = [['token_'] + corpora]

    # A dict is used as an ordered set: first-seen order, constant-time lookup.
    all_pos = {}
    for counts in kept.values():
        for token in counts:
            all_pos.setdefault(token, None)

    for token in all_pos:
        # NOTE(review): is_number is not defined in the visible part of this
        # notebook; no_numbers=True needs it to exist in the namespace.
        if no_numbers == True and is_number(token):
            continue
        row = [token] + [kept[name].get(token, 0) for name in corpora]
        if sum(row[1:]) < min_obs:
            continue
        tdm.append(row)
    return tdm

In [32]:
# 'Melts' the tdm
# Which means columns are types of data instead of particular instances of that type
# E.g. instead of columns for "Western" and "Sci-Fi" you'd have "Genre" and list the two types under it for each word

def tdm_melter(sometdm):
    """Reshape a wide term-document matrix into long form.

    Args:
        sometdm: list of rows whose first row is the header
            ``[token column name, corpus1, corpus2, ...]``.

    Returns:
        A list starting with ``['token_', 'Corpus', 'Observations']``
        followed by one ``[token, corpus, count]`` row for every count cell
        of the input, row by row and left to right.
    """
    column_names = sometdm[0]
    long_rows = [['token_','Corpus','Observations']]
    for wide_row in sometdm[1:]:
        long_rows.extend(
            [wide_row[0], column_names[position], wide_row[position]]
            for position in range(1, len(wide_row))
        )
    return long_rows

In [33]:
# Do a Fisher's exact test, mdpos style

def get_fishers(melted_tdm_row, wcd, word_rates, obs_exp=False, alternative="greater"):
    """Run Fisher's exact test for one ``[token, corpus, count]`` row.

    The 2x2 table compares the observed count in the corpus against the
    count expected from the overall rate:

        [[observed, corpus_total - observed],
         [expected, corpus_total - expected]]

    where ``expected = round(rate * corpus_total)``.

    Args:
        melted_tdm_row: row whose cells 0, 1 and 2 are the token, the
            corpus name and the observed count.
        wcd: ``{corpus: total count}``.
        word_rates: ``{token: overall rate}``.
        obs_exp: when True the observed/expected ratio is returned as well.
        alternative: passed through to ``scipy.stats.fisher_exact``.

    Returns:
        The p-value, or ``(p_value, ratio)`` when ``obs_exp`` is True. The
        ratio is the string "Inf" when the expected count is 0.
    """
    corpus_total = wcd[melted_tdm_row[1]]
    overall_rate = word_rates[melted_tdm_row[0]]
    observed = melted_tdm_row[2]
    expected = round(overall_rate * corpus_total)
    table = [[observed, corpus_total - observed],
             [expected, corpus_total - expected]]
    p_value = scipy.stats.fisher_exact(table, alternative=alternative)[1]
    if obs_exp != True:
        return p_value
    ratio = observed / expected if expected != 0 else "Inf"
    return (p_value, ratio)


In [34]:
# Get the actual mdw data and append to the tdm

def add_mdw(melted_tdm,wcd,word_rates,obs_exp=True,alpha=.05,alternative="greater"):
    """Keep the significant rows of a melted tdm and append their statistics.

    Each data row is tested with ``get_fishers``. Rows whose p-value is at
    or above ``alpha`` are dropped; the remaining rows get the p-value
    (and, when ``obs_exp`` is True, the observed/expected ratio) appended.
    The header row is extended to match.

    Args:
        melted_tdm: list of rows whose first row is the header. It is
            modified in place.
        wcd: ``{corpus: total count}``, passed to ``get_fishers``.
        word_rates: ``{token: overall rate}``, passed to ``get_fishers``.
        obs_exp: whether to add the 'Obs/Exp' column.
        alpha: significance threshold.
        alternative: passed to ``get_fishers``.

    Returns:
        The same ``melted_tdm`` list, filtered and extended.
    """
    new_columns = ['p_value','Obs/Exp'] if obs_exp == True else ['p_value']
    melted_tdm[0].extend(new_columns)
    significant = []
    for row in melted_tdm[1:]:
        result = get_fishers(row,wcd,word_rates,obs_exp=obs_exp,alternative=alternative)
        # get_fishers returns (p, ratio) only when obs_exp is True and a bare
        # p-value otherwise; indexing the bare value used to raise TypeError.
        stats = list(result) if obs_exp == True else [result]
        if stats[0] >= alpha:
            continue
        row.extend(stats)
        significant.append(row)
    # Slice assignment keeps the caller's list object while replacing the
    # data rows in one pass instead of one list.remove() per dropped row.
    melted_tdm[1:] = significant
    return melted_tdm

In [35]:
# Turns our list of lists into a spreadsheet, located wherever you put it
# tab separation

def lol_to_file(lol,output_filename,separator="\t"):
    """Write a list of rows to a delimited text file.

    Every cell is converted with ``str``, the cells of a row are joined
    with ``separator`` and each row is written on its own line. The first
    cell of every row is collected and printed once writing is done,
    followed by a confirmation message.

    Args:
        lol: list of rows (each a non-empty list).
        output_filename: path of the file to create or overwrite.
        separator: string placed between the cells of a row.
    """
    pos_list = []

    # UTF-8 is stated explicitly so the result does not depend on the
    # platform default; sheet2lol reads files as UTF-8 as well.
    with open(output_filename,'w',encoding="utf-8") as output_file:
        for row in lol:
            cells = [str(i) for i in row]
            # Honour the separator argument; it used to be ignored in favour
            # of a hard-coded tab.
            output_file.write(separator.join(cells) + "\n")
            pos_list.append(cells[0])
    print(pos_list)
    print ("Wrote the file " + output_filename)

In [36]:
def poslist_to_wordlist(pos_list):
    """Print every row of every ``.tsv`` file in the ``pos_tag_file`` directory.

    NOTE(review): this looks like an unfinished helper. ``pos_list`` is not
    used yet, and rows are only printed, not filtered or returned.

    Args:
        pos_list: currently unused.
    """
    files = dir2files(pos_tag_file, path = True, extension = '.tsv')
    print(files)
    for file in files:
        # newline='' is what the csv module asks for when opening files.
        with open(file, newline='') as f:
            # DictReader takes the first line of each file as the field names.
            reader = csv.DictReader(f, dialect='excel-tab')
            # Loop over the parsed rows; `row` used to be referenced without
            # ever being bound, which raised NameError.
            for row in reader:
                print(row)
            

In [37]:
# Otherwise, you have to write something

def get_mdpos(metadata_table_fn,metadata_column_num,filename_col_num,source_directory,
              output_filename,metadata_table_separator=",",keep_uppercase=False,minimum_observations=0,
              fishers_alternative="greater",alpha=.05,output_filename_separator="\t"):
    """Run the whole pipeline: count tags per sub-corpus, test them, write the result.

    Steps, in order: read the metadata table, count tags per sub-corpus,
    total those counts, count tags over every ``.tsv`` file in the source
    directory and turn them into rates, build and melt the term-document
    matrix, keep the rows that pass Fisher's exact test, and write them out.

    Args:
        metadata_table_fn: path of the metadata table.
        metadata_column_num: index of the column that assigns each file to
            a sub-corpus.
        filename_col_num: index of the column holding the file names.
        source_directory: directory containing the tagged files.
        output_filename: path of the results file to write.
        metadata_table_separator: cell separator of the metadata table.
        keep_uppercase: accepted but not used by this function.
        minimum_observations: minimum total count for a tag to be kept.
        fishers_alternative: alternative hypothesis for the test.
        alpha: significance threshold.
        output_filename_separator: cell separator for the results file.

    Returns:
        None. The results are written to ``output_filename``.
    """
    print ("Reading metadata table...")
    metatable = sheet2lol(metadata_table_fn,separator=metadata_table_separator)

    print ("Making a wordcount dictionary for each subcorpus...")
    corp_dict = makecorpusdicts(source_directory,metatable,meta_col_num = metadata_column_num,
                                fn_col_num = filename_col_num)

    print ("Getting total word counts for your subcorpora...")
    wcd = get_pos_dict(corp_dict)

    print ("Getting word counts for the full corpus...")
    counts = dir2counts(source_directory)

    print ("Coverting those counts to rates...")
    rates = counts2rates(counts)

    print ("Making a tdm out of the individual corpus counts...")
    tdm = dicts2tdm(corp_dict,min_obs=minimum_observations)

    print ("Melting the tdm...")
    melted_tdm = tdm_melter(tdm)

    print ("Getting mdw data...")
    mdw = add_mdw(melted_tdm,wcd,rates,alpha=alpha,alternative=fishers_alternative)

    print ("Writing data to file...")
    # Forward the requested separator; it used to be accepted here and then dropped.
    lol_to_file(mdw,output_filename,separator=output_filename_separator)

In [38]:
# Run the pipeline: column 0 of the metadata table names each file, column 1
# assigns it to a sub-corpus, and tags with fewer than 10 observations in
# total are left out of the matrix.
get_mdpos(
    metadata,
    metadata_column_num=1,
    filename_col_num=0,
    source_directory=sdir,
    output_filename=ofn,
    minimum_observations=10,
)

Reading metadata table...
Making a wordcount dictionary for each subcorpus...
Getting total word counts for your subcorpora...
Getting word counts for the full corpus...
Coverting those counts to rates...
Making a tdm out of the individual corpus counts...
Melting the tdm...
Getting mdw data...
Writing data to file...
['token_', 'RB', 'IN', 'DT', 'NN', 'VBD', ',', 'PRP', 'VB', 'JJ', 'HYPH', 'NNP', '.', 'PRP$', 'NNS', 'CC', 'VBG', 'VBN', 'VBZ', 'TO', 'EX', '``', 'UH', "''", 'VBP', 'POS', 'XX', 'JJS']
Wrote the file /Users/eunjilee/Desktop/cesta/short_stories_project/outputs/pos_tagged_mdpos.tsv
