In [1]:
import pandas as pd
import os
import csv
import time
import sys
from collections import Counter
import re

pd.options.mode.chained_assignment = None


In [2]:
# Verbs of interest: rows whose first column is not in this list are dropped
# when the loaded data is filtered with `isin(target_verbs)`.
target_verbs = ["strike", "whack", "hit", "rub", "poke", "bop", "smack", "clean", "tease", "feed", "scuff", "pinch", "knock", "pat", "locate", "feel", "spot", "point", "pet", "look", "squeeze", "pick", "cuddle", "find", "hug", "select", "choose"]

In [3]:

def row2dict(date_row):
    """Turn one row of "key,count" strings into a ``{key: count}`` dict.

    Missing cells are dropped first. Each remaining cell is split on commas:
    the first piece becomes the key (kept as a string) and the second piece
    is converted to ``int``. If a key occurs twice, the later cell wins.
    """
    pieces = (cell.split(",") for cell in date_row.dropna().values)
    return {fields[0]: int(fields[1]) for fields in pieces}


def sum_dicts(set_of_dicts):
    """Add up a collection of ``{key: count}`` dicts key by key.

    Parameters
    ----------
    set_of_dicts : iterable of dict
        Mappings from a key to a numeric count. Any iterable is accepted
        (list, NumPy object array, generator, ...); the inputs are not
        modified.

    Returns
    -------
    list of tuple
        ``(key, total)`` pairs sorted by key. Keys whose total is not
        positive are left out, regardless of how many dicts were passed.
        An empty input yields ``[]``.
    """
    totals = Counter()
    for counts in set_of_dicts:
        # In-place accumulation: one pass over the data instead of building
        # a brand-new Counter for every addition.
        totals.update(counts)
    return sorted((key, total) for key, total in totals.items() if total > 0)

def check_bias_alt(arg_structure):
    """Classify how the preposition "with" relates to the root of a parse.

    Parameters
    ----------
    arg_structure : str
        Whitespace-separated ``word/POS/dep/head`` tokens. ``head`` is the
        1-based position of the token's head; ``0`` marks the root.

    Returns
    -------
    str
        ``"no_with"``    -- no "with" token occurs after the root (this also
                            covers input in which no root can be found);
        ``"neither"``    -- "with" is the token immediately after the root;
        ``"instrument"`` -- "with" occurs later and its head is the root;
        ``"modifier"``   -- "with" occurs later and its head is another token.

    When several "with" tokens (or several head-0 tokens) are present, the
    last one of each is used.
    """
    root_index = -1  # 1-based position of the root token
    with_index = -1  # 1-based position of the last "with" token
    with_head = -1   # position of the token that this "with" attaches to

    for position, token in enumerate(arg_structure.split(), start=1):
        # Split from the right so that a "/" inside the word itself cannot
        # shift the POS/dep/head fields.
        fields = token.rsplit("/", 3)
        if len(fields) != 4 or not fields[3].isdecimal():
            # Malformed token: it still occupies a position, but offers
            # nothing usable.
            continue
        word, head = fields[0], fields[3]
        if word == "with":
            with_index = position
            with_head = int(head)
        if head == "0":
            root_index = position

    # Whole tokens and positions are compared (no substring/regex matching),
    # so words such as "white" or "forthwith" cannot be mistaken for the
    # root verb or for "with".
    if root_index == -1 or with_index <= root_index:
        return "no_with"
    if with_index == root_index + 1:
        return "neither"
    if with_head == root_index:
        return "instrument"
    return "modifier"

In [8]:

# Raise the csv module's per-field size limit to sys.maxsize before reading.
csv.field_size_limit(sys.maxsize)
col_names = ["verb", "arg_structure", "count"]
result = pd.DataFrame()

# Location of the input file.
syntgram_dir = "../data/filtered_triarcs"
filename = "triarcs.demo.tsv"
raw_data = []

# Read every line of the tab-separated file into a list of string fields.
# quotechar=None is passed so that no character is treated as a quote.
with open(f"{syntgram_dir}/{filename}") as f:
    reader = csv.reader(f, delimiter="\t", quotechar=None, doublequote=True)
    raw_data = list(reader)

# One DataFrame row per input line; columns get default integer labels.
df = pd.DataFrame(data=raw_data)



In [9]:
# Keep only the rows whose first column (the verb) appears in target_verbs.
# The original row labels are kept, so the index may have gaps afterwards.
df = df[df[0].isin(target_verbs)]
print(df)

     0                                                  1   2        3   \
0   hit  improved/VBN/amod/2 grab/NNP/advcl/3 hit/VB/RO...  39   2002,8   
1   hit  in/IN/prep/2 hit/VBD/ROOT/0 him/PRP/dobj/2 wit...  13   1995,2   
2   hit  in/IN/prep/2 hit/VBD/ROOT/0 on/IN/prep/2 nose/...  13   1995,2   
3   hit  in/IN/prep/2 hit/VBD/ROOT/0 with/IN/prep/2 han...  13   1995,2   
4   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 six...  13   1968,5   
5   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 wit...  13   1968,5   
6   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 eco...  11  2007,11   
7   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 wit...  11  2007,11   
8   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...  12   1979,4   
9   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...  12   1979,4   
10  hit  in/IN/prep/3 adulation/NN/pobj/1 believes/VBZ/...  10   1960,6   
11  hit  in/IN/prep/3 asking/VBG/pcomp/1 hit/VBD/ROOT/0...  26   1871,1   
12  hit  in/IN/prep/3 bal

In [10]:

# Every column after the first three holds the per-row "year,count" cells.
# Take a copy so that adding a column below never writes into a slice of df.
dates = df.loc[:, ~df.columns.isin([0, 1, 2])].copy()

# First remaining row, selected by position: row label 0 may have been
# removed by the verb filter, in which case a label lookup raises KeyError.
date_row = dates.iloc[0] if len(dates) else None
# Collapse each row's "year,count" cells into a single dict.
dates["counts"] = dates.apply(row2dict, axis=1)

In [11]:
# Keep the three leading columns and give them descriptive names.
# NOTE: this reassigns `df`, so the cell cannot be re-run without re-running
# the cells that build `df` (columns 0, 1, 2 no longer exist afterwards).
df = df[[0,1,2]]
df.columns = ["verb", "arg_structure", "count"]
# Fields come out of csv.reader as strings, so cast the count to int.
df["count"] = df["count"].astype(int)
# Attach the per-row dicts stored in dates["counts"].
df["year_count"] = dates["counts"]

print(df)

   verb                                      arg_structure  count  \
0   hit  improved/VBN/amod/2 grab/NNP/advcl/3 hit/VB/RO...     39   
1   hit  in/IN/prep/2 hit/VBD/ROOT/0 him/PRP/dobj/2 wit...     13   
2   hit  in/IN/prep/2 hit/VBD/ROOT/0 on/IN/prep/2 nose/...     13   
3   hit  in/IN/prep/2 hit/VBD/ROOT/0 with/IN/prep/2 han...     13   
4   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 six...     13   
5   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 wit...     13   
6   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 eco...     11   
7   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 wit...     11   
8   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...     12   
9   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...     12   
10  hit  in/IN/prep/3 adulation/NN/pobj/1 believes/VBZ/...     10   
11  hit  in/IN/prep/3 asking/VBG/pcomp/1 hit/VBD/ROOT/0...     26   
12  hit  in/IN/prep/3 ball/NN/pobj/1 hit/VBN/ROOT/0 wit...     12   
13  hit  in/IN/prep/3 beginning/NN

In [12]:
# Label every parse with the value returned by check_bias_alt:
# "no_with", "neither", "instrument" or "modifier".
df["bias"] = df["arg_structure"].apply(check_bias_alt)

In [13]:
# Inspect the frame after the "bias" column has been added.
print(df)

   verb                                      arg_structure  count  \
0   hit  improved/VBN/amod/2 grab/NNP/advcl/3 hit/VB/RO...     39   
1   hit  in/IN/prep/2 hit/VBD/ROOT/0 him/PRP/dobj/2 wit...     13   
2   hit  in/IN/prep/2 hit/VBD/ROOT/0 on/IN/prep/2 nose/...     13   
3   hit  in/IN/prep/2 hit/VBD/ROOT/0 with/IN/prep/2 han...     13   
4   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 six...     13   
5   hit  in/IN/prep/3 1927/CD/pobj/1 hit/VBD/ROOT/0 wit...     13   
6   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 eco...     11   
7   hit  in/IN/prep/3 1974/CD/pobj/1 hit/VBD/ROOT/0 wit...     11   
8   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...     12   
9   hit  in/IN/prep/3 addition/NN/pobj/1 hit/VBN/ROOT/0...     12   
10  hit  in/IN/prep/3 adulation/NN/pobj/1 believes/VBZ/...     10   
11  hit  in/IN/prep/3 asking/VBG/pcomp/1 hit/VBD/ROOT/0...     26   
12  hit  in/IN/prep/3 ball/NN/pobj/1 hit/VBN/ROOT/0 wit...     12   
13  hit  in/IN/prep/3 beginning/NN

In [12]:
# NOTE(review): this cell defines check_bias_alt a second time; once it runs,
# it replaces the definition from the earlier cell. Keep a single definition.
def check_bias_alt(arg_structure):
    """Classify how the preposition "with" relates to the root of a parse.

    Parameters
    ----------
    arg_structure : str
        Whitespace-separated ``word/POS/dep/head`` tokens. ``head`` is the
        1-based position of the token's head; ``0`` marks the root.

    Returns
    -------
    str
        ``"no_with"``    -- no "with" token occurs after the root (this also
                            covers input in which no root can be found);
        ``"neither"``    -- "with" is the token immediately after the root;
        ``"instrument"`` -- "with" occurs later and its head is the root;
        ``"modifier"``   -- "with" occurs later and its head is another token.

    When several "with" tokens (or several head-0 tokens) are present, the
    last one of each is used.
    """
    root_index = -1        # 1-based position of the root token
    connection_index = -1  # 1-based position of the last "with" token
    connection = -1        # position of the token that this "with" attaches to

    for position, token in enumerate(arg_structure.split(), start=1):
        # Split from the right so that a "/" inside the word itself cannot
        # shift the POS/dep/head fields.
        fields = token.rsplit("/", 3)
        if len(fields) != 4 or not fields[3].isdecimal():
            # Malformed token: it still occupies a position, but offers
            # nothing usable.
            continue
        word, head = fields[0], fields[3]
        if word == "with":
            connection_index = position
            connection = int(head)
        if head == "0":
            root_index = position

    # Whole tokens and positions are compared (no substring/regex matching),
    # so words such as "white" or "forthwith" cannot be mistaken for the
    # root verb or for "with".
    if root_index == -1 or connection_index <= root_index:
        return "no_with"
    if connection_index == root_index + 1:
        return "neither"
    if connection == root_index:
        return "instrument"
    return "modifier"

In [13]:
# check_bias_alt(instrument)

In [14]:
# check_bias_alt(sample[1][0])

In [15]:
# NOTE(review): `df_subset` is not defined in any cell visible in this
# notebook, so this line fails on a fresh kernel (Restart & Run All).
# Presumably it is a subset of `df` with the same columns -- TODO confirm.
df_subset["bias"] = df_subset["arg_structure"].apply(check_bias_alt)

In [16]:
# df_subset["modifier"] = df_subset.bias.str.contains("modifier")
# df_subset["instrument"] = df_subset.bias.str.contains("instrument")
# df_subset["neither"] = df_subset.bias.str.contains("neither")
# df_subset["no_with"] = df_subset.bias.str.contains("no_with")
# df_subset["check"] = df_subset["modifier"] + df_subset["instrument"] + df_subset["neither"] + df_subset["no_with"]

In [17]:
# Inspect df_subset after the "bias" column has been added.
print(df_subset)

        verb                                      arg_structure  count  \
0       find  '/CC/cc/2 find/VB/ROOT/0 sent/VBD/parataxis/2 ...     11   
1       find  '/CC/cc/3 but/CC/cc/3 find/VB/ROOT/0 within/IN...     11   
2       find  '/CC/cc/3 you/PRP/nsubj/3 find/VB/ROOT/0 withi...     11   
3       find  '/CC/dep/2 find/VB/dep/0 's/VBZ/ccomp/2 withou...     11   
4       find  '/IN/prep/3 may/NNP/pobj/1 find/VBP/ccomp/0 fa...     10   
...      ...                                                ...    ...   
159360  find  youth/NN/nsubj/4 like/IN/prep/1 hurry/NNP/pobj...     38   
159361  find  youth/NN/nsubj/5 with/IN/prep/1 fair/NN/pobj/2...     10   
159362  find  youths/NNS/nsubj/2 find/VBP/ROOT/0 identify/VB...     10   
159363  find  zemindar/NN/nsubj/2 find/VB/advcl/0 grain/NN/d...     14   
159364  find  zemindar/NN/nsubj/2 find/VB/advcl/0 within/IN/...     13   

                                               year_count        bias  \
0       {'1887': 1, '1962': 1, '1980':

In [18]:
# df_subset.to_csv("../data_output/test.csv", index = False)

In [19]:
# a = "'/CC/cc/2 find/VB/ROOT/0 sent/VBD/parataxis/2 with/IN/prep/3 load/NN/pobj/4"

# print(re.search("(?=find)(?=.*with/)", a) == None)

In [20]:
# Check if structure is [verb, "with"]

# Check that "with" is not directly after verb

# Check attachment of "with"

In [21]:
# df_type = df_subset[["verb","modifier","instrument","neither","no_with"]]
# for col in ["modifier","instrument","neither","no_with"]:
#     df_type[col] = df_type[col].replace({True: 1, False: 0})
# df_type = df_type.set_index("verb")

In [22]:
# df_type = df_type.idxmax(axis=1)
# df_subset["alternation"] = df_type.values

In [25]:
# Build one summary row per (verb, bias) pair: the summed count and the
# per-key totals produced by sum_dicts. Rows are collected in a list and
# turned into a DataFrame once, which avoids re-copying the whole frame on
# every iteration and gives `result` a unique 0..n-1 index.
summary_rows = []
bias_labels = pd.unique(df_subset.bias)  # same for every verb, so computed once

for verb in pd.unique(df_subset.verb):
    for bias in bias_labels:
        print(verb, bias)
        temp = df_subset[(df_subset["verb"] == verb) & (df_subset["bias"] == bias)]
        if len(temp) != 0:
            print(temp)
            total_count = temp["count"].sum()
            year_counts = sum_dicts(temp["year_count"].values)
        else:
            # This verb never occurs with this label: zero total, and -1
            # stands in for "no per-year data".
            total_count = 0
            year_counts = -1

        summary_rows.append(
            {"verb": verb, "bias": bias, "total_count": total_count, "year_count": year_counts}
        )
        print()

result = pd.DataFrame(summary_rows, columns=["verb", "bias", "total_count", "year_count"])


In [26]:
# Show the per-(verb, bias) summary table.
print(result)

   verb        bias  total_count  \
0  find    modifier      1035127   
0  find     no_with      1144786   
0  find  instrument       391286   
0  find     neither       403208   

                                          year_count  
0  [(1611, 2), (1644, 1), (1645, 1), (1651, 3), (...  
0  [(1651, 36), (1660, 26), (1663, 16), (1666, 25...  
0  [(1586, 4), (1632, 1), (1648, 2), (1663, 6), (...  
0  [(1586, 7), (1663, 16), (1666, 13), (1711, 1),...  
