In [33]:
import pandas as pd
import ast
from tqdm import tqdm
from nltk.tree import *
import spacy

In [34]:
# Load the pre-processed training questions. The "Unnamed: 0" column is
# presumably a serialized index left over from an earlier export; it is dropped.
raw_questions_csv = "../01_data/preprocessedData/random_train_question.csv"
df = pd.read_csv(raw_questions_csv).drop(columns=["Unnamed: 0"])

In [35]:
# The collection-valued columns are stored in the CSV as Python literal
# strings; turn them back into Python objects.
df["correct_answer_text"] = df["correct_answer_text"].map(ast.literal_eval)

# NOTE(review): the two token columns are collapsed to sets as well, so token
# order and duplicate tokens are lost for everything derived from them later.
for set_column in (
    "correct_answer_char_index",
    "correct_answer_token_index",
    "paragraph_tokens",
    "question_token",
):
    df[set_column] = df[set_column].map(lambda literal: set(ast.literal_eval(literal)))

## Discard questions without answers

In [36]:
# Keep only questions that have at least one answer text AND at least one
# answer token span.
# "correct_answer_text" holds lists, and a list never compares equal to
# `set()`, so the former `!= set()` test on that column was always True and
# filtered nothing. Checking the length works for lists and sets alike.
has_answer_text = df["correct_answer_text"].map(len) > 0
has_answer_span = df["correct_answer_token_index"].map(len) > 0
df = df.loc[has_answer_text & has_answer_span]
len(df)

53827

## WHP-Type & Token

In [37]:
wh_labels = []
wh_phrases = []

for parse_str in tqdm(df["question_parse_tree"]):
    # Take the first subtree (in traversal order) whose label starts with "WH".
    wh_nodes = Tree.fromstring(parse_str).subtrees(
        filter=lambda node: node.label().startswith("WH")
    )
    first_wh = next(iter(wh_nodes), None)
    if first_wh is None:
        # No WH phrase in this question: store the placeholder string "None".
        wh_labels.append("None")
        wh_phrases.append("None")
        continue
    wh_labels.append(first_wh.label())
    wh_phrases.append(" ".join(first_wh.leaves()).lower())

df["WHP_type"] = wh_labels
df["WHP_token"] = wh_phrases

# Collapse rare WH phrases: a phrase must occur more than `threshold` times to
# keep its own category, everything else becomes "other".
threshold = 100
whp_token_counter_df = df["WHP_token"].value_counts()
is_frequent = df["WHP_token"].map(whp_token_counter_df) > threshold
df["WHP_token_reduced"] = df["WHP_token"].where(is_frequent, "other")

100%|██████████| 53827/53827 [00:06<00:00, 8878.73it/s] 


## question len

In [38]:
# Number of entries in each question's token collection.
# NOTE(review): "question_token" was converted to a set when the CSV columns
# were parsed, so this counts distinct tokens, not the raw question length.
question_len = [len(tokens) for tokens in tqdm(df["question_token"])]

df["question_len"] = question_len

100%|██████████| 53827/53827 [00:00<00:00, 1186335.70it/s]


## New Word ratio
Ratio of question tokens that are not stop words and do not appear in the paragraph.

In [48]:
from nltk.corpus import stopwords
from unidecode import unidecode

# Tokens counted as "stop words": NLTK's English list plus a few
# punctuation / whitespace tokens.
sw = set(stopwords.words("english")).union({"?", " ", ","})

# Absolute counts per question.
old_token_questions = []
new_token_questions = []
stopwords_questions = []

# The same counts divided by the number of question tokens.
old_token_ratios = []
new_token_ratios = []
stopwords_ratios = []


def _ratio(part, whole):
    """Return part / whole, or 0.0 when `whole` is 0 (question without tokens)."""
    return part / whole if whole else 0.0


# `total` lets tqdm draw a real progress bar; zip() alone has no length.
for paragraph_token, question_token in tqdm(
    zip(df["paragraph_tokens"], df["question_token"]), total=len(df)
):
    # Normalise both sides the same way: lower-case, then ASCII-fold.
    question_token = [unidecode(token.lower()) for token in question_token]
    # A set keeps the membership tests below O(1) instead of scanning a list
    # once per question token.
    paragraph_token = {unidecode(token.lower()) for token in paragraph_token}

    stopwords_question = [token for token in question_token if token in sw]
    content_tokens = [token for token in question_token if token not in sw]
    # "old" = non-stop-word tokens that also occur in the paragraph,
    # "new" = non-stop-word tokens that do not.
    old_token_question = [token for token in content_tokens if token in paragraph_token]
    new_token_question = [token for token in content_tokens if token not in paragraph_token]

    n_question_tokens = len(question_token)
    old_token_ratios.append(_ratio(len(old_token_question), n_question_tokens))
    new_token_ratios.append(_ratio(len(new_token_question), n_question_tokens))
    stopwords_ratios.append(_ratio(len(stopwords_question), n_question_tokens))

    old_token_questions.append(len(old_token_question))
    new_token_questions.append(len(new_token_question))
    stopwords_questions.append(len(stopwords_question))

df["old_token_questions"] = old_token_questions
df["new_token_questions"] = new_token_questions
df["stopwords_questions"] = stopwords_questions

df["old_token_ratios"] = old_token_ratios
df["new_token_ratios"] = new_token_ratios
df["stopwords_ratios"] = stopwords_ratios

53827it [00:05, 9778.72it/s] 


## merge_answer_index

In [None]:
import random

# Dedicated, seeded generator so the randomly chosen span is reproducible
# across kernel restarts.
answer_rng = random.Random(42)

merged_answer_indicies = []
for answer_spans in df["correct_answer_token_index"]:
    # TODO: there should be a proper merge of the answer spans here; for now a
    # single span is picked at random.
    # `random.sample` no longer accepts a set (TypeError since Python 3.11), so
    # draw from a sorted list instead; sorting also fixes the candidate order.
    # Assumes the spans are mutually comparable (e.g. tuples of ints) - TODO confirm.
    merged_answer_indicies.append({answer_rng.choice(sorted(answer_spans))})

df["merged_answer_token_index"] = merged_answer_indicies

## answer len feature

In [None]:
# spaCy pipeline; it is also used by the answer-type feature cell further down.
nlp = spacy.load("en_core_web_sm")

min_answer_len = []
max_answer_len = []

for _, record in tqdm(df.iterrows()):
    # Length of every answer span, computed as end - start.
    span_lengths = [stop - begin for begin, stop in record["correct_answer_token_index"]]
    min_answer_len.append(min(span_lengths))
    max_answer_len.append(max(span_lengths))

df["min_answer_len"] = min_answer_len
df["max_answer_len"] = max_answer_len

## answer types

In [10]:
def token_ngram_counter(gram_spans):
    """Return, for every n-gram span, a tuple of its lower-cased token texts."""
    lowered_grams = []
    for span in gram_spans:
        lowered_grams.append(tuple(tok.text.lower() for tok in span))
    return lowered_grams

def first_token_ngram_counter(gram_spans):
    """Return a one-element list holding the lower-cased token texts of the first span.

    Raises IndexError when `gram_spans` is empty.
    """
    leading_span = gram_spans[0]
    lowered = tuple(tok.text.lower() for tok in leading_span)
    return [lowered]

def pos_ngram_counter(gram_spans):
    """Return, for every n-gram span, a tuple of its tokens' `pos_` values."""
    pos_grams = []
    for span in gram_spans:
        pos_grams.append(tuple(tok.pos_ for tok in span))
    return pos_grams

def first_pos_ngram_counter(gram_spans):
    """Return a one-element list holding the `pos_` values of the first span.

    Raises IndexError when `gram_spans` is empty.
    """
    leading_span = gram_spans[0]
    pos_values = tuple(tok.pos_ for tok in leading_span)
    return [pos_values]

def tag_ngram_counter(gram_spans):
    """Return, for every n-gram span, a tuple of its tokens' `tag_` values."""
    tag_grams = []
    for span in gram_spans:
        tag_grams.append(tuple(tok.tag_ for tok in span))
    return tag_grams

def first_tag_ngram_counter(gram_spans):
    """Return a one-element list holding the `tag_` values of the first span.

    Raises IndexError when `gram_spans` is empty.
    """
    leading_span = gram_spans[0]
    tag_values = tuple(tok.tag_ for tok in leading_span)
    return [tag_values]

def ent_ngram_counter(gram_spans):
    """Return, for every n-gram span, a tuple of its tokens' `ent_type_` values."""
    entity_grams = []
    for span in gram_spans:
        entity_grams.append(tuple(tok.ent_type_ for tok in span))
    return entity_grams

def first_ent_ngram_counter(gram_spans):
    """Return a one-element list holding the `ent_type_` values of the first span.

    Raises IndexError when `gram_spans` is empty.
    """
    leading_span = gram_spans[0]
    entity_values = tuple(tok.ent_type_ for tok in leading_span)
    return [entity_values]

def contains_ner(all_answer_docs):
    """Tell whether any token of the given answer spans belongs to a named entity.

    Args:
        all_answer_docs: iterable of token sequences; every token must expose
            an `ent_iob_` attribute.

    Returns:
        True if at least one token carries the IOB code "B" or "I", otherwise
        False - including when there are no spans or no tokens at all.
    """
    # The collected codes used to be compared against {"O"}; that reported True
    # for empty input and for tokens whose code is "" (no entity tag set).
    # Testing for the entity codes directly avoids both false positives.
    for answer_doc in all_answer_docs:
        for token in answer_doc:
            if token.ent_iob_ in ("B", "I"):
                return True
    return False
    
def contains_ner_type(all_answer_docs):
    """Collect the distinct `ent_type_` values of all tokens in the given answer spans.

    Returns a set; it is empty when there are no tokens.
    """
    return {token.ent_type_ for answer_doc in all_answer_docs for token in answer_doc}

def contains_pos_type(all_answer_docs):
    """Collect the distinct `pos_` values of all tokens in the given answer spans.

    Returns a set; it is empty when there are no tokens.
    """
    return {token.pos_ for answer_doc in all_answer_docs for token in answer_doc}

def contains_tag_type(all_answer_docs):
    """Collect the distinct `tag_` values of all tokens in the given answer spans.

    Returns a set; it is empty when there are no tokens.
    """
    return {token.tag_ for answer_doc in all_answer_docs for token in answer_doc}

In [11]:
from collections import defaultdict
import textacy

In [12]:
# One list per feature name; every row of df appends exactly one value to each.
feature_dict = defaultdict(list)

for _, record in tqdm(df.iterrows()):
    answer_texts = record["correct_answer_text"]
    feature_dict["counter_answer"].append(len(answer_texts))
    # NOTE(review): this key contains a space ("counter_unique_ answer"). It is
    # kept as-is because it later becomes a column name.
    feature_dict["counter_unique_ answer"].append(len(set(answer_texts)))

    # Run the paragraph through spaCy and slice out every answer span by its
    # (start, end) token indices.
    paragraph_context = nlp(record["paragraph_text"])
    all_answer_docs = [
        paragraph_context[begin:stop]
        for begin, stop in record["correct_answer_token_index"]
    ]

    feature_dict["contains_ner"].append(contains_ner(all_answer_docs))
    feature_dict["contains_ner_type"].append(contains_ner_type(all_answer_docs))
    feature_dict["contains_pos_type"].append(contains_pos_type(all_answer_docs))
    feature_dict["contains_tag_type"].append(contains_tag_type(all_answer_docs))

    # Currently disabled: per-answer n-gram features for n in range(1, 3), built
    # from textacy.extract.ngrams(answer_doc, n, filter_stops=False) and stored
    # under "counter_{n}", "first_counter_{n}", "pos_counter_{n}",
    # "first_pos_counter_{n}", "tag_counter_{n}", "first_tag_counter_{n}",
    # "ner_counter_{n}" and "first_ner_counter_{n}" via the *_ngram_counter helpers.

53827it [26:27, 33.92it/s]


In [14]:
# Attach the per-row features column-wise. Both frames get a fresh 0..n-1
# index so that the rows line up by position.
# Only df's old index is kept (as the "index" column). The feature frame's
# index is dropped: resetting it without `drop=True` used to add a second
# column that was also called "index", leaving duplicate column labels.
feature_df = pd.DataFrame(feature_dict).reset_index(drop=True)
df = pd.concat([df.reset_index(), feature_df], axis=1, sort=False)

In [15]:
def _stable_set_repr(values):
    """Return a string form of a set that does not depend on iteration order.

    Sets with fewer than two elements have only one possible `str()` and are
    returned unchanged (e.g. "set()" or "{'ORG'}"). Larger sets are rendered
    in the same "{'A', 'B'}" format, but with the elements sorted.
    """
    if len(values) < 2:
        return str(values)
    return "{" + ", ".join(repr(value) for value in sorted(values)) + "}"


# `str()` of a multi-element set follows the set's iteration order, so two
# equal sets could end up as different strings (and different categories).
df["contains_ner_type_str"] = [_stable_set_repr(item) for item in df["contains_ner_type"]]
ner_counter_df = df["contains_ner_type_str"].value_counts()

# Reduced variant: keep the exact value only when the answer has at most one
# distinct entity type (the empty string counts as a type here); every
# combination of several types is grouped under "Other".
# (A frequency-based reduction using `ner_counter_df` and `threshold` was tried
# here before and is currently not used.)
contains_ner_type_str_reduced = [
    str(item) if len(item) < 2 else "Other" for item in df["contains_ner_type"]
]

df["contains_ner_type_str_reduced"] = contains_ner_type_str_reduced

In [49]:
# Write the feature table to the working directory. The DataFrame index is
# written too (to_csv default), as the first column of the file.
features_csv_path = "random_train_question_features.csv"
df.to_csv(features_csv_path)