In [1]:
import os
import itertools
import time

import pandas as pd
import numpy as np
import datefinder
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger

In [2]:
os.chdir('../..')

In [3]:
from src.analysis.utils import load_squadv1_dev_as_df, load_squadv2_dev_as_df

In [4]:
%%capture
squad_df = load_squadv2_dev_as_df()

2021-07-15 15:50:49,534 - Loading SQuAD v2 data as DataFrame
2021-07-15 15:50:51,132 - Reusing dataset squad_v2 (/Users/stevengeorge/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/ba48bc29b974701e9ba8d80ac94f3e3df924aba41b764dcf9851debea7c672e4)
2021-07-15 15:50:52,631 - (11873, 5)
2021-07-15 15:50:52,632 -                                              answers  \
0  {'answer_start': [159, 159, 159, 159], 'text':...   
1  {'answer_start': [94, 87, 94, 94], 'text': ['1...   
2  {'answer_start': [256, 256, 256, 256], 'text':...   
3  {'answer_start': [308, 308, 308, 308], 'text':...   
4  {'answer_start': [671, 649, 671, 671], 'text':...   

                                             context  \
0  The Normans (Norman: Nourmands; French: Norman...   
1  The Normans (Norman: Nourmands; French: Norman...   
2  The Normans (Norman: Nourmands; French: Norman...   
3  The Normans (Norman: Nourmands; French: Norman...   
4  The Normans (Norman: Nourmands; French: Norman...   

            

In [5]:
print(squad_df.shape)
squad_df.head()

(11873, 5)


Unnamed: 0,answers,context,id,question,title
0,"{'answer_start': [159, 159, 159, 159], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,In what country is Normandy located?,Normans
1,"{'answer_start': [94, 87, 94, 94], 'text': ['1...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,When were the Normans in Normandy?,Normans
2,"{'answer_start': [256, 256, 256, 256], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,From which countries did the Norse originate?,Normans
3,"{'answer_start': [308, 308, 308, 308], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,Who was the Norse leader?,Normans
4,"{'answer_start': [671, 649, 671, 671], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,Normans


In [6]:
squad_df = pd.concat(
    (squad_df, squad_df['answers'].apply(pd.Series)),
    axis=1
)

In [7]:
squad_df.head()

Unnamed: 0,answers,context,id,question,title,answer_start,text
0,"{'answer_start': [159, 159, 159, 159], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,In what country is Normandy located?,Normans,"[159, 159, 159, 159]","[France, France, France, France]"
1,"{'answer_start': [94, 87, 94, 94], 'text': ['1...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,When were the Normans in Normandy?,Normans,"[94, 87, 94, 94]","[10th and 11th centuries, in the 10th and 11th..."
2,"{'answer_start': [256, 256, 256, 256], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,From which countries did the Norse originate?,Normans,"[256, 256, 256, 256]","[Denmark, Iceland and Norway, Denmark, Iceland..."
3,"{'answer_start': [308, 308, 308, 308], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,Who was the Norse leader?,Normans,"[308, 308, 308, 308]","[Rollo, Rollo, Rollo, Rollo]"
4,"{'answer_start': [671, 649, 671, 671], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,Normans,"[671, 649, 671, 671]","[10th century, the first half of the 10th cent..."


In [8]:
squad_df['unanswerable'] = np.where(
    squad_df['answer_start'].apply(len) == 0,
    1,
    0
)

In [9]:
squad_df['unanswerable'].sum()

5945

In [11]:
squad_df = squad_df[squad_df['unanswerable'] == 0]

In [12]:
def get_majority(answer_list):
    """Extract the majority vote answer or first answer in case of no majority for the dev data for consistency"""
    dict_answer_counts = {}
    for i, ans in enumerate(answer_list):
        if ans['text'] in dict_answer_counts:
            dict_answer_counts[ans['text']]['count'] += 1
        else:
            dict_answer_counts[ans['text']] = {
                'id': i,  # return the first occurring answer or first answer seen
                'count': 1
            }

    # Extract counts and indices
    count_indices = [(ans['count'], ans['id']) for ans in dict_answer_counts.values()]
    # Sort, first by index ascending, then by count descending
    count_indices = sorted(sorted(count_indices, key=lambda x: x[1]), key=lambda x: x[0], reverse=True)

    # Check that we have as many counts as expects
    assert len(answer_list) == sum(x[0] for x in count_indices)

    # Return the most common answer by index in the list
    return [answer_list[count_indices[0][1]]]


In [13]:
squad_df['text_dict_format'] = squad_df['text'].apply(lambda x: [{'text': text} for text in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
squad_df.head()

Unnamed: 0,answers,context,id,question,title,answer_start,text,unanswerable,text_dict_format
0,"{'answer_start': [159, 159, 159, 159], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,In what country is Normandy located?,Normans,"[159, 159, 159, 159]","[France, France, France, France]",0,"[{'text': 'France'}, {'text': 'France'}, {'tex..."
1,"{'answer_start': [94, 87, 94, 94], 'text': ['1...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,When were the Normans in Normandy?,Normans,"[94, 87, 94, 94]","[10th and 11th centuries, in the 10th and 11th...",0,"[{'text': '10th and 11th centuries'}, {'text':..."
2,"{'answer_start': [256, 256, 256, 256], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,From which countries did the Norse originate?,Normans,"[256, 256, 256, 256]","[Denmark, Iceland and Norway, Denmark, Iceland...",0,"[{'text': 'Denmark, Iceland and Norway'}, {'te..."
3,"{'answer_start': [308, 308, 308, 308], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,Who was the Norse leader?,Normans,"[308, 308, 308, 308]","[Rollo, Rollo, Rollo, Rollo]",0,"[{'text': 'Rollo'}, {'text': 'Rollo'}, {'text'..."
4,"{'answer_start': [671, 649, 671, 671], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,Normans,"[671, 649, 671, 671]","[10th century, the first half of the 10th cent...",0,"[{'text': '10th century'}, {'text': 'the first..."


In [15]:
squad_df['majority_vote_answer'] = squad_df['text_dict_format'].apply(lambda x: get_majority(x)[0]['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
squad_df.head()

Unnamed: 0,answers,context,id,question,title,answer_start,text,unanswerable,text_dict_format,majority_vote_answer
0,"{'answer_start': [159, 159, 159, 159], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,In what country is Normandy located?,Normans,"[159, 159, 159, 159]","[France, France, France, France]",0,"[{'text': 'France'}, {'text': 'France'}, {'tex...",France
1,"{'answer_start': [94, 87, 94, 94], 'text': ['1...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,When were the Normans in Normandy?,Normans,"[94, 87, 94, 94]","[10th and 11th centuries, in the 10th and 11th...",0,"[{'text': '10th and 11th centuries'}, {'text':...",10th and 11th centuries
2,"{'answer_start': [256, 256, 256, 256], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,From which countries did the Norse originate?,Normans,"[256, 256, 256, 256]","[Denmark, Iceland and Norway, Denmark, Iceland...",0,"[{'text': 'Denmark, Iceland and Norway'}, {'te...","Denmark, Iceland and Norway"
3,"{'answer_start': [308, 308, 308, 308], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,Who was the Norse leader?,Normans,"[308, 308, 308, 308]","[Rollo, Rollo, Rollo, Rollo]",0,"[{'text': 'Rollo'}, {'text': 'Rollo'}, {'text'...",Rollo
4,"{'answer_start': [671, 649, 671, 671], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,Normans,"[671, 649, 671, 671]","[10th century, the first half of the 10th cent...",0,"[{'text': '10th century'}, {'text': 'the first...",10th


In [36]:
get_majority([{'text': 'Italy'}, {'text': 'France'}, {'text': 'France'}, {'text': 'Spain'}, {'text': 'Spain'}])

[{'text': 'France'}]

In [22]:
# [{'text': country} for country in ['France', 'Italy', 'Spain', 'Spain']]

[{'text': 'France'}, {'text': 'Italy'}, {'text': 'Spain'}, {'text': 'Spain'}]

In [16]:
# dict(zip(['text'], ['France', 'France', 'France', 'France']))

{'text': 'France'}

In [17]:
# ['text']*4

['text', 'text', 'text', 'text']

In [26]:
# squad_df['PROCESS_answer'] = ""
# for idx in squad_df.index:
#     if squad_df.loc[idx]['unanswerable'] == 0:
#         squad_df.loc[idx, 'PROCESS_answer'] = squad_df.loc[idx]['text'][0]
#     else:
#         squad_df.loc[idx, 'PROCESS_answer'] = ""

In [17]:
df = squad_df.copy()

In [18]:
NER_tagger = StanfordNERTagger(
#     '/Users/max/_tools/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
#     '/Users/max/_tools/stanford-ner-2017-06-09/stanford-ner.jar', 
    '/Users/stevengeorge/Desktop/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz',
    '/Users/stevengeorge/Desktop/stanford-ner-2020-11-17/stanford-ner.jar',
    encoding='utf-8'
)

In [19]:
POS_tagger = StanfordPOSTagger(
#     '/Users/max/_tools/stanford-postagger-2017-06-09/models/english-left3words-distsim.tagger',
#     '/Users/max/_tools/stanford-postagger-2017-06-09/stanford-postagger.jar',
    '/Users/stevengeorge/Desktop/stanford-postagger-full-2020-11-17/models/english-left3words-distsim.tagger',
    '/Users/stevengeorge/Desktop/stanford-postagger-full-2020-11-17/stanford-postagger.jar',
    encoding='utf-8'
)

In [20]:
df.head()

Unnamed: 0,answers,context,id,question,title,answer_start,text,unanswerable,text_dict_format,majority_vote_answer
0,"{'answer_start': [159, 159, 159, 159], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9628,In what country is Normandy located?,Normans,"[159, 159, 159, 159]","[France, France, France, France]",0,"[{'text': 'France'}, {'text': 'France'}, {'tex...",France
1,"{'answer_start': [94, 87, 94, 94], 'text': ['1...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b9629,When were the Normans in Normandy?,Normans,"[94, 87, 94, 94]","[10th and 11th centuries, in the 10th and 11th...",0,"[{'text': '10th and 11th centuries'}, {'text':...",10th and 11th centuries
2,"{'answer_start': [256, 256, 256, 256], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962a,From which countries did the Norse originate?,Normans,"[256, 256, 256, 256]","[Denmark, Iceland and Norway, Denmark, Iceland...",0,"[{'text': 'Denmark, Iceland and Norway'}, {'te...","Denmark, Iceland and Norway"
3,"{'answer_start': [308, 308, 308, 308], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962b,Who was the Norse leader?,Normans,"[308, 308, 308, 308]","[Rollo, Rollo, Rollo, Rollo]",0,"[{'text': 'Rollo'}, {'text': 'Rollo'}, {'text'...",Rollo
4,"{'answer_start': [671, 649, 671, 671], 'text':...",The Normans (Norman: Nourmands; French: Norman...,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,Normans,"[671, 649, 671, 671]","[10th century, the first half of the 10th cent...",0,"[{'text': '10th century'}, {'text': 'the first...",10th


In [21]:
from tqdm import tqdm

In [31]:
# for i in tqdm(range(squad1_df.shape[0])):
#     if (i == 3124) | (i == 8586):  # Problematic examples
#         continue
#     try:
#         list(datefinder.find_dates(squad1_df.iloc[i]['PROCESS_answer']))
#     except:
#         print(i, squad1_df.iloc[i]['PROCESS_answer'])
#         raise

100%|██████████| 10570/10570 [00:02<00:00, 3822.04it/s]


In [17]:
# df.drop(index=[3124, 8586], inplace=True)

In [12]:
# def is_date(text):
#     # a generator will be returned by the datefinder module.
#     matches = datefinder.find_dates(text)
    
#     try:   
#         matches = list(matches)
#     except:
#         return False

#     if len(matches) > 0:
#         return True
#     return datefinder.find_dates(text)

In [66]:
# for i in df['PROCESS_answer'].apply(is_date)[3124]:
#     print(i)

In [67]:
# df['PROCESS_answer'].apply(is_date)[10524]

In [68]:
# list(datefinder.find_dates(df['PROCESS_answer'].loc[10524]))

In [22]:
def is_numeric(text):
    return any(char.isdigit() for char in str(text))

def is_date(text):
    # a generator will be returned by the datefinder module.
    matches = list(datefinder.find_dates(text))

    if len(matches) > 0:
        return True
    return False

def classify_NER_text(NER_text):
    NER_tag = NER_text[1]
    if  NER_tag == 'PERSON':
        return 'Person'
    elif NER_tag == 'LOCATION':
        return 'Location'
    elif NER_tag == 'ORGANIZATION':
        return 'Organisation'
    else:
        return 'Other Entity'
    
def most_common(lst):
    return max(set(lst), key=lst.count)

def verify_single_verb_group(POS_list):
    # Convert any verb-related tags to VB
    POS_list = ['VB' if 'VB' in x else x for x in POS_list]
    # Eliminate repeating elements (ie verb groups)
    POS_list = [k for k, g in itertools.groupby(POS_list)]
    # Check if the reduced list contains just one verb group
    if POS_list.count('VB') == 1:
        return True
    return False

def get_char_length(sent):
    return len(sent)

def get_word_length(sent):
    return len(sent.strip())

def classify_text(text, POS_text, NER_text):
    text_class = ''
    if is_numeric(text):
        if is_date(text):
            return 'Date'
        else:
            return 'Other Numeric'
    else:
        sent = wordpunct_tokenize(text)
        # POS_text = POS_tagger.tag(sent)
        POS_tags = np.array(POS_text)[:,1]
        
        # Detect Proper Noun Phrases
        if any('NNP' in POS for POS in POS_tags) and not any(tag in POS for tag in ['VB'] for POS in POS_tags):
            # NER classify proper nouns
            # NER_list = NER_tagger.tag([word for word, POS in POS_text if 'NNP' in POS])
            NER_list = [(word, NER) for (word, NER), POS in zip(NER_text, POS_tags) if 'NNP' in POS]
            NER_list = [classify_NER_text(NER_text) for NER_text in NER_list]
            return most_common(NER_list)
        
        # Detect Common Noun Phrases
        elif any('NN' in POS for POS in POS_tags) and not any(tag in POS for tag in ['VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] for POS in POS_tags):
            return 'Common Noun Phrase'
        
        # Detect Clauses (ie one verb group and one noun http://grammar.yourdictionary.com/grammar-rules-and-tips/Grammar-Clause.html)
        # https://www.merriam-webster.com/dictionary/clause
        elif any('NN' in POS for POS in POS_tags) and any(POS in ['VBD', 'VBG', 'VBP', 'VBZ'] for POS in POS_tags) and verify_single_verb_group(POS_tags):
            return 'Clause'
        
        # Detect Verb Phrase
        elif any('VB' in POS for POS in POS_tags):
            return 'Verb Phrase'
        
        # Detect Adjective Phrase
        elif any('JJ' in POS for POS in POS_tags):
            return 'Adjective Phrase'

        # Other
        else:
            return 'Other'


In [70]:
# # Extract the categories
# t0 = time.time()
# answers_tok = df.PROCESS_answer.apply(wordpunct_tokenize).tolist()
# df['ANALYSIS_answer_POS_tags'] = POS_tagger.tag_sents(answers_tok)
# df['ANALYSIS_answer_NER_tags'] = NER_tagger.tag_sents(answers_tok)
# df['ANALYSIS_answer_type'] = df.apply(lambda x: classify_text(x['PROCESS_answer'], x['ANALYSIS_answer_POS_tags'], x['ANALYSIS_answer_NER_tags']), axis=1)
# print("Time taken: {:.1f}s".format(time.time() - t0))
# # Time taken: 4.4s


In [28]:
# Extract the categories
t0 = time.time()
answers_tok = df.majority_vote_answer.apply(wordpunct_tokenize).tolist()
df['ANALYSIS_answer_POS_tags'] = POS_tagger.tag_sents(answers_tok)
df['ANALYSIS_answer_NER_tags'] = NER_tagger.tag_sents(answers_tok)

df['ANALYSIS_answer_type'] = ""
for idx in tqdm(df.index):
    try:
        df.loc[idx, 'ANALYSIS_answer_type'] = classify_text(df.loc[idx]['majority_vote_answer'], df.loc[idx]['ANALYSIS_answer_POS_tags'], df.loc[idx]['ANALYSIS_answer_NER_tags'])
    except Exception as e:
        print(idx, "//", df.loc[idx]['majority_vote_answer'], "//", e)
        df.loc[idx, 'ANALYSIS_answer_type'] = 'Other Numeric'
    
print("Time taken: {:.1f}s".format(time.time() - t0))
# Time taken: 4.4s


 17%|█▋        | 993/5928 [00:00<00:03, 1418.01it/s]

1577 // 1759-60 // unsupported operand type(s) for +: 'int' and 'str'


 75%|███████▌  | 4462/5928 [00:03<00:01, 1433.19it/s]

8479 // Resolution 43/53 // unsupported operand type(s) for +: 'int' and 'str'


100%|██████████| 5928/5928 [00:04<00:00, 1425.67it/s]

Time taken: 9.5s





In [29]:
df.isnull().sum()

answers                     0
context                     0
id                          0
question                    0
title                       0
answer_start                0
text                        0
unanswerable                0
text_dict_format            0
majority_vote_answer        0
ANALYSIS_answer_POS_tags    0
ANALYSIS_answer_NER_tags    0
ANALYSIS_answer_type        0
dtype: int64

In [31]:
df['ANALYSIS_answer_type'].value_counts(dropna=False) / df.shape[0]

Common Noun Phrase    0.331478
Other Numeric         0.135628
Other Entity          0.105263
Organisation          0.076923
Person                0.066296
Verb Phrase           0.061741
Location              0.061404
Adjective Phrase      0.049426
Date                  0.046896
Clause                0.039136
Other                 0.025810
Name: ANALYSIS_answer_type, dtype: float64

In [33]:
max_results = pd.DataFrame(
    [
        ('Common Noun Phrase', 31.769157994323553), 
        ('Other Numeric', 15.676442762535478),
        ('Person', 9.2904446546830659),
        ('Other Entity', 9.0255439924314089),
        ('Organisation', 8.7133396404919594),
        ('Verb Phrase', 5.7237464522232733),
        ('Location', 5.1182592242194884),
        ('Date', 4.049195837275307),
        ('Clause', 4.0113528855250715),
        ('Adjective Phrase', 3.9640491958372754),
        ('Other', 2.6584673604541154)
    ], 
    columns=['Answer type', 'Percentage']
)
max_results

Unnamed: 0,Answer type,Percentage
0,Common Noun Phrase,31.769158
1,Other Numeric,15.676443
2,Person,9.290445
3,Other Entity,9.025544
4,Organisation,8.71334
5,Verb Phrase,5.723746
6,Location,5.118259
7,Date,4.049196
8,Clause,4.011353
9,Adjective Phrase,3.964049


In [34]:
max_results.sum()

Answer type    Common Noun PhraseOther NumericPersonOther Ent...
Percentage                                                 100.0
dtype: object