# Start Stanford CoreNLP server
`java -Xmx16g -cp C:\stanford-corenlp-latest\stanford-corenlp-4.0.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 600 -threads 5 -maxCharLength 100000 -quiet False -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref`

In [1]:
import string
import re
import collections
import csv
import pandas as pd
import numpy as np
import requests
import os
import json
from hyphen import Hyphenator

import dask.dataframe as dd
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import seaborn as sns

%matplotlib inline

import stanfordnlp
from stanfordnlp.server import CoreNLPClient

# Workaround for the error below (remove the assignment if the error does not occur):
# OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
# Directory holding model_db.json and the generated prediction/answer CSV files,
# directory holding the test-set JSON files, and the URL model_db.json is
# downloaded from.
PREDICTION_PATH = './predictions'
TEST_SETS_PATH = './test_sets'
MODEL_EVALS_URL = 'https://squad-model-evals.s3-us-west-2.amazonaws.com/model_db.json'

# Test sets kept by parse_predictions. The commented-out variant additionally
# includes 'dev-v1.1'.
#SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT', 'dev-v1.1']
SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT']

In [3]:
def fetch_eval_file(eval_file_path, model_evals_url, overwrite=False, timeout=60):
    """Download the model-evaluation file unless a local copy already exists.

    Args:
        eval_file_path: Destination path of the local copy (written as UTF-8).
        model_evals_url: URL the file is downloaded from.
        overwrite: When true, download even if ``eval_file_path`` exists.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if os.path.exists(eval_file_path) and not overwrite:
        print('File Exists')
        return

    r = requests.get(model_evals_url, timeout=timeout)
    # Fail before the destination is opened, so an HTTP error page never
    # replaces (or is mistaken for) a valid evaluation file.
    r.raise_for_status()

    with open(eval_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write(r.text)
    
    

def write_output(output_file_path, list_to_write):
    """Write a list of dictionaries to a UTF-8 CSV file with a header row.

    Args:
        output_file_path: Path of the CSV file to create or overwrite.
        list_to_write: Row dictionaries. The keys of the first row define the
            columns and their order. An empty list produces an empty file.
    """
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        if not list_to_write:
            # There is no first row to take the header from; leave the file
            # empty instead of failing with an IndexError.
            return

        csv_writer = csv.DictWriter(csv_file,
                                    fieldnames=list_to_write[0].keys(),
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        csv_writer.writeheader()
        csv_writer.writerows(list_to_write)

def parse_predictions(prediction_file_path, download=False, set_names=None):
    """Flatten the model-evaluation JSON into one record per model, test set and question.

    Args:
        prediction_file_path: Path of the JSON file; it is read as UTF-8.
        download: Unused; kept so existing callers keep working.
        set_names: Names of the test sets to keep. Defaults to the
            module-level ``SET_NAMES``.

    Returns:
        A list of dictionaries with the keys ``display_name``, ``model_name``,
        ``description``, ``uuid``, ``testbed``, ``test_set``, ``qid``,
        ``predicted_answer``, ``exact_match`` and ``f1`` (a float).
    """
    if set_names is None:
        set_names = SET_NAMES

    # fetch_eval_file writes this file as UTF-8; the platform default encoding
    # can differ (e.g. on Windows).
    with open(prediction_file_path, encoding='utf-8') as f:
        predictions = json.load(f)

    pred_list = []

    for r in predictions:
        metadata = r['metadata']
        # Fields shared by every record of this model.
        model_fields = {
            'display_name': r['name'],
            'model_name': metadata['name'],
            'description': metadata['description'],
            'uuid': metadata['uuid'],
            'testbed': r['testbed'],
        }

        for test_set, results in r['predictions'].items():
            # Only the requested test sets, and only entries with a 'bundle' key.
            if test_set not in set_names or 'bundle' not in results:
                continue

            for qid, predicted_answer in results['data'].items():
                score = results['scores'][qid]
                pred_list.append({
                    **model_fields,
                    'test_set': test_set,
                    'qid': qid,
                    'predicted_answer': predicted_answer,
                    'exact_match': score['exact_match'],
                    'f1': float(score['f1']),
                })

    return pred_list

def load_data(input_file_path):
    """Read a UTF-8 CSV file into a list of row dictionaries keyed by the header.

    Every value is returned as a string, as produced by ``csv.DictReader``.
    """
    # newline='' leaves line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back exactly as they were written.
    with open(input_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

def parse_answers(answer_file_path):
    """Flatten every JSON test-set file in a directory into one record per gold answer.

    Each file is expected to hold ``data -> paragraphs -> qas -> answers``.
    Sub-directories of ``answer_file_path`` are skipped.

    Args:
        answer_file_path: Directory containing the test-set JSON files.

    Returns:
        A list of dictionaries with the keys ``test_set``, ``question_id``,
        ``title``, ``context``, ``question_text``, ``answer_text`` and
        ``answer_start``.
    """
    answers_list = []

    for file_name in os.listdir(answer_file_path):
        file_path = os.path.join(answer_file_path, file_name)
        if os.path.isdir(file_path):
            continue

        # The test-set name is the file name up to its first dot.
        test_set = file_name.split('.')[0]

        # Read from the directory that was listed (not from a fixed global
        # path), with an explicit encoding.
        with open(file_path, encoding='utf-8') as fh:
            articles = json.load(fh)['data']

        for article in articles:
            title = article['title']

            for paragraph in article['paragraphs']:
                context = paragraph['context']

                for qa in paragraph['qas']:
                    for answer in qa['answers']:
                        answers_list.append({
                            'test_set': test_set,
                            'question_id': qa['id'],
                            'title': title,
                            'context': context,
                            'question_text': qa['question'],
                            'answer_text': answer['text'],
                            'answer_start': answer['answer_start'],
                        })

    return answers_list

def normalize_answer(s):
    """Return ``s`` lower-cased, stripped of punctuation and of the articles
    a/an/the, with every run of whitespace collapsed to a single space."""
    lowered = s.lower()

    # Delete every character listed in string.punctuation.
    unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))

    # Stand-alone articles become a space; the join below tidies the gaps.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated, flags=re.UNICODE)

    return ' '.join(without_articles.split())

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized form of ``s``;
    an empty or missing ``s`` yields an empty list."""
    return normalize_answer(s).split() if s else []

def compute_exact(question_id, predicted_answer, all_answers):
    """Return 1 if the normalized prediction equals any normalized gold answer, else 0.

    Args:
        question_id: Id of the question being scored.
        predicted_answer: Answer text produced by the model.
        all_answers: Records with ``question_id`` and ``answer_text`` keys.

    Returns:
        1 on an exact match with at least one gold answer, otherwise 0. A
        question without any gold answer in ``all_answers`` scores 0.
    """
    gold_answers = [normalize_answer(x['answer_text']) for x in all_answers if x['question_id'] == question_id]
    if not gold_answers:
        # max() over no candidates would raise ValueError; nothing can match.
        return 0

    # Normalize the prediction once rather than once per gold answer.
    return int(normalize_answer(predicted_answer) in gold_answers)

def compute_f1(question_id, predicted_answer, all_answers):
    """Return the best token-level F1 of the prediction against the gold answers.

    Args:
        question_id: Id of the question being scored.
        predicted_answer: Answer text produced by the model.
        all_answers: Records with ``question_id`` and ``answer_text`` keys.

    Returns:
        The maximum F1 over all gold answers of the question, as a float. A
        question without any gold answer in ``all_answers`` scores 0.0.
    """
    gold_toks = [get_tokens(x['answer_text']) for x in all_answers if x['question_id'] == question_id]
    if not gold_toks:
        # max() over no candidates would raise ValueError.
        return 0.0

    pred_toks = get_tokens(predicted_answer)
    # Built once; it is the same for every gold answer.
    pred_counts = collections.Counter(pred_toks)

    best = 0.0
    for answer_toks in gold_toks:
        if not answer_toks or not pred_toks:
            # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
            f1 = float(answer_toks == pred_toks)
        else:
            num_same = sum((collections.Counter(answer_toks) & pred_counts).values())
            if num_same == 0:
                f1 = 0.0
            else:
                precision = num_same / len(pred_toks)
                recall = num_same / len(answer_toks)
                f1 = (2 * precision * recall) / (precision + recall)

        best = max(best, f1)

    return best

def print_answer(qid, all_answers):
    """Print the test set, context, question and gold answers recorded for ``qid``.

    Nothing is printed when ``all_answers`` holds no record for ``qid``.
    """
    matches = [row for row in all_answers if row['question_id'] == qid]
    if not matches:
        return

    # Test set, context and question are taken from the first matching record.
    first = matches[0]
    print('Test Set:', first['test_set'])
    print('Context:', first['context'])
    print('Question:', first['question_text'])
    print('Answers:', [row['answer_text'] for row in matches])

In [4]:
# Download the model_db.json file that contains all the pre-evaluated and scored questions
# from the previous groups' work, if it doesn't exist yet.

# The download is written into PREDICTION_PATH, so make sure the directory exists.
os.makedirs(PREDICTION_PATH, exist_ok=True)
fetch_eval_file(PREDICTION_PATH + '/model_db.json', MODEL_EVALS_URL, overwrite=False)

File Exists


In [5]:
# Rebuild the flattened prediction and answer tables from the raw JSON files and
# save them as CSV. Comment these lines out once both CSV files exist to skip
# the re-parse.

predictions = parse_predictions(PREDICTION_PATH + '/model_db.json')
answers = parse_answers(TEST_SETS_PATH)

write_output(PREDICTION_PATH + '/all_predictions.csv', predictions)
write_output(PREDICTION_PATH + '/all_answers.csv', answers)

In [6]:
# Load both tables from the CSV files. csv.DictReader yields every value as a
# string, so numeric columns need converting afterwards.
predictions = load_data(PREDICTION_PATH + '/all_predictions.csv')
answers = load_data(PREDICTION_PATH + '/all_answers.csv')   

In [7]:
# Load into Pandas dataframes

df_pred = pd.DataFrame(predictions)
df_answers = pd.DataFrame(answers)

# The CSV loader returns strings; convert the F1 score to float.
df_pred = df_pred.astype({'f1': 'float'})

In [8]:
# Flag answers whose text consists solely of numeric characters, then save the
# answer table. (A bare filter expression that had no effect was dropped.)
df_answers['is_numeric'] = df_answers['answer_text'].map(lambda text: text.isnumeric())
df_answers.to_csv('answers.csv', index=False)

In [9]:
# Tokenizer-only stanfordnlp pipeline, used by get_stanford_counts below;
# use_gpu=True asks for GPU execution.
nlp = stanfordnlp.Pipeline(processors='tokenize', use_gpu=True)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\justi\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [10]:
def get_all_stanford_metrics(txt):
    """Annotate ``txt`` with the CoreNLP server on localhost:9001 and summarize it.

    Returns:
        A 5-tuple ``(subtree_value, ner, sentence_count, word_count,
        character_count)``: the value of the first child of the first
        sentence's parse tree, the entity type of the first mention in the
        first sentence (``'_NO_NER'`` if it has none), the number of
        sentences, the number of tokens kept as words, and the total length
        of those words. If anything fails, all five positions hold the error
        message instead.
    """
    ner = '_NO_NER'

    try:
        with CoreNLPClient(endpoint='http://localhost:9001', start_server=False, timeout=30000) as client:

            ann = client.annotate(txt)

            sentence_count = len(ann.sentence)
            # A token is dropped when its text occurs inside string.punctuation.
            words = [x.word for s in ann.sentence for x in s.token if x.word not in string.punctuation]
            word_count = len(words)
            character_count = sum(len(x) for x in words)

            # Parse value and entity type are taken from the first sentence only.
            sentence = ann.sentence[0]
            if sentence.mentions:
                ner = sentence.mentions[0].entityType

            subtree_value = sentence.parseTree.child[0].value

        return subtree_value, ner, sentence_count, word_count, character_count

    except Exception as e:
        # Best effort: record the failure in every column rather than abort the
        # whole run. Exceptions raised without arguments have no args[0].
        message = e.args[0] if e.args else repr(e)
        return (message,) * 5
    
def get_stanford_counts(txt):
    """Count sentences, words and word characters of ``txt`` with the ``nlp`` pipeline.

    Returns:
        A 4-tuple ``(sentence_count, word_count, character_count, words)``,
        where ``words`` lists the token texts kept as words. If processing
        fails, all four positions hold the error message instead, so the
        result always has the same length.
    """
    try:
        doc = nlp(txt)
        # A token is dropped when its text occurs inside string.punctuation.
        words = [w.text for s in doc.sentences for w in s.words if w.text not in string.punctuation]

        return len(doc.sentences), len(words), sum(len(x) for x in words), words

    except Exception as e:
        # Same arity as the success path; exceptions raised without arguments
        # have no args[0].
        message = e.args[0] if e.args else repr(e)
        return (message,) * 4
    


In [11]:
# Test Server: annotate a two-sentence sample through the CoreNLP server on
# localhost:9001 and report how many sentences it found. Any failure (for
# example, the server not running) is printed instead of raised.
try:
    txt = 'This is a test sentence. So is this.'
    with CoreNLPClient(endpoint='http://localhost:9001', start_server=False, timeout=30000) as client:
        ann = client.annotate(txt)
        print('Server running. Found {} sentences'.format(len(ann.sentence)))
except Exception as e:
    print(e)

Server running. Found 2 sentences


In [12]:
# Write new results, if necessary
# One row per distinct answer text, so each answer is annotated only once.
df_distinct_answers = pd.DataFrame({'answer_text': df_answers['answer_text'].unique()})
# Expand the 5-tuple returned by get_all_stanford_metrics into five columns.
df_distinct_answers[['first_parse', 'first_ner', 'sentence_count', 'word_count', 'word_character_count', ]] = df_distinct_answers.apply(lambda row: get_all_stanford_metrics(row['answer_text']), axis=1, result_type='expand')


In [13]:
# Count of rows per entity type. NOTE(review): the result is neither assigned
# nor the last expression of the cell, so it is discarded, and fillna is not
# applied in place; the CSV below is written from the unmodified frame.
df_distinct_answers.fillna(value = {'first_ner':'_NO_NER'}).groupby(['first_ner']).count()
df_distinct_answers.to_csv('distinct_answers.csv', index=False)

In [14]:
# One row per distinct context paragraph, so each context is processed only once.
df_distinct_context = pd.DataFrame({'context': df_answers['context'].unique()})

In [15]:
# Expand the tuple returned by get_stanford_counts into the count columns and
# the per-context word list.
df_distinct_context[['sentence_count', 'word_count', 'word_character_count', 'words']] = df_distinct_context.apply(lambda row: get_stanford_counts(row['context']), axis=1, result_type='expand')

In [16]:
def _syllables_for_partition(partition):
    """Return, for each row of ``partition``, the list of per-word syllable counts."""
    # Build one Hyphenator per partition instead of one per word.
    hyphenator = Hyphenator('en_US')
    return partition.apply(
        lambda row: [
            # Words of 100 or more characters are not hyphenated and are marked
            # with -1; every other word counts as at least one syllable.
            max(1, len(hyphenator.syllables(x))) if len(str(x)) < 100 else -1
            for x in row['words']
        ],
        axis=1,
    )


# Compute the syllable counts in parallel, one dask partition per worker task.
syll_df = dd.from_pandas(df_distinct_context, npartitions = 2*multiprocessing.cpu_count()) \
            .map_partitions(_syllables_for_partition) \
            .compute(scheduler='processes')

In [17]:
# Attach the per-word syllable counts (a list per context) computed with dask.
df_distinct_context['syllables_per_word'] = syll_df

In [18]:
# Number of words with three or more syllables, the "complex word" threshold of
# the Gunning fog index that this column feeds. Entries of -1 (skipped words)
# are never counted.
df_distinct_context['polysyllable_count'] = df_distinct_context.apply(lambda row: len([x for x in row['syllables_per_word'] if x >= 3]), axis = 1)
# Mean number of characters per word.
df_distinct_context['avg_word_length'] = df_distinct_context.apply(lambda row: sum([len(x) for x in row['words']])/row['word_count'], axis = 1)

In [19]:
# Average sentence length, in words and in characters of the raw context.
df_distinct_context['avg_sentence_length_in_words'] = df_distinct_context['word_count']/df_distinct_context['sentence_count']
df_distinct_context['context_character_count'] = df_distinct_context.apply(lambda row: len(row['context']), axis=1)
df_distinct_context['avg_sentence_length_in_characters'] = df_distinct_context['context_character_count']/df_distinct_context['sentence_count']
# Replace the per-word list with its mean over the positive entries (the -1
# markers are excluded). NOTE(review): this overwrites the list column, so
# earlier cells that read the list cannot be re-run after this one.
df_distinct_context['syllables_per_word'] = df_distinct_context.apply(lambda row: sum([x for x in row['syllables_per_word'] if x > 0])/ len([x for x in row['syllables_per_word'] if x > 0]) , axis=1)
# Flesch-Kincaid grade level: 0.39 * words/sentence + 11.8 * syllables/word - 15.59
df_distinct_context['flesch-kincaid_grade_level'] = df_distinct_context.apply(lambda row: (0.39 * row['avg_sentence_length_in_words']) + (11.8 * row['syllables_per_word']) - 15.59, axis=1)


In [20]:
# Coleman-Liau: 0.0588 * (letters per 100 words) - 0.296 * (sentences per 100 words) - 15.8
df_distinct_context['coleman-liau'] = df_distinct_context.apply(lambda row: (0.0588 * (row['avg_word_length']) * 100) - (0.296 * (100/row['avg_sentence_length_in_words'])) - 15.8, axis=1)
# Gunning fog: 0.4 * (words/sentence + 100 * share of words counted in polysyllable_count)
df_distinct_context['gunning-fog'] = df_distinct_context.apply(lambda row: 0.4 * ((row['word_count'] / row['sentence_count']) + ((row['polysyllable_count'] / row['word_count']) * 100)), axis=1)
# Automated readability index: 4.71 * characters/word + 0.5 * words/sentence - 21.43.
# "Characters" are the characters of the words themselves, so word_character_count
# is used rather than the raw context length, which also counts spaces and
# punctuation.
df_distinct_context['automated-readability'] = df_distinct_context.apply(lambda row: 4.71 * (row['word_character_count'] / row['word_count']) + 0.5 * (row['word_count'] / row['sentence_count']) - 21.43, axis=1)


In [21]:
# Save the per-context metrics.
df_distinct_context.to_csv('distinct_context.csv', index=False)


In [22]:
# Attach the per-answer metrics to every answer record, joined on the answer text.
df_merged_answers = df_answers.merge(df_distinct_answers, on=['answer_text'])   

In [23]:
# (Re)compute the numeric-answer flag on the merged frame and save it.
df_merged_answers['is_numeric'] = df_merged_answers.apply(lambda row: row['answer_text'].isnumeric(), axis=1)
df_merged_answers.to_csv('merged_answers.csv', index=False)

In [24]:
# Attach the per-context metrics, joined on the context text.
df_merged_answers_and_context = df_merged_answers.merge(df_distinct_context, on=['context'])   

In [25]:
# Save the answers together with their answer and context metrics.
df_merged_answers_and_context.to_csv('merged_answers_and_context.csv', index = False)

In [26]:
# Join every prediction to the answer/context records of its question
# (prediction 'qid' against answer 'question_id').
# NOTE(review): a question with several gold answers has several answer
# records, so its predictions are repeated once per answer; confirm this is
# intended.
df_pred_answers_context = df_pred.merge(df_merged_answers_and_context, left_on=['qid'], right_on=['question_id'])

In [27]:
# Convert the strings 'True'/'False' to booleans. Any other value has no entry
# in the mapping and becomes NaN.
df_pred_answers_context['exact_match'] = df_pred_answers_context['exact_match'].map({'True':True, 'False':False})

In [28]:
# Save the final table of predictions joined with answer and context metrics.
df_pred_answers_context.to_csv('pred_answers_context.csv', index=False)