# Start Stanford CoreNLP server
`java -Xmx16g -cp C:\stanford-corenlp-latest\stanford-corenlp-4.0.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 600 -threads 5 -maxCharLength 100000 -quiet False -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref`

In [34]:
import string
import re
import collections
import csv
import pandas as pd
import numpy as np
import requests
import os
import json
from hyphen import Hyphenator

import dask.dataframe as dd
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import seaborn as sns

%matplotlib inline

import stanfordnlp
from stanfordnlp.server import CoreNLPClient

import nltk
nltk.download('punkt')

# Workaround for the error below (remove the following line if it is not needed):
# OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

NLP_PORT = 9002

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Justin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
# Directory holding the downloaded model_db.json and the prediction/answer CSVs derived from it.
PREDICTION_PATH = './predictions'
# Directory containing the test set JSON files read by parse_answers.
TEST_SETS_PATH = './test_sets'
# Remote location of the pre-computed model evaluation file.
MODEL_EVALS_URL = 'https://squad-model-evals.s3-us-west-2.amazonaws.com/model_db.json'

# Test sets kept by parse_predictions; the commented-out variant also includes dev-v1.1.
#SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT', 'dev-v1.1']
SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT']

In [3]:
def fetch_eval_file(eval_file_path, model_evals_url, overwrite=False):
    """Download the model evaluation file unless a local copy already exists.

    Args:
        eval_file_path: Destination path of the downloaded file.
        model_evals_url: URL the file is fetched from.
        overwrite: When True, download even if ``eval_file_path`` exists.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(eval_file_path) and not overwrite:
        print('File Exists')
        return

    r = requests.get(model_evals_url)
    # Fail loudly instead of saving an HTTP error page as if it were the data.
    r.raise_for_status()

    with open(eval_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write(r.text)
    
    

def write_output(output_file_path, list_to_write):
    """Write a list of dicts to ``output_file_path`` as a UTF-8 CSV file.

    The header row is taken from the keys of the first dict, so
    ``list_to_write`` must contain at least one element.
    """
    column_names = list_to_write[0].keys()

    with open(output_file_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle,
                                column_names,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for record in list_to_write:
            writer.writerow(record)

def parse_predictions(prediction_file_path, download=False, set_names=None):
    """Flatten the model evaluation JSON into one record per model, test set and question.

    Only test sets listed in ``set_names`` whose entry contains a 'bundle'
    key are included.

    Args:
        prediction_file_path: Path of the evaluation JSON file.
        download: Unused; kept so existing callers keep working.
        set_names: Names of the test sets to keep. Defaults to the
            module-level ``SET_NAMES``.

    Returns:
        A list of dicts with the keys display_name, model_name, description,
        uuid, testbed, test_set, qid, predicted_answer, exact_match and f1
        (f1 converted to float).
    """
    wanted_sets = SET_NAMES if set_names is None else set_names

    # fetch_eval_file writes this file as UTF-8; read it back the same way
    # instead of relying on the platform default encoding.
    with open(prediction_file_path, encoding='utf-8') as f:
        predictions = json.load(f)

    pred_list = []

    for r in predictions:
        display_name = r['name']
        model_name = r['metadata']['name']
        description = r['metadata']['description']
        uuid = r['metadata']['uuid']
        testbed = r['testbed']

        for test_set, results in r['predictions'].items():
            if test_set not in wanted_sets or 'bundle' not in results:
                continue

            scores = results['scores']
            for qid, predicted_answer in results['data'].items():
                pred_list.append({
                    'display_name': display_name,
                    'model_name': model_name,
                    'description': description,
                    'uuid': uuid,
                    'testbed': testbed,
                    'test_set': test_set,
                    'qid': qid,
                    'predicted_answer': predicted_answer,
                    'exact_match': scores[qid]['exact_match'],
                    'f1': float(scores[qid]['f1']),
                })

    return pred_list

def load_data(input_file_path):
    """Read a UTF-8 CSV file and return its rows as a list of dicts.

    Every value is returned as a string, keyed by the header row.
    """
    # newline='' lets the csv module handle line endings itself, so '\r\n'
    # embedded in quoted fields is returned unchanged.
    with open(input_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

def parse_answers(answer_file_path):
    """Flatten every test set file in ``answer_file_path`` into answer records.

    Each non-directory entry of the folder is parsed as JSON with the layout
    data -> paragraphs -> qas -> answers. The test set name is the part of the
    file name before the first '.'.

    Args:
        answer_file_path: Directory containing the test set JSON files.

    Returns:
        A list with one dict per answer, holding test_set, question_id, title,
        context, question_text, answer_text and answer_start.
    """
    test_set_files = [
        name for name in os.listdir(answer_file_path)
        if not os.path.isdir(os.path.join(answer_file_path, name))
    ]
    answers_list = []

    for file_name in test_set_files:
        test_set = file_name.split('.')[0]

        # Open the file from the directory that was listed; previously the
        # global TEST_SETS_PATH was used here regardless of the argument.
        with open(os.path.join(answer_file_path, file_name), encoding='utf-8') as fh:
            articles = json.load(fh)['data']

        for article in articles:
            title = article['title']

            for paragraph in article['paragraphs']:
                context = paragraph['context']

                for qa in paragraph['qas']:
                    question = qa['question']
                    question_id = qa['id']

                    for a in qa['answers']:
                        answers_list.append({
                            'test_set': test_set,
                            'question_id': question_id,
                            'title': title,
                            'context': context,
                            'question_text': question,
                            'answer_text': a['text'],
                            'answer_start': a['answer_start'],
                        })
    return answers_list

def normalize_answer(s):
    """Return ``s`` lower-cased, without punctuation or articles, with whitespace collapsed."""
    lowered = s.lower()
    # Drop every character listed in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    # Replace the stand-alone words a / an / the with a space.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation, flags=re.UNICODE)
    # Splitting and re-joining collapses runs of whitespace and trims the ends.
    return ' '.join(without_articles.split())

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized form of ``s``; an empty list for falsy input."""
    return normalize_answer(s).split() if s else []

def compute_exact(question_id, predicted_answer, all_answers):
    """Return 1 if the normalized prediction equals any normalized gold answer for ``question_id``, else 0.

    Raises ValueError when ``all_answers`` holds no answer for the question.
    """
    references = []
    for entry in all_answers:
        if entry['question_id'] == question_id:
            references.append(normalize_answer(entry['answer_text']))

    matches = [int(normalize_answer(predicted_answer) == reference) for reference in references]
    return max(matches)

def compute_f1(question_id, predicted_answer, all_answers):
    """Return the best token-level F1 of the prediction against the gold answers for ``question_id``.

    Raises ValueError when ``all_answers`` holds no answer for the question.
    """
    reference_token_lists = [
        get_tokens(entry['answer_text'])
        for entry in all_answers
        if entry['question_id'] == question_id
    ]
    predicted = get_tokens(predicted_answer)

    def score_against(reference):
        # If either side has no tokens, F1 is 1 when they agree and 0 otherwise.
        if not reference or not predicted:
            return float(int(reference == predicted))

        overlap = collections.Counter(reference) & collections.Counter(predicted)
        shared = sum(overlap.values())
        if shared == 0:
            return 0.0

        precision = 1.0 * shared / len(predicted)
        recall = 1.0 * shared / len(reference)
        return (2 * precision * recall) / (precision + recall)

    return float(max([score_against(reference) for reference in reference_token_lists]))

def print_answer(qid, all_answers):
    """Print the test set, context, question and all gold answers stored for ``qid``.

    Nothing is printed when no record has that question id.
    """
    matching = []
    for entry in all_answers:
        if entry['question_id'] == qid:
            matching.append(entry)

    if not matching:
        return

    first = matching[0]
    gold_texts = [entry['answer_text'] for entry in matching]

    print('Test Set:', first['test_set'])
    print('Context:', first['context'])
    print('Question:', first['question_text'])
    print('Answers:', gold_texts)

In [4]:
# Test Server
# Smoke test: annotate a short two-sentence string against the CoreNLP server on
# NLP_PORT (start_server=False, so no server is launched here) and report how many
# sentences came back.
try:
    txt = 'This is a test sentence. So is this.'
    with CoreNLPClient(endpoint='http://localhost:{}'.format(NLP_PORT), start_server=False, timeout=30000) as client:
        ann = client.annotate(txt)
        print('Server running. Found {} sentences'.format(len(ann.sentence)))
except Exception as e:
    # Print the failure instead of raising so the cell completes either way.
    print(e)

Server running. Found 2 sentences


In [5]:
# Download the model_db.json file that contains all the pre-evaluated and scored questions
# from the previous groups' work, if it doesn't exist yet.
# With overwrite=False an existing local copy is kept and 'File Exists' is printed.

fetch_eval_file(PREDICTION_PATH + '/model_db.json', MODEL_EVALS_URL, overwrite=False)

File Exists


In [6]:
# Recreate the flattened prediction and answer CSV files from the raw JSON inputs.
# Skip this cell when all_predictions.csv and all_answers.csv already exist.

predictions = parse_predictions(PREDICTION_PATH + '/model_db.json')
answers = parse_answers(TEST_SETS_PATH)

write_output(PREDICTION_PATH + '/all_predictions.csv', predictions)
write_output(PREDICTION_PATH + '/all_answers.csv', answers)

In [7]:
# Otherwise, load from files
# load_data returns every value as a string, including the numeric and boolean columns.
predictions = load_data(PREDICTION_PATH + '/all_predictions.csv')
answers = load_data(PREDICTION_PATH + '/all_answers.csv')   

In [8]:
# Load into Pandas dataframes

df_pred = pd.DataFrame(predictions)
df_answers = pd.DataFrame(answers)

# f1 was read from CSV as text; convert it to float.
df_pred = df_pred.astype({'f1': 'float'})

In [9]:
# Flag answers whose text consists solely of numeric characters (str.isnumeric),
# then display the flagged rows.
df_answers['is_numeric'] = df_answers.apply(lambda row: row['answer_text'].isnumeric(), axis=1)
df_answers[df_answers['is_numeric']]

Unnamed: 0,test_set,question_id,title,context,question_text,answer_text,answer_start,is_numeric
66,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
67,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
68,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
151,amazon_reviews_v1,5dd4673dcc027a086d65bcec,Amazon_Reviews_295,I ordered these sheets and must say was a bit ...,what is the thread count on the sheets?,1500,91,True
152,amazon_reviews_v1,5dd4673dcc027a086d65bcec,Amazon_Reviews_295,I ordered these sheets and must say was a bit ...,what is the thread count on the sheets?,1500,126,True
...,...,...,...,...,...,...,...,...
106622,reddit_v1,5d9c9bbd8ae5305bc982f410,Filtered_Reddit_Comments,Sales Professional Development Help? Hello All...,How many people does the company that I work f...,8,302,True
106688,reddit_v1,5d9ca3b18ae5305bc982f454,Filtered_Reddit_Comments,"David Pastrnak's WJC is over. 19 SOG, 1 G, 3 A...",How many did he play in?,4,99,True
106689,reddit_v1,5d9ca3b18ae5305bc982f454,Filtered_Reddit_Comments,"David Pastrnak's WJC is over. 19 SOG, 1 G, 3 A...",How many did he play in?,4,99,True
106760,reddit_v1,5d9cae448ae5305bc982f492,Filtered_Reddit_Comments,[AMA Request] A scalper / secondary-market tic...,What's the total number of questions the autho...,5,60,True


In [10]:
# Tokenize-only stanfordnlp pipeline with GPU use requested; get_stanford_counts calls it.
nlp = stanfordnlp.Pipeline(processors='tokenize', use_gpu=True)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\justin.stanley\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [58]:
def get_all_stanford_metrics(txt):
    """Annotate ``txt`` with the CoreNLP server and summarise the annotation.

    A client is opened against the already-running server on NLP_PORT for
    every call.

    Returns:
        A 5-tuple ``(subtree_value, ner, sentence_count, word_count,
        character_count)`` where

        * subtree_value is the value of the first child of the first
          sentence's parse tree,
        * ner is the entity type of the first mention in the first sentence,
          or '_NO_NER' when that sentence has no mentions,
        * sentence_count is the number of sentences found,
        * word_count / character_count cover the tokens kept by the
          punctuation filter below.

        If anything raises, all five positions hold the error message instead.
    """
    subtree_value = ''
    ner = '_NO_NER'
    sentence_count = 0
    word_count = 0
    character_count = 0

    try:
        with CoreNLPClient(endpoint='http://localhost:{}'.format(NLP_PORT), start_server=False, timeout=30000) as client:

            ann = client.annotate(txt)

            sentence_count = len(ann.sentence)
            # NOTE: `in` on a str is a substring test, so this drops any token
            # that occurs as a contiguous piece of string.punctuation (every
            # single punctuation character, but also e.g. ',-').
            words = [x.word for s in ann.sentence for x in s.token if x.word not in string.punctuation]
            word_count = len(words)
            character_count = sum([len(x) for x in words])

            # Entity type and parse label come from the first sentence only.
            sentence = ann.sentence[0]
            if sentence.mentions:
                ner = sentence.mentions[0].entityType

            constituency_parse = sentence.parseTree
            subtree_value = constituency_parse.child[0].value

        return subtree_value, ner, sentence_count, word_count, character_count

    except Exception as e:
        # Exceptions raised without arguments have an empty ``args``; fall back
        # to repr() so the handler itself cannot fail with IndexError.
        error = e.args[0] if e.args else repr(e)
        return error, error, error, error, error
    
def get_stanford_counts(txt):
    """Tokenize ``txt`` with the module-level stanfordnlp pipeline ``nlp``.

    Returns:
        A 4-tuple ``(sentence_count, word_count, character_count, words)``.
        ``words`` lists the token texts kept by the punctuation filter and the
        two counts are derived from it. If anything raises, all four positions
        hold the error message instead.
    """
    try:
        doc = nlp(txt)
        sentence_count = len(doc.sentences)
        # NOTE: `in` on a str is a substring test, so this drops any token that
        # occurs as a contiguous piece of string.punctuation.
        words = [w.text for s in doc.sentences for w in s.words if w.text not in string.punctuation]
        word_count = len(words)
        character_count = sum([len(x) for x in words])

        return sentence_count, word_count, character_count, words

    except Exception as e:
        # Return as many values as the success path: callers expand the result
        # into four columns, and the previous 3-tuple did not fit that shape.
        error = e.args[0] if e.args else repr(e)
        return error, error, error, error


In [12]:
# Annotate each distinct answer string once and expand the 5-tuple returned by
# get_all_stanford_metrics into five columns.
df_distinct_answers = pd.DataFrame({'answer_text': df_answers['answer_text'].unique()})
df_distinct_answers[['first_parse', 'first_ner', 'sentence_count', 'word_count', 'word_character_count', ]] = df_distinct_answers.apply(lambda row: get_all_stanford_metrics(row['answer_text']), axis=1, result_type='expand')

In [13]:
# Count distinct answers per first-entity type; missing values are labelled '_NO_NER'.
# Rows whose annotation failed carry the error message in first_ner and form their own group.
df_distinct_answers.fillna(value = {'first_ner':'_NO_NER'}).groupby(['first_ner']).count()

Unnamed: 0_level_0,answer_text,first_parse,sentence_count,word_count,word_character_count
first_ner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CAUSE_OF_DEATH,386,386,386,386,386
CITY,581,581,581,581,581
COUNTRY,604,604,604,604,604
CRIMINAL_CHARGE,166,166,166,166,166
CoreNLP request timed out. Your document may be too long.,1,1,1,1,1
DATE,1934,1934,1934,1934,1934
DURATION,1232,1232,1232,1232,1232
HANDLE,6,6,6,6,6
IDEOLOGY,264,264,264,264,264
LOCATION,610,610,610,610,610


In [82]:
# One row per distinct (test_set, context) pair; reset_index keeps the original row label as an 'index' column.
df_distinct_context = df_answers[['test_set','context']].drop_duplicates().reset_index()

In [83]:
# Tokenize every context with get_stanford_counts and expand its 4-tuple into columns.
df_distinct_context[['sentence_count', 'word_count', 'word_character_count', 'words']] = df_distinct_context.apply(lambda row: get_stanford_counts(row['context']), axis=1, result_type='expand')

In [84]:
# Preview the first five contexts with their counts.
df_distinct_context[:5]

Unnamed: 0,index,test_set,context,sentence_count,word_count,word_character_count,words
0,0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,4,47,175,"[It, 's, a, very, nice, holder, not, too, big,..."
1,15,amazon_reviews_v1,"First of all, this thing is freakin' awesome. ...",9,159,630,"[First, of, all, this, thing, is, freakin', aw..."
2,27,amazon_reviews_v1,"The presto my pod makes, to me, a very weak an...",8,117,431,"[The, presto, my, pod, makes, to, me, a, very,..."
3,39,amazon_reviews_v1,This product takes 10 minutes to setup 4. It i...,14,123,458,"[This, product, takes, 10, minutes, to, setup,..."
4,51,amazon_reviews_v1,I have always kept a dustbuster in my kitchen ...,15,182,749,"[I, have, always, kept, a, dustbuster, in, my,..."


In [16]:
def _count_syllables_in_partition(partition):
    """Return, for each row of ``partition``, the syllable count of every entry in its 'words' list.

    Words whose string form has 100 or more characters are marked with -1
    instead of being hyphenated; every other word counts at least one syllable.
    """
    # Build the hyphenator once per partition instead of once per word. It is
    # created inside the worker function so it never has to be sent between
    # processes.
    hyphenator = Hyphenator('en_US')
    return partition.apply(
        lambda row: [max(1, len(hyphenator.syllables(x))) if len(str(x)) < 100 else -1 for x in row['words']],
        axis=1)


# Spread the contexts over 2 partitions per CPU and count syllables in worker processes.
syll_df = dd.from_pandas(df_distinct_context, npartitions=2*multiprocessing.cpu_count()) \
            .map_partitions(_count_syllables_in_partition) \
            .compute(scheduler='processes')

In [17]:
# Attach the per-word syllable counts (one list per context).
df_distinct_context['syllables_per_word'] = syll_df

In [18]:
# Number of words per context with more than one syllable (the -1 markers are excluded by x > 1).
df_distinct_context['polysyllable_count'] = df_distinct_context.apply(lambda row: len([x for x in row['syllables_per_word'] if x > 1]), axis = 1)
# Mean characters per word; raises ZeroDivisionError for a context with word_count == 0.
df_distinct_context['avg_word_length'] = df_distinct_context.apply(lambda row: sum([len(x) for x in row['words']])/row['word_count'], axis = 1)

In [19]:
# Average words per sentence.
df_distinct_context['avg_sentence_length_in_words'] = df_distinct_context['word_count']/df_distinct_context['sentence_count']
# Length of the raw context string, whitespace and punctuation included.
df_distinct_context['context_character_count'] = df_distinct_context.apply(lambda row: len(row['context']), axis=1)
# Average raw characters per sentence.
df_distinct_context['avg_sentence_length_in_characters'] = df_distinct_context['context_character_count']/df_distinct_context['sentence_count']
# Replace the per-word list with its mean, ignoring the -1 markers.
# NOTE: this overwrites the list column, so polysyllable_count must be computed before
# this line and the cell cannot be re-run without rebuilding the lists.
df_distinct_context['syllables_per_word'] = df_distinct_context.apply(lambda row: sum([x for x in row['syllables_per_word'] if x > 0])/ len([x for x in row['syllables_per_word'] if x > 0]) , axis=1)
# Flesch-Kincaid grade level: 0.39 * words/sentence + 11.8 * syllables/word - 15.59.
df_distinct_context['flesch-kincaid_grade_level'] = df_distinct_context.apply(lambda row: (0.39 * row['avg_sentence_length_in_words']) + (11.8 * row['syllables_per_word']) - 15.59, axis=1)

In [20]:
# Coleman-Liau index: 0.0588 * L - 0.296 * S - 15.8, with L = characters per 100 words
# and S = sentences per 100 words.
df_distinct_context['coleman-liau'] = df_distinct_context.apply(lambda row: (0.0588 * (row['avg_word_length']) * 100) - (0.296 * (100/row['avg_sentence_length_in_words'])) - 15.8, axis=1)
# Gunning fog index: 0.4 * (words/sentence + 100 * polysyllable_count/words).
# NOTE(review): polysyllable_count counts words with more than one syllable, while the usual
# Gunning fog "complex word" is three or more syllables — confirm which is intended.
df_distinct_context['gunning-fog'] = df_distinct_context.apply(lambda row: 0.4 * ((row['word_count'] / row['sentence_count']) + ((row['polysyllable_count'] / row['word_count']) * 100)), axis=1)
# Automated readability index: 4.71 * characters/word + 0.5 * words/sentence - 21.43.
# NOTE(review): context_character_count is the full string length (spaces and punctuation
# included) — confirm that this is the intended character count.
df_distinct_context['automated-readability'] = df_distinct_context.apply(lambda row: 4.71 * (row['context_character_count'] / row['word_count']) + 0.5 * (row['word_count'] / row['sentence_count']) - 21.43, axis=1)

In [21]:
# Attach the per-answer CoreNLP metrics to every answer row (default inner join on answer_text).
df_merged_answers = df_answers.merge(df_distinct_answers, on=['answer_text'])   

In [22]:
# Recompute the numeric-answer flag on the merged frame.
df_merged_answers['is_numeric'] = df_merged_answers.apply(lambda row: row['answer_text'].isnumeric(), axis=1)

In [23]:
# Attach the per-context metrics to every answer row (inner join on context).
# Both frames carry test_set, sentence_count, word_count and word_character_count, so pandas
# suffixes those columns: _x comes from the answer side, _y from the context side.
# NOTE(review): df_distinct_context is distinct on (test_set, context); a context shared by
# two test sets would duplicate rows here — confirm that cannot happen.
df_merged_answers_and_context = df_merged_answers.merge(df_distinct_context, on=['context'])   

In [24]:
# Join every prediction to the gold answers (and their metrics) of its question.
# A question with several gold answers yields one row per answer.
df_pred_answers_context = df_pred.merge(df_merged_answers_and_context, left_on=['qid'], right_on=['question_id'])

In [25]:
# exact_match was read from CSV as the text 'True'/'False'; convert it to booleans.
# Any other value becomes NaN.
df_pred_answers_context['exact_match'] = df_pred_answers_context['exact_match'].map({'True':True, 'False':False})

In [26]:
# Save every intermediate frame to the working directory so the expensive steps above
# need not be repeated.
df_answers.to_csv('answers.csv', index=False)
df_distinct_answers.to_csv('distinct_answers.csv', index=False)
df_distinct_context.to_csv('distinct_context.csv', index=False)
df_merged_answers.to_csv('merged_answers.csv', index=False)
df_merged_answers_and_context.to_csv('merged_answers_and_context.csv', index = False)
df_pred_answers_context.to_csv('pred_answers_context.csv', index=False)

In [27]:
# Preview the first five joined prediction rows.
df_pred_answers_context[:5]

Unnamed: 0,display_name,model_name,description,uuid,testbed,test_set,qid,predicted_answer,exact_match,f1,...,syllables_per_word,polysyllable_count,avg_word_length,avg_sentence_length_in_words,context_character_count,avg_sentence_length_in_characters,flesch-kincaid_grade_level,coleman-liau,gunning-fog,automated-readability
0,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
1,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
2,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
3,Tuned BERT-1seq Large Cased (single model),tunedbert-1seqlargecased(singlemodel),Tuned BERT-1seq Large Cased (single model),0xf776d7,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
4,Tuned BERT-1seq Large Cased (single model),tunedbert-1seqlargecased(singlemodel),Tuned BERT-1seq Large Cased (single model),0xf776d7,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511


In [33]:
# Reload the saved frames (entry point for a fresh session).
# NOTE: list-valued columns such as 'words' come back from CSV as their string
# representation, not as lists.
df_answers = pd.read_csv('answers.csv')
df_distinct_answers = pd.read_csv('distinct_answers.csv')
df_distinct_context = pd.read_csv('distinct_context.csv')
df_merged_answers = pd.read_csv('merged_answers.csv')
df_merged_answers_and_context = pd.read_csv('merged_answers_and_context.csv')
df_pred_answers_context = pd.read_csv('pred_answers_context.csv')
# load_data yields strings only, so f1 is text again in this copy of df_pred.
df_pred = pd.DataFrame(load_data(PREDICTION_PATH + '/all_predictions.csv'))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [29]:
# Preview the reloaded contexts with their readability columns.
df_distinct_context[:5]

Unnamed: 0,index,test_set,context,sentence_count,word_count,word_character_count,words,syllables_per_word,polysyllable_count,avg_word_length,avg_sentence_length_in_words,context_character_count,avg_sentence_length_in_characters,flesch-kincaid_grade_level,coleman-liau,gunning-fog,automated-readability
0,0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,4,47,175,"['It', ""'s"", 'a', 'very', 'nice', 'holder', 'n...",1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
1,15,amazon_reviews_v1,"First of all, this thing is freakin' awesome. ...",9,159,630,"['First', 'of', 'all', 'this', 'thing', 'is', ...",1.207547,25,3.962264,17.666667,803,89.222222,5.549057,5.822642,13.355975,11.190314
2,27,amazon_reviews_v1,"The presto my pod makes, to me, a very weak an...",8,117,431,"['The', 'presto', 'my', 'pod', 'makes', 'to', ...",1.17094,18,3.683761,14.625,551,68.875,3.930844,3.836581,12.003846,8.063782
3,39,amazon_reviews_v1,This product takes 10 minutes to setup 4. It i...,14,123,458,"['This', 'product', 'takes', '10', 'minutes', ...",1.146341,17,3.723577,8.785714,591,42.214286,1.363258,2.725528,9.042741,5.593833
4,51,amazon_reviews_v1,I have always kept a dustbuster in my kitchen ...,15,182,749,"['I', 'have', 'always', 'kept', 'a', 'dustbust...",1.214286,31,4.115385,12.133333,946,63.066667,3.470571,5.958901,11.66652,9.118315


In [35]:
# One row per test set, with all of its contexts concatenated into one space-separated string.
df_test_sets = df_distinct_context.groupby(['test_set'], as_index=False).agg({'context': ' '.join})

In [36]:
# Token frequency distribution per test set, case-sensitive and lower-cased.
df_test_sets['freqdist'] = df_test_sets.apply(lambda row: nltk.FreqDist(nltk.tokenize.word_tokenize(row['context'])), axis = 1)
df_test_sets['freqdist_lower'] = df_test_sets.apply(lambda row: nltk.FreqDist(nltk.tokenize.word_tokenize(row['context'].lower())), axis = 1)



In [39]:
# Lexical diversity = distinct tokens / total tokens. The freqdist columns already hold
# both numbers (len() = distinct tokens, N() = total tokens), so the concatenated
# contexts are not tokenized again.
df_test_sets['lexical_diversity'] = df_test_sets.apply(lambda row: len(row['freqdist'])/row['freqdist'].N(), axis = 1)
df_test_sets['lexical_diversity_lower'] = df_test_sets.apply(lambda row: len(row['freqdist_lower'])/row['freqdist_lower'].N(), axis = 1)

In [52]:
# Sentence count per context from NLTK's sentence tokenizer, applied to the lower-cased text.
# NOTE(review): len() of a FreqDist counts *distinct* sentences, so a sentence repeated
# within one context is counted once — confirm this is intended.
df_distinct_context['nltk_sentence_count'] = df_distinct_context.apply(lambda row: len(nltk.FreqDist(nltk.tokenize.sent_tokenize(row['context'].lower()))), axis=1)

df_distinct_context

In [69]:
# Load the SQuAD training set. The encoding is given explicitly so the JSON is not
# decoded with the platform default (for example cp1252 on Windows).
with open(TEST_SETS_PATH + '/train-v2.0.json', encoding='utf-8') as f:
    s_json = json.load(f)

In [70]:
# Concatenate every paragraph context of the training set into one space-separated string.
full_squad_text = ' '.join([x['context'] for y in s_json['data'] for x in y['paragraphs']])

In [85]:
# Token frequency distribution of the SQuAD training text, case-sensitive and lower-cased.
squad_freqdist = nltk.FreqDist(nltk.tokenize.word_tokenize(full_squad_text))
squad_freqdist_lower = nltk.FreqDist(nltk.tokenize.word_tokenize(full_squad_text.lower()))

In [74]:
# SQuAD lexical diversity
# Distinct tokens / total tokens, read from squad_freqdist (len() = distinct tokens,
# N() = total tokens) instead of tokenizing the full training text two more times.
print('SQuAD Training Lexical Diversity:', len(squad_freqdist)/squad_freqdist.N())

SQuAD Training Lexical Diversity: 0.043178504959439146


In [73]:
# Lower-cased variant, read from squad_freqdist_lower instead of re-tokenizing the text.
print('SQuAD Training Lexical Diversity (lower case):', len(squad_freqdist_lower)/squad_freqdist_lower.N())

SQuAD Training Lexical Diversity (lower case): 0.03877918893738047


In [91]:
# Share of each test set's vocabulary that also occurs in the SQuAD training vocabulary.
df_test_sets['training_overlap'] = df_test_sets.apply(lambda row: len(set.intersection( set(row['freqdist'].keys()), set(squad_freqdist.keys()) ))/len(set(row['freqdist'].keys())), axis = 1)
df_test_sets['training_overlap_lower'] = df_test_sets.apply(lambda row: len(set.intersection( set(row['freqdist_lower'].keys()), set(squad_freqdist_lower.keys() )))/len(set(row['freqdist_lower'].keys())), axis = 1)

In [94]:
# Number of distinct tokens per test set.
df_test_sets['vocab_size'] = df_test_sets.apply(lambda row: len(row['freqdist']), axis = 1)
df_test_sets['vocab_size_lower'] = df_test_sets.apply(lambda row: len(row['freqdist_lower']), axis = 1)

In [105]:
# Tokens of each test set that never occur in the SQuAD training text (stored as a set).
df_test_sets['distinct_in_test'] = df_test_sets.apply(lambda row: set(row['freqdist'].keys()).difference(set(squad_freqdist.keys())), axis = 1)
df_test_sets['distinct_in_test_lower'] = df_test_sets.apply(lambda row: set(row['freqdist_lower'].keys()).difference(set(squad_freqdist_lower.keys())), axis = 1)

In [109]:
# Size of each test-only vocabulary.
df_test_sets['distinct_in_test_size'] = df_test_sets.apply(lambda row: len(row['distinct_in_test']), axis = 1)
df_test_sets['distinct_in_test_size_lower'] = df_test_sets.apply(lambda row: len(row['distinct_in_test_lower']), axis = 1)

In [116]:
# Fraction of each test set's vocabulary that is absent from the SQuAD training text.
df_test_sets['pct_distinct'] = df_test_sets.apply(lambda row: row['distinct_in_test_size']/row['vocab_size'] , axis = 1)
df_test_sets['pct_distinct_lower'] = df_test_sets.apply(lambda row: row['distinct_in_test_size_lower']/row['vocab_size_lower'], axis = 1)

In [176]:
# Load the sample of words API entries; getwords skips the words that are keys of this dict.
# A forward slash is used because '\w' is an invalid escape sequence in a normal string
# literal and the backslash form only works on Windows.
with open('datafiles/wordsapi_sample.json', encoding='utf8') as f:
    wordsapi_dict = json.load(f)

def getwords(word_set):
    """Look up every word that is not already a key of ``wordsapi_dict`` via the words API.

    Relies on the module-level ``url`` template, ``headers`` and
    ``wordsapi_dict``.

    Args:
        word_set: Iterable of words to look up.

    Returns:
        A list with one dict per looked-up word. Successful lookups hold the
        decoded JSON response plus a 'word' key; lookups that raised hold
        ``{'word', 'success': False, 'message'}``.
    """
    from urllib.parse import quote

    results = []
    word_set = set(word_set).difference(set(wordsapi_dict.keys()))
    for w in word_set:
        try:
            # Percent-encode the word so tokens containing '/', '?' or '#'
            # remain a single path segment of the request URL.
            response = requests.request("GET", url.format(quote(w, safe='')), headers=headers)
            result_json = response.json()
            result_json['word'] = w
            results.append(result_json)
        except Exception as e:
            # Record the failure; previously this dict was built but never
            # added to the results.
            results.append({'word': w, 'success': False, 'message': str(e)})
            print(w)
    return results

In [177]:
# For every test set, look up its lower-cased test-only words and save the responses
# to datafiles/wordsapi_<test_set>.json.
for test_set in df_test_sets['test_set'].unique():
    # .values[0] takes the word set of the first row matching this test set.
    word_list = list(df_test_sets[df_test_sets['test_set'] == test_set]['distinct_in_test_lower'].values[0])
    results = getwords(word_list)
    with open('datafiles/wordsapi_{}.json'.format(test_set), 'w', encoding='utf8') as f_out:
        json.dump(results, f_out)

11/10/13
8/23/09
on//off
blankets/quilts/comforters
5/10/2014
1/21/10the
12/27/12
/an
travel/back-up/whatever
1/19/2011i
restaurant/bar/commercial
dishwasher/microwave/college
11/16/11i
roll/slide/move
//www.amazon.com/oreck-professional-purifier-airpb-technology/dp/b002vsafp4/ref=sr_1_7
traffic/petstains/hair/dirt
money.08/22/13not
website.09/10/13after
5/31/13the
12/17/12
up/down/straight
12/14/13
on/off/tare
9/4/13
husbands/wives/significant
sewing/crochet/art
12/03/2011
7/1/2014
10/10/12
side/back/stomach
papers/purses/lunchboxes
1/4/14
07/22/13
usage.11/25/13
//www.amazon.com/bob-wardens-slow-food-fast/dp/0984188711/ref=sr_1_1
//www.amazon.com/calphalon-unison-nonstick-10-piece-cookware/dp/b0028s7r58/ref=cm_cr_pr_product_topthese
cracking/breaking/explosions
1/10/2013i
12/27/11
//smile.amazon.com/gp/product/b000hm83x2/ref=oh_details_o02_s00_i00
08/02/13i
3/28/2014
tang/shoulder/blade
11/15/13
//www.amazon.com/gp/product/b0091xnl0i/ref=oh_details_o01_s00_i00
luck9/21/13
//www.amazo

In [143]:
# Show the lower-cased test-only words of the nyt_v1 test set.
list(df_test_sets[df_test_sets['test_set'] == 'nyt_v1']['distinct_in_test_lower'].values[0])

['two-bath',
 'outspoken.',
 'c.i.a.-sponsored',
 'mozaffor',
 'ethan',
 'chipper',
 'non-american',
 '88-page',
 'fingerpicked',
 'mean.',
 'guessing',
 'limped',
 'retires',
 'florida.',
 'pulpy',
 'h.d.p',
 'galeotti',
 'mutates',
 'fair-market',
 'acar',
 'gilbertville',
 'mcconnell',
 'happy-slappy',
 'xiaochuan',
 'marktwainhouse.org',
 'most-searched',
 'humanize',
 'malty',
 'telegraphing',
 'meaty',
 'well-trodden',
 'holtby',
 'deavere',
 'nonwhite',
 'freda',
 'mermelstein',
 'flagstaff',
 'djemel',
 'pre-recession',
 'balducci',
 'crazy.',
 'flirty',
 'andino',
 'bilkent',
 'birthrates',
 'coy',
 'recker',
 'hothouse',
 'worst-hit',
 'trivially',
 'sugary',
 'ibargüen',
 'message.',
 'chernow',
 'pask',
 'expressionless',
 'g.m.',
 'aksyonov',
 'serena',
 'disdainful',
 'layup',
 'inception.',
 'desmondfishlibrary.org',
 'open-water',
 'greater.',
 'carmona',
 'bread-and-butter',
 'goan',
 'snows',
 'daata',
 'bretas',
 'nicolai',
 'mondella',
 '€100',
 'catsoulis',
 'mind-

In [120]:
# Same kind of word set, selected by row position (third row of df_test_sets) instead of by name.
df_test_sets.iloc[2]['distinct_in_test_lower']

{'two-bath',
 'outspoken.',
 'c.i.a.-sponsored',
 'mozaffor',
 'ethan',
 'chipper',
 'non-american',
 '88-page',
 'fingerpicked',
 'mean.',
 'guessing',
 'limped',
 'retires',
 'florida.',
 'pulpy',
 'h.d.p',
 'galeotti',
 'mutates',
 'fair-market',
 'acar',
 'gilbertville',
 'mcconnell',
 'happy-slappy',
 'xiaochuan',
 'marktwainhouse.org',
 'most-searched',
 'humanize',
 'malty',
 'telegraphing',
 'meaty',
 'well-trodden',
 'holtby',
 'deavere',
 'nonwhite',
 'freda',
 'mermelstein',
 'flagstaff',
 'djemel',
 'pre-recession',
 'balducci',
 'crazy.',
 'flirty',
 'andino',
 'bilkent',
 'birthrates',
 'coy',
 'recker',
 'hothouse',
 'worst-hit',
 'trivially',
 'sugary',
 'ibargüen',
 'message.',
 'chernow',
 'pask',
 'expressionless',
 'g.m.',
 'aksyonov',
 'serena',
 'disdainful',
 'layup',
 'inception.',
 'desmondfishlibrary.org',
 'open-water',
 'greater.',
 'carmona',
 'bread-and-butter',
 'goan',
 'snows',
 'daata',
 'bretas',
 'nicolai',
 'mondella',
 '€100',
 'catsoulis',
 'mind-

In [99]:
# Total number of tokens in the SQuAD training text.
squad_freqdist.N()

2553817

In [131]:
# Number of distinct lower-cased tokens in the SQuAD training text.
len(set(squad_freqdist_lower.keys()))

98853

In [130]:
# How many lower-cased SQuAD tokens are keys of wordsapi_dict.
len([x for x in squad_freqdist_lower.keys() if x in wordsapi_dict.keys()])

3773

In [145]:
# Same count via a set intersection. The line had an unmatched ')' and its recorded
# output is a number, so the missing len( is restored.
len(set.intersection(set(squad_freqdist_lower.keys()), set(wordsapi_dict.keys())))

3773

In [161]:
import requests

# Words API endpoint template; {} is replaced by the word being looked up.
# getwords reads both `url` and `headers` from module scope.
url = "https://wordsapiv1.p.rapidapi.com/words/{}"

# Request headers for the API. The key is blank here.
# NOTE(review): presumably a real key must be filled in before requests succeed — confirm.
headers = {
    'x-rapidapi-host': "wordsapiv1.p.rapidapi.com",
    'x-rapidapi-key': ""
    }

# Single sample lookup for the word 'food'.
response = requests.request("GET", url.format('food'), headers=headers)

In [162]:
# Show the decoded JSON body of the sample lookup.
response.json()

{'word': 'food',
 'results': [{'definition': 'any solid substance (as opposed to liquid) that is used as a source of nourishment',
   'partOfSpeech': 'noun',
   'synonyms': ['solid food'],
   'typeOf': ['solid'],
   'hasTypes': ['garden truck',
    'green goods',
    'green groceries',
    'health food',
    'junk food',
    'leftovers',
    'loaf',
    'yoghurt',
    'convenience food',
    'meat',
    'cheese',
    'dika bread',
    'chocolate',
    'baked goods',
    'fresh foods',
    'breakfast food',
    'butter',
    'fresh food',
    'coconut',
    'coconut meat',
    'yogurt',
    'pasta',
    'alimentary paste',
    'produce',
    'fish',
    'yoghourt',
    'seafood',
    'slop'],
   'partOf': ['nutrient'],
   'examples': ['food and drink']},
  {'definition': 'anything that provides mental stimulus for thinking',
   'partOfSpeech': 'noun',
   'synonyms': ['food for thought', 'intellectual nourishment'],
   'typeOf': ['mental object', 'content', 'cognitive content'],
   'hasT

In [163]:
# Show whatever the module-level `results` list currently holds.
results

[{'success': False, 'message': 'word not found'},
 {'success': False, 'message': 'word not found'},
 {'success': False, 'message': 'word not found'},
 {'success': False, 'message': 'word not found'},
 {'success': False, 'message': 'word not found'}]