# Start Stanford CoreNLP server
`java -Xmx16g -cp C:\stanford-corenlp-latest\stanford-corenlp-4.0.0\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 600 -threads 5 -maxCharLength 100000 -quiet False -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref`

In [1]:
import string
import re
import collections
import csv
import pandas as pd
import numpy as np
import requests
import os
import json
from hyphen import Hyphenator

import dask.dataframe as dd
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import seaborn as sns

%matplotlib inline

import stanfordnlp
from stanfordnlp.server import CoreNLPClient

# Uncomment if needed to fix this error:
# OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

NLP_PORT = 9002

In [2]:
PREDICTION_PATH = './predictions'
TEST_SETS_PATH = './test_sets'
MODEL_EVALS_URL = 'https://squad-model-evals.s3-us-west-2.amazonaws.com/model_db.json'

#SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT', 'dev-v1.1']
SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT']

In [3]:
def fetch_eval_file(eval_file_path, model_evals_url, overwrite=False):
    if (not os.path.exists(eval_file_path)) or overwrite:
        r = requests.get(model_evals_url)
                        
        with open(eval_file_path, 'w', encoding='utf-8') as outfile:
            outfile.write(r.text)

    else:
        print('File Exists')
    
    

def write_output(output_file_path, list_to_write):
    fields = list_to_write[0].keys()
    
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, 
                                    fieldnames=fields,
                                    delimiter=',', 
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL )
        csv_writer.writeheader()
        csv_writer.writerows(list_to_write)

def parse_predictions(prediction_file_path, download=False):
    
    with open(prediction_file_path) as f:
      predictions = json.load(f)


    pred_list_test = [{ 'model_display_name': x['name'], 
      'model_name': x['metadata']['name'], 
      'description': x['metadata']['description'], 
      'uuid': x['metadata']['uuid'],
      'testbed': x['testbed'],
      'predictions': x['predictions']

     } for x in predictions]

    pred_list = []

    for r in predictions:

      display_name = r['name']
      model_name = r['metadata']['name']
      description = r['metadata']['description']
      uuid = r['metadata']['uuid']
      testbed = r['testbed']

      for k1, v1 in r['predictions'].items():
        if k1 in (SET_NAMES):
          if 'bundle' in v1.keys():
            test_set = k1
            bundle = v1['bundle']

            for k2, v2 in v1['data'].items():
              qid = k2
              predicted_answer = v2
              exact_match = v1['scores'][qid]['exact_match']
              f1 = v1['scores'][qid]['f1']

              pred_list.append( {
                'display_name': display_name,
                'model_name': model_name,
                'description': description,
                'uuid': uuid,
                'testbed': testbed,
                'test_set': test_set,
                'qid': qid,
                'predicted_answer': predicted_answer,
                'exact_match': exact_match,
                'f1': float(f1)
              })
   
    return pred_list

def load_data(input_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        return [r for r in csv_reader]

def parse_answers(answer_file_path):
    test_set_answers = [a for a in os.listdir(answer_file_path) if not os.path.isdir('/'.join([answer_file_path, a]))]
    answers_list = []
    
    for f in test_set_answers:
      with open('/'.join([TEST_SETS_PATH, f])) as fh:
          test_set = f.split('.')[0]
          
          answers = json.load(fh)['data']
          for x in answers:
              title = x['title']
    
              for p in x['paragraphs']:
                  context = p['context']
    
                  for qa in p['qas']:
                      question = qa['question']
                      question_id = qa['id']
    
                      for a in qa['answers']:
                          answers_list.append(
                                  {
                                      'test_set': test_set,
                                      'question_id': question_id,
                                      'title': title,
                                      'context': context,
                                      'question_text': question,
                                      'answer_text': a['text'],
                                      'answer_start': a['answer_start']
                                  }
                              )
    return answers_list

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
  
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    if not s: return []
    return normalize_answer(s).split()

def compute_exact(question_id, predicted_answer, all_answers):
    gold_answers = [normalize_answer(x['answer_text']) for x in all_answers if x['question_id'] == question_id]
    return max((int(normalize_answer(predicted_answer) == a) for a in gold_answers))

def compute_f1(question_id, predicted_answer, all_answers):
    gold_toks = [get_tokens(x['answer_text']) for x in all_answers if x['question_id'] == question_id]
    pred_toks = get_tokens(predicted_answer)
    
    f1s = []
  
    for answer_toks in gold_toks:
        common = collections.Counter(answer_toks) & collections.Counter(pred_toks)
        num_same = sum(common.values())
      
        if len(answer_toks) == 0 or len(pred_toks) == 0:
            # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
            f1s.append(float(int(answer_toks == pred_toks)))
            continue
        if num_same == 0:
            f1s.append(0.0)
            continue
            
        precision = 1.0 * num_same / len(pred_toks)
        recall = 1.0 * num_same / len(answer_toks)
        f1 = (2 * precision * recall) / (precision + recall)

        f1s.append(f1)

    return float(max(f1s))

def print_answer(qid, all_answers):
    question = [q for q in all_answers if q['question_id'] == qid]
    answers = [a['answer_text'] for a in question]
    
    if question:
        print('Test Set:', question[0]['test_set'])
        print('Context:', question[0]['context'])
        print('Question:', question[0]['question_text'])
        print('Answers:', answers)

In [4]:
# Test Server
try:
    txt = 'This is a test sentence. So is this.'
    with CoreNLPClient(endpoint='http://localhost:{}'.format(NLP_PORT), start_server=False, timeout=30000) as client:
        ann = client.annotate(txt)
        print('Server running. Found {} sentences'.format(len(ann.sentence)))
except Exception as e:
    print(e)

Server running. Found 2 sentences


In [5]:
# Download the model_db.json file that contains all the pre-evaluated and scored questions
# from the previous groups' work, if it doesn't exist yet.

fetch_eval_file(PREDICTION_PATH + '/model_db.json', MODEL_EVALS_URL, overwrite=False)

File Exists


In [6]:
# If predictions and/or answer files don't exist, uncomment these to recreate them

predictions = parse_predictions(PREDICTION_PATH + '/model_db.json')
answers = parse_answers(TEST_SETS_PATH)

write_output(PREDICTION_PATH + '/all_predictions.csv', predictions)
write_output(PREDICTION_PATH + '/all_answers.csv', answers)

In [7]:
# Otherwise, load from files
predictions = load_data(PREDICTION_PATH + '/all_predictions.csv')
answers = load_data(PREDICTION_PATH + '/all_answers.csv')   

In [8]:
# Load into Pandas dataframes

df_pred = pd.DataFrame(predictions)
df_answers = pd.DataFrame(answers)

df_pred = df_pred.astype({'f1': 'float'})

In [9]:
df_answers['is_numeric'] = df_answers.apply(lambda row: row['answer_text'].isnumeric(), axis=1)
df_answers[df_answers['is_numeric']]

Unnamed: 0,test_set,question_id,title,context,question_text,answer_text,answer_start,is_numeric
66,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
67,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
68,amazon_reviews_v1,5dd4661fcc027a086d65bc77,Amazon_Reviews_530,"i wanted an electric kettle, but landed up ord...",How many irritations are there?,2,169,True
151,amazon_reviews_v1,5dd4673dcc027a086d65bcec,Amazon_Reviews_295,I ordered these sheets and must say was a bit ...,what is the thread count on the sheets?,1500,91,True
152,amazon_reviews_v1,5dd4673dcc027a086d65bcec,Amazon_Reviews_295,I ordered these sheets and must say was a bit ...,what is the thread count on the sheets?,1500,126,True
...,...,...,...,...,...,...,...,...
106622,reddit_v1,5d9c9bbd8ae5305bc982f410,Filtered_Reddit_Comments,Sales Professional Development Help? Hello All...,How many people does the company that I work f...,8,302,True
106688,reddit_v1,5d9ca3b18ae5305bc982f454,Filtered_Reddit_Comments,"David Pastrnak's WJC is over. 19 SOG, 1 G, 3 A...",How many did he play in?,4,99,True
106689,reddit_v1,5d9ca3b18ae5305bc982f454,Filtered_Reddit_Comments,"David Pastrnak's WJC is over. 19 SOG, 1 G, 3 A...",How many did he play in?,4,99,True
106760,reddit_v1,5d9cae448ae5305bc982f492,Filtered_Reddit_Comments,[AMA Request] A scalper / secondary-market tic...,What's the total number of questions the autho...,5,60,True


In [10]:
nlp = stanfordnlp.Pipeline(processors='tokenize', use_gpu=True)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\justin.stanley\\stanfordnlp_resources\\en_ewt_models\\en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [58]:
def get_all_stanford_metrics(txt):
    subtree_value = ''
    ner = '_NO_NER'
    sentence_count = 0
    word_count = 0 
    character_count = 0
    
    try:
        with CoreNLPClient(endpoint='http://localhost:{}'.format(NLP_PORT), start_server=False, timeout=30000) as client:

            ann = client.annotate(txt)
            
            sentence_count = len(ann.sentence)
            words = [x.word for s in ann.sentence for x in s.token if x.word not in string.punctuation]
            word_count = len(words)
            character_count = sum([len(x) for x in words])
            
            sentence = ann.sentence[0]
            if sentence.mentions:
                ner = sentence.mentions[0].entityType
            
            constituency_parse = sentence.parseTree
            subtree_value = constituency_parse.child[0].value
        
        return subtree_value, ner, sentence_count, word_count, character_count
        
    except Exception as e:
        return e.args[0],e.args[0], e.args[0], e.args[0],e.args[0]
    
def get_stanford_counts(txt):
    sentence_count = 0
    word_count = 0 
    character_count = 0
    
    try:
        doc = nlp(txt)
        sentence_count = len(doc.sentences)
        words = [w.text for s in doc.sentences for w in s.words if w.text not in string.punctuation]
        word_count = len(words)
        character_count = sum([len(x) for x in words])
        
        return sentence_count, word_count, character_count, words
        
    except Exception as e:
        return e.args[0], e.args[0], e.args[0]


In [12]:
df_distinct_answers = pd.DataFrame({'answer_text': df_answers['answer_text'].unique()})
df_distinct_answers[['first_parse', 'first_ner', 'sentence_count', 'word_count', 'word_character_count', ]] = df_distinct_answers.apply(lambda row: get_all_stanford_metrics(row['answer_text']), axis=1, result_type='expand')

In [13]:
df_distinct_answers.fillna(value = {'first_ner':'_NO_NER'}).groupby(['first_ner']).count()

Unnamed: 0_level_0,answer_text,first_parse,sentence_count,word_count,word_character_count
first_ner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CAUSE_OF_DEATH,386,386,386,386,386
CITY,581,581,581,581,581
COUNTRY,604,604,604,604,604
CRIMINAL_CHARGE,166,166,166,166,166
CoreNLP request timed out. Your document may be too long.,1,1,1,1,1
DATE,1934,1934,1934,1934,1934
DURATION,1232,1232,1232,1232,1232
HANDLE,6,6,6,6,6
IDEOLOGY,264,264,264,264,264
LOCATION,610,610,610,610,610


In [82]:
df_distinct_context = df_answers[['test_set','context']].drop_duplicates().reset_index()

In [83]:
df_distinct_context[['sentence_count', 'word_count', 'word_character_count', 'words']] = df_distinct_context.apply(lambda row: get_stanford_counts(row['context']), axis=1, result_type='expand')

In [84]:
df_distinct_context[:5]

Unnamed: 0,index,test_set,context,sentence_count,word_count,word_character_count,words
0,0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,4,47,175,"[It, 's, a, very, nice, holder, not, too, big,..."
1,15,amazon_reviews_v1,"First of all, this thing is freakin' awesome. ...",9,159,630,"[First, of, all, this, thing, is, freakin', aw..."
2,27,amazon_reviews_v1,"The presto my pod makes, to me, a very weak an...",8,117,431,"[The, presto, my, pod, makes, to, me, a, very,..."
3,39,amazon_reviews_v1,This product takes 10 minutes to setup 4. It i...,14,123,458,"[This, product, takes, 10, minutes, to, setup,..."
4,51,amazon_reviews_v1,I have always kept a dustbuster in my kitchen ...,15,182,749,"[I, have, always, kept, a, dustbuster, in, my,..."


In [16]:
syll_df = dd.from_pandas(df_distinct_context, npartitions = 2*multiprocessing.cpu_count()) \
            .map_partitions(lambda df: df.apply(lambda row: [max(1, len(Hyphenator('en_US').syllables(x))) if len(str(x)) < 100 else -1 for x in row['words'] ], axis = 1)) \
            .compute(scheduler='processes')

In [17]:
df_distinct_context['syllables_per_word'] = syll_df

In [18]:
df_distinct_context['polysyllable_count'] = df_distinct_context.apply(lambda row: len([x for x in row['syllables_per_word'] if x > 1]), axis = 1)
df_distinct_context['avg_word_length'] = df_distinct_context.apply(lambda row: sum([len(x) for x in row['words']])/row['word_count'], axis = 1)

In [19]:
df_distinct_context['avg_sentence_length_in_words'] = df_distinct_context['word_count']/df_distinct_context['sentence_count']
df_distinct_context['context_character_count'] = df_distinct_context.apply(lambda row: len(row['context']), axis=1)
df_distinct_context['avg_sentence_length_in_characters'] = df_distinct_context['context_character_count']/df_distinct_context['sentence_count']
df_distinct_context['syllables_per_word'] = df_distinct_context.apply(lambda row: sum([x for x in row['syllables_per_word'] if x > 0])/ len([x for x in row['syllables_per_word'] if x > 0]) , axis=1)
df_distinct_context['flesch-kincaid_grade_level'] = df_distinct_context.apply(lambda row: (0.39 * row['avg_sentence_length_in_words']) + (11.8 * row['syllables_per_word']) - 15.59, axis=1)

In [20]:
df_distinct_context['coleman-liau'] = df_distinct_context.apply(lambda row: (0.0588 * (row['avg_word_length']) * 100) - (0.296 * (100/row['avg_sentence_length_in_words'])) - 15.8, axis=1)
df_distinct_context['gunning-fog'] = df_distinct_context.apply(lambda row: 0.4 * ((row['word_count'] / row['sentence_count']) + ((row['polysyllable_count'] / row['word_count']) * 100)), axis=1)
df_distinct_context['automated-readability'] = df_distinct_context.apply(lambda row: 4.71 * (row['context_character_count'] / row['word_count']) + 0.5 * (row['word_count'] / row['sentence_count']) - 21.43, axis=1)

In [21]:
df_merged_answers = df_answers.merge(df_distinct_answers, on=['answer_text'])   

In [22]:
df_merged_answers['is_numeric'] = df_merged_answers.apply(lambda row: row['answer_text'].isnumeric(), axis=1)

In [23]:
df_merged_answers_and_context = df_merged_answers.merge(df_distinct_context, on=['context'])   

In [24]:
df_pred_answers_context = df_pred.merge(df_merged_answers_and_context, left_on=['qid'], right_on=['question_id'])

In [25]:
df_pred_answers_context['exact_match'] = df_pred_answers_context['exact_match'].map({'True':True, 'False':False})

In [26]:
df_answers.to_csv('answers.csv', index=False)
df_distinct_answers.to_csv('distinct_answers.csv', index=False)
df_distinct_context.to_csv('distinct_context.csv', index=False)
df_merged_answers.to_csv('merged_answers.csv', index=False)
df_merged_answers_and_context.to_csv('merged_answers_and_context.csv', index = False)
df_pred_answers_context.to_csv('pred_answers_context.csv', index=False)

In [27]:
df_pred_answers_context[:5]

Unnamed: 0,display_name,model_name,description,uuid,testbed,test_set,qid,predicted_answer,exact_match,f1,...,syllables_per_word,polysyllable_count,avg_word_length,avg_sentence_length_in_words,context_character_count,avg_sentence_length_in_characters,flesch-kincaid_grade_level,coleman-liau,gunning-fog,automated-readability
0,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
1,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
2,XLNET-123 (single model),xlnet-123(singlemodel),XLNET-123 (single model),0x8d330a,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
3,Tuned BERT-1seq Large Cased (single model),tunedbert-1seqlargecased(singlemodel),Tuned BERT-1seq Large Cased (single model),0xf776d7,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
4,Tuned BERT-1seq Large Cased (single model),tunedbert-1seqlargecased(singlemodel),Tuned BERT-1seq Large Cased (single model),0xf776d7,John,Amazon,5dd465dacc027a086d65bc6c,not too big and not too small,True,1.0,...,1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511


In [28]:
df_answers = pd.read_csv('answers.csv')
df_distinct_answers = pd.read_csv('distinct_answers.csv')
df_distinct_context = pd.read_csv('distinct_context.csv')
df_merged_answers = pd.read_csv('merged_answers.csv')
df_merged_answers_and_context = pd.read_csv('merged_answers_and_context.csv')
df_pred_answers_context = pd.read_csv('pred_answers_context.csv')
df_pred = pd.DataFrame(load_data(PREDICTION_PATH + '/all_predictions.csv'))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [29]:
df_distinct_context[:5]

Unnamed: 0,index,test_set,context,sentence_count,word_count,word_character_count,words,syllables_per_word,polysyllable_count,avg_word_length,avg_sentence_length_in_words,context_character_count,avg_sentence_length_in_characters,flesch-kincaid_grade_level,coleman-liau,gunning-fog,automated-readability
0,0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,4,47,175,"['It', ""'s"", 'a', 'very', 'nice', 'holder', 'n...",1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511
1,15,amazon_reviews_v1,"First of all, this thing is freakin' awesome. ...",9,159,630,"['First', 'of', 'all', 'this', 'thing', 'is', ...",1.207547,25,3.962264,17.666667,803,89.222222,5.549057,5.822642,13.355975,11.190314
2,27,amazon_reviews_v1,"The presto my pod makes, to me, a very weak an...",8,117,431,"['The', 'presto', 'my', 'pod', 'makes', 'to', ...",1.17094,18,3.683761,14.625,551,68.875,3.930844,3.836581,12.003846,8.063782
3,39,amazon_reviews_v1,This product takes 10 minutes to setup 4. It i...,14,123,458,"['This', 'product', 'takes', '10', 'minutes', ...",1.146341,17,3.723577,8.785714,591,42.214286,1.363258,2.725528,9.042741,5.593833
4,51,amazon_reviews_v1,I have always kept a dustbuster in my kitchen ...,15,182,749,"['I', 'have', 'always', 'kept', 'a', 'dustbust...",1.214286,31,4.115385,12.133333,946,63.066667,3.470571,5.958901,11.66652,9.118315


In [33]:
df_test = df_distinct_context.groupby(['test_set'], as_index=False).agg({'context': ' '.join})

In [44]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justin.stanley\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [50]:
df_test['freqdist'] = df_test.apply(lambda row: nltk.FreqDist(nltk.tokenize.word_tokenize(row['context'])), axis = 1)
df_test['freqdist_lower'] = df_test.apply(lambda row: nltk.FreqDist(nltk.tokenize.word_tokenize(row['context'].lower())), axis = 1)

In [51]:
df_test

Unnamed: 0,test_set,context,freqdist,freqdist_lower
0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,"{'It': 1542, ''s': 1705, 'a': 7881, 'very': 10...","{'it': 8958, ''s': 1710, 'a': 8004, 'very': 11..."
1,new_wiki_v1,The Monastic Brotherhood consists of the celib...,"{'The': 1835, 'Monastic': 1, 'Brotherhood': 1,...","{'the': 14656, 'monastic': 3, 'brotherhood': 8..."
2,nyt_v1,Mattingly’s election to baseball’s Hall of Fam...,"{'Mattingly': 8, '’': 4322, 's': 2837, 'electi...","{'mattingly': 8, '’': 4322, 's': 2841, 'electi..."
3,reddit_v1,Help with aftermarket mirrors for my boyfriend...,"{'Help': 99, 'with': 2100, 'aftermarket': 5, '...","{'help': 599, 'with': 2153, 'aftermarket': 5, ..."


In [52]:
df_distinct_context['nltk_sentence_count'] = df_distinct_context.apply(lambda row: len(nltk.FreqDist(nltk.tokenize.sent_tokenize(row['context'].lower()))), axis=1)

In [53]:
df_distinct_context[:5]

Unnamed: 0,index,test_set,context,sentence_count,word_count,word_character_count,words,syllables_per_word,polysyllable_count,avg_word_length,avg_sentence_length_in_words,context_character_count,avg_sentence_length_in_characters,flesch-kincaid_grade_level,coleman-liau,gunning-fog,automated-readability,nltk_sentence_count
0,0,amazon_reviews_v1,It's a very nice holder - not too big and not ...,4,47,175,"['It', ""'s"", 'a', 'very', 'nice', 'holder', 'n...",1.106383,4,3.723404,11.75,228,57.0,2.047819,3.574468,8.104255,7.293511,4
1,15,amazon_reviews_v1,"First of all, this thing is freakin' awesome. ...",9,159,630,"['First', 'of', 'all', 'this', 'thing', 'is', ...",1.207547,25,3.962264,17.666667,803,89.222222,5.549057,5.822642,13.355975,11.190314,8
2,27,amazon_reviews_v1,"The presto my pod makes, to me, a very weak an...",8,117,431,"['The', 'presto', 'my', 'pod', 'makes', 'to', ...",1.17094,18,3.683761,14.625,551,68.875,3.930844,3.836581,12.003846,8.063782,4
3,39,amazon_reviews_v1,This product takes 10 minutes to setup 4. It i...,14,123,458,"['This', 'product', 'takes', '10', 'minutes', ...",1.146341,17,3.723577,8.785714,591,42.214286,1.363258,2.725528,9.042741,5.593833,12
4,51,amazon_reviews_v1,I have always kept a dustbuster in my kitchen ...,15,182,749,"['I', 'have', 'always', 'kept', 'a', 'dustbust...",1.214286,31,4.115385,12.133333,946,63.066667,3.470571,5.958901,11.66652,9.118315,13


In [100]:
txt = df_distinct_context.iloc[1519]['context']

In [101]:
nltk.tokenize.sent_tokenize(txt)

['...first off dont even attempt to fry eggsin this skillet for at least 2 weeks of using....when i recived mine i washed well in hot water, even tho it seasoned your gonna wanna do it theright way...after washing put on top stove to dry,then cooked bacon in it, you are gonna wanna useanimal fat as much as possiable.....it did stick but thats toobe expected with first time use, then i towel dryedthe extra grease and added lard....put in oven at 500degrees, expect some smoking, just have fans ready, for1 hr...turn oven off, i did this like 3-4 times...thencooked fried potatoes with onions ,worked great, friedan egg and it slid off pan...great pan, just got dutch oven too...']

In [68]:
with CoreNLPClient(endpoint='http://localhost:{}'.format(NLP_PORT), start_server=False, timeout=30000000) as client:

    ann = client.annotate(txt)

    sentence_count = len(ann.sentence)
    words = [x.word for s in ann.sentence for x in s.token if x.word not in string.punctuation]
    word_count = len(words)
    character_count = sum([len(x) for x in words])

    sentence = ann.sentence[0]
    if sentence.mentions:
        ner = sentence.mentions[0].entityType

    constituency_parse = sentence.parseTree
    subtree_value = constituency_parse.child[0].value

    

In [79]:
for s in ann.sentence:
    print(s.text)







In [103]:
txt

'...first off dont even attempt to fry eggsin this skillet for at least 2 weeks of using....when i recived mine i washed well in hot water, even tho it seasoned your gonna wanna do it theright way...after washing put on top stove to dry,then cooked bacon in it, you are gonna wanna useanimal fat as much as possiable.....it did stick but thats toobe expected with first time use, then i towel dryedthe extra grease and added lard....put in oven at 500degrees, expect some smoking, just have fans ready, for1 hr...turn oven off, i did this like 3-4 times...thencooked fried potatoes with onions ,worked great, friedan egg and it slid off pan...great pan, just got dutch oven too...'

In [104]:
get_all_stanford_metrics(txt)

('S', 'ORDINAL', 1, 140, 551)

In [105]:
doc = nlp(txt)
# sentence_count = len(doc.sentences)
# words = [w.text for s in doc.sentences for w in s.words if w.text not in string.punctuation]
# word_count = len(words)
# character_count = sum([len(x) for x in words])


In [108]:
len(doc.sentences)

1

In [107]:
for i, s in enumerate(doc.sentences):
    print('Sentence {}'.format(i))
    s.print_words()


Sentence 0
<Word index=1;text=...>
<Word index=2;text=first>
<Word index=3;text=off>
<Word index=4;text=do>
<Word index=5;text=nt>
<Word index=6;text=even>
<Word index=7;text=attempt>
<Word index=8;text=to>
<Word index=9;text=fry>
<Word index=10;text=eggsin>
<Word index=11;text=this>
<Word index=12;text=skillet>
<Word index=13;text=for>
<Word index=14;text=at>
<Word index=15;text=least>
<Word index=16;text=2>
<Word index=17;text=weeks>
<Word index=18;text=of>
<Word index=19;text=using>
<Word index=20;text=....>
<Word index=21;text=when>
<Word index=22;text=i>
<Word index=23;text=recived>
<Word index=24;text=mine>
<Word index=25;text=i>
<Word index=26;text=washed>
<Word index=27;text=well>
<Word index=28;text=in>
<Word index=29;text=hot>
<Word index=30;text=water>
<Word index=31;text=,>
<Word index=32;text=even>
<Word index=33;text=tho>
<Word index=34;text=it>
<Word index=35;text=seasoned>
<Word index=36;text=your>
<Word index=37;text=gon>
<Word index=38;text=na>
<Word index=39;text=wan