In [65]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import spacy
import mylib

In [66]:
# Notebook-wide pandas configuration, applied once from a single table:
# infinities are treated as missing values, and the display limits are raised
# so wide frames and long text cells are shown without elision.
PANDAS_OPTIONS = {
    "use_inf_as_na": True,
    "max_info_columns": 9999,
    "display.max_columns": 9999,
    "display.max_rows": 9999,
    'max_colwidth': 9999,
}
for option_name, option_value in PANDAS_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [67]:
# Load the `en_core_web_lg` spaCy pipeline from a local, version-pinned
# directory. No extra sentencizer component is added to the pipeline here.
SPACY_MODEL_DIR = "pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1"
nlp = spacy.load(SPACY_MODEL_DIR)
nlp.path  # last expression of the cell: display where the pipeline was loaded from

PosixPath('pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1')

# Original example

The original example from the official Transformers docs (it needed a minor modification to work; see the issue and Stack Overflow links below):
- https://huggingface.co/transformers/usage.html#extractive-question-answering
- https://github.com/huggingface/transformers/issues/220
- https://stackoverflow.com/questions/64901831/huggingface-transformer-model-returns-string-instead-of-logits

In [68]:
%%time
tokenizer = AutoTokenizer.from_pretrained("pretrained/bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained(
    "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
    from_tf=True,
)
print(repr(model.config))

All TF 2.0 model weights were used when initializing BertForQuestionAnswering.

Some weights of BertForQuestionAnswering were not initialized from the TF 2.0 model and are newly initialized: ['bert.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

CPU times: user 15.3 s, sys: 7.45 s, total: 22.7 s
Wall time: 14.4 s


In [69]:
# Context passage and questions from the Transformers extractive-QA example.
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    # Encode the (question, context) pair as a single batch-of-one input.
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Read the logits by name instead of unpacking `.values()`, which depends
    # on the output object containing exactly two fields in a fixed order.
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Most likely first token of the answer.
    answer_start = int(torch.argmax(answer_start_scores))
    # Most likely last token of the answer; +1 turns it into an exclusive slice bound.
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: How many pretrained models are available in Transformers?
Answer: over 32 +

Question: What does Transformers provide?
Answer: general - purpose architectures

Question: Transformers provides interoperability between which frameworks?
Answer: tensorflow 2. 0 and pytorch



In [70]:
# Training labels: read with the C parser in a single pass (low_memory=False),
# then print the column/dtype summary.
TRAIN_CSV = "input/train.csv"
train = pd.read_csv(TRAIN_CSV, low_memory=False, engine="c")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [71]:
# Preview the first five label rows.
train.head(n=5)

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degree Attainment: Do Low-SES Students Benefit?,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts 8 Years Later. Issue Brief. NCES 2005-026.,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male Students in Special Education,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of Postsecondary Education,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student Achievement",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [72]:
# Load the sections of the publication referenced by the first row of `train`.
first_pub_path = f"input/train/{train.iloc[0].Id}.json"

# JSON is UTF-8 by specification; pass the encoding explicitly instead of
# relying on the platform's locale default.
with open(first_pub_path, encoding="utf-8") as in_file:
    sections = json.load(in_file)

# `titles` is only printed for inspection; `tmp` keeps the section bodies in
# document order for the cell that joins them into `text`.
titles = [section["section_title"] for section in sections]
tmp = [section["text"] for section in sections]
print(f"titles={titles}")

titles=['What is this study about?', 'Features of Dual Enrollment Programs', 'WWC Single Study Review', 'WWC Rating', 'Intervention group', 'Comparison group', 'Outcomes and measurement', 'Support for implementation', 'Reason for review', 'Degree attainment', "Bachelor's degree attainment", 'Study Notes:', 'Glossary of Terms', 'Multiple comparison adjustment', 'Quasi-experimental design (QED)', 'Randomized controlled trial (RCT)', 'Single-case design (SCD)', 'Standard deviation']


In [73]:
# Concatenate the section bodies into one document string, separated by single
# spaces, and trim whitespace at both ends.
joined_sections = " ".join(tmp)
text = joined_sections.strip()
print(f"len={len(text)}, text={text}")

len=11660, text=This study used data from the National Education Longitudinal Study (NELS:88) to examine the effects of dual enrollment programs for high school students on college degree attainment. The study also reported whether the impacts of dual enrollment programs were different for first generation college students versus students whose parents had attended at least some college. In addition, a supplemental analysis reports on the impact of different amounts of dual enrollment course-taking and college degree attainment.
Dual enrollment programs offer college-level learning experiences for high school students. The programs offer college courses and/or the opportunity to earn college credits for students while still in high school.
The intervention group in the study was comprised of NELS participants who attended a postsecondary school and who participated in a dual enrollment program while in high school (n = 880). The study author used propensity score matching methods to cr

In [74]:
# Split the document into sentences using the boundaries set by the spaCy pipeline.
doc = nlp(text)
# `Span.text` replaces the deprecated `Span.string` (removed in spaCy 3); after
# `.strip()` both yield the same sentence text.
sentences = [sent.text.strip() for sent in doc.sents]
print(f"{len(sentences)} sentences={sentences}")

95 sentences=['This study used data from the National Education Longitudinal Study (NELS:88) to examine the effects of dual enrollment programs for high school students on college degree attainment.', 'The study also reported whether the impacts of dual enrollment programs were different for first generation college students versus students whose parents had attended at least some college.', 'In addition, a supplemental analysis reports on the impact of different amounts of dual enrollment course-taking and college degree attainment.', 'Dual enrollment programs offer college-level learning experiences for high school students.', 'The programs offer college courses and/or the opportunity to earn college credits for students while still in high school.', 'The intervention group in the study was comprised of NELS participants who attended a postsecondary school and who participated in a dual enrollment program while in high school (n = 880).', 'The study author used propensity score match

In [76]:
# Ask one generic question of each consecutive chunk of sentences and collect
# the distinct, non-empty cleaned answers in `res`.
question = "which dataset"
res = set()
stride = 16  # sentences per chunk; consecutive chunks do not overlap
# Inputs longer than the position-embedding table cannot be fed to the model,
# so the passage is truncated to this many tokens.
max_input_tokens = model.config.max_position_embeddings

for i in range(0, len(sentences), stride):
    # A dedicated name is used for the chunk so the global `tmp` (the section
    # texts that `text` is built from) is not overwritten by this loop.
    chunk = sentences[i:i + stride]
    # The printed label is kept as-is so the log matches earlier runs.
    print(f"i={i}, len(tmp)={len(chunk)}")
    passage = " ".join(chunk)
    inputs = tokenizer(
        question,
        passage,
        add_special_tokens=True,
        truncation="only_second",  # shorten the passage only, never the question
        max_length=max_input_tokens,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].tolist()[0]
    # Position of the first separator token, i.e. the end of the question part.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Read the logits by name instead of unpacking `.values()`, which depends
    # on the output object containing exactly two fields in a fixed order.
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    print(f"answer_start_scores.shape={answer_start_scores.shape}, answer_end_scores.shape={answer_end_scores.shape}")

    ai = int(torch.argmax(answer_start_scores))
    aj = int(torch.argmax(answer_end_scores)) + 1  # exclusive slice bound
    print(f"ai={ai}, aj={aj}, sep_index={sep_index}")

    # Keep a span only if it starts inside the passage, i.e. after the first
    # separator; a start at or before it is treated as "no answer".
    if ai > sep_index:
        a = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[ai:aj]))
        a = mylib.clean_text(a)
        if len(a) != 0:
            res.add(a)

# Sort before joining: set iteration order is not stable between runs.
res_str = "|".join(sorted(res))
print(f"res={res_str}")

i=0, len(tmp)=16
answer_start_scores.shape=torch.Size([1, 468]), answer_end_scores.shape=torch.Size([1, 468])
ai=11, aj=15, sep_index=4
i=16, len(tmp)=16
answer_start_scores.shape=torch.Size([1, 306]), answer_end_scores.shape=torch.Size([1, 306])
ai=0, aj=5, sep_index=4
i=32, len(tmp)=16
answer_start_scores.shape=torch.Size([1, 295]), answer_end_scores.shape=torch.Size([1, 295])
ai=0, aj=181, sep_index=4
i=48, len(tmp)=16
answer_start_scores.shape=torch.Size([1, 467]), answer_end_scores.shape=torch.Size([1, 467])
ai=4, aj=5, sep_index=4
i=64, len(tmp)=16
answer_start_scores.shape=torch.Size([1, 279]), answer_end_scores.shape=torch.Size([1, 279])
ai=4, aj=5, sep_index=4
i=80, len(tmp)=15
answer_start_scores.shape=torch.Size([1, 347]), answer_end_scores.shape=torch.Size([1, 347])
ai=0, aj=5, sep_index=4
res=national education longitudinal study
