# Basic approach using regex and an ELECTRA model trained on SQuAD 2.0

In [None]:
from pathlib import Path

# Root of the competition data; get_document_text reads `<id>.json` files
# from the train/test folders below.
data_dir = Path('/kaggle/input/coleridgeinitiative-show-us-the-data')
# get_document_text(test=False) reads from `train_dir`, which was never
# defined and raised NameError.
# NOTE(review): assumes training documents live under 'train' — confirm.
train_dir = data_dir/'train'
test_dir = data_dir/'test'

import json

def get_document_text(filename, test=False):
    """Return the text of one document as a single space-joined string.

    Opens ``<filename>.json`` in ``test_dir`` (when ``test`` is true) or in
    ``train_dir`` (otherwise), and joins the ``'text'`` value of every entry
    of the loaded JSON with single spaces.

    Args:
        filename: Document id, i.e. the file name without the ``.json``
            extension.
        test: Read from the test directory instead of the train directory.

    Returns:
        The concatenated text; an empty string if the file has no entries.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an entry has no ``'text'`` key.
    """
    # Only the selected directory name is evaluated, as in the original.
    base_dir = test_dir if test else train_dir
    filepath = base_dir/(filename+'.json')

    # Be explicit about the encoding instead of relying on the locale
    # default, which is not UTF-8 on every platform.
    with open(filepath, 'r', encoding='utf-8') as f:
        sections = json.load(f)
    return " ".join(section['text'] for section in sections)

In [None]:
import re
# Regex building blocks for split_into_sentences.  Patterns that are combined
# with others stay plain strings; stand-alone ones are pre-compiled.  All are
# raw strings so that sequences such as `\s` reach the regex engine verbatim
# instead of relying on Python's deprecated handling of unknown escapes.
alphabets = r"([A-Za-z])"
prefixes = re.compile(r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]")
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = re.compile(r"[.](co|net|org|io|gov|edu|us)")
etal = re.compile(r"(\bet al)[.]")
urls = re.compile(r"(www)[.]")
digits = re.compile(r"[.]([0-9])")


def split_into_sentences(text):
    """Split ``text`` into sentences using regex heuristics.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, "et al.", domain names, decimals) are temporarily replaced by
    the placeholder ``<prd>``.  Every remaining ``.``, ``?`` and ``!`` is then
    followed by a ``<stop>`` marker, the placeholders are turned back into
    periods, and the text is split on ``<stop>``.

    Args:
        text: The text to split.  Newlines are treated as spaces.

    Returns:
        A list of whitespace-stripped sentences.  The empty trailing chunk
        left after the final terminator is not included, so an empty or
        whitespace-only input yields ``[]``.
    """
    # Padding lets the space-anchored patterns below match at both ends.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- protect periods that are not sentence boundaries -----------------
    text = prefixes.sub(r"\1<prd>", text)
    text = websites.sub(r"<prd>\1", text)
    text = urls.sub(r"\1<prd>", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Same for a company suffix followed by a sentence starter.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = etal.sub(r"\1<prd>", text)
    text = digits.sub(r"<prd>\1", text)

    # --- move terminators outside closing quotes --------------------------
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # --- mark boundaries, restore protected periods, split ----------------
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [chunk.strip() for chunk in text.split("<stop>")]
    # The chunk after the last terminator is only the padding added above.
    # It must be tested after stripping: comparing the raw chunk with ''
    # never matched, so an empty sentence was always returned at the end.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

def clean_text(txt):
    """Normalise ``txt`` for comparison.

    The value is converted to ``str``, lower-cased and stripped of
    surrounding whitespace; afterwards every run of characters other than
    ASCII letters and digits is replaced by a single space.  Because the
    stripping happens first, punctuation at either end of the input leaves a
    space at that end of the result.
    """
    normalized = str(txt).lower().strip()
    return re.sub('[^A-Za-z0-9]+', ' ', normalized)

def shorten_sentences(sentences, max_length=80, overlap=25):
    """Break over-long sentences into overlapping word chunks.

    A sentence with more than ``max_length`` whitespace-separated words is
    replaced by chunks of at most ``max_length`` words, where each chunk
    starts ``max_length - overlap`` words after the previous one.  Shorter
    sentences are passed through unchanged (original whitespace included).

    Example: 50 words, ``max_length=20``, ``overlap=10`` gives chunks covering
    word indices [0, 20), [10, 30), [20, 40), [30, 50) and [40, 50) — the
    last chunk is clipped at the end of the sentence.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Maximum number of words per returned chunk.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        A new list of sentences/chunks in input order.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_length``.
    """
    # A non-positive stride would either fail inside range() (stride 0) or,
    # worse, make range() empty and silently drop every long sentence.
    if overlap >= max_length:
        raise ValueError("overlap must be smaller than max_length")
    stride = max_length - overlap

    shortened_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            shortened_sentences.extend(
                ' '.join(words[start:start + max_length])
                for start in range(0, len(words), stride)
            )
        else:
            shortened_sentences.append(sentence)
    return shortened_sentences

In [None]:
# regex_match_sentences = [] # list of list of sentences found through regex, each element has a list of sentences, one element per id
# ids = []


# keywords = ['longitudinal', "national", "data", "model", "questionnaire", "from",  "according", "\buse\b", "\busing", "participants", "cohort", "studies", "study", "survey", "sample", "results"]
# keyword_pattern = re.compile(r"|".join(keywords))

# for filename in test_dir.iterdir():
#     file_id = filename.stem
#     ids.append(file_id)
#     document_text = get_document_text(file_id, test=True)
#     sentences = split_into_sentences(document_text)
    
#     file_sentences = [sentence for sentence in sentences if keyword_pattern.search(sentence.lower())]            
    
#     regex_match_sentences.append(file_sentences)

import fasttext

# fastText classifier used to pre-select the sentences worth sending to the
# (much slower) question-answering model.
ft_model = fasttext.load_model("../input/coleridge-fasttext-classification/fasttext_model_coleridge.bin")

ids = []
found_sentences = []  # one list of selected sentences per entry of `ids`

for doc_path in test_dir.iterdir():
    doc_id = doc_path.stem
    ids.append(doc_id)
    candidate_sentences = split_into_sentences(get_document_text(doc_id, test=True))

    # Keep a sentence when the first label returned by the classifier
    # contains "has_dataset"; the classifier sees the lower-cased sentence,
    # but the original casing is what gets stored.
    found_sentences.append([
        candidate
        for candidate in candidate_sentences
        if "has_dataset" in ft_model.predict(candidate.lower())[0][0]
    ])

In [None]:
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# classifier_approved_sentences = []

# batch_size = 8
# tokenizer_path = "../input/roberta-tokenizer"
# text_classifier_model_path = '../input/coleridge-text-class-robertalarge/output-roberta-large'

# config = AutoConfig.from_pretrained(text_classifier_model_path)
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, config=config)
# model = AutoModelForSequenceClassification.from_pretrained(text_classifier_model_path)

# model.to("cuda")
# model.eval()

# with torch.no_grad():

#     for sentences in found_sentences:
#         file_sentences = []
#         for i in range(0, len(sentences), batch_size):
#             batch = sentences[i:i+batch_size]
#             inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256)
#             input_ids = inputs["input_ids"].to("cuda")
#             attention_mask = inputs["attention_mask"].to("cuda")
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             logits = outputs.logits
# #             probas = logits.softmax(-1).tolist()
#             predictions = logits.argmax(-1).tolist()
#             for offset, prediction in enumerate(predictions):
#                 if prediction == 1:
#                     file_sentences.append(sentences[i+offset])
#         classifier_approved_sentences.append(file_sentences)

In [None]:
import gc

# The fastText classifier is not used past this point; drop the reference so
# its memory can be reclaimed.
del ft_model
# del found_sentences
# del tokenizer
# del config

# Run a garbage-collection pass right away rather than waiting for the next
# automatic one.
gc.collect()

In [None]:
from transformers import pipeline

# The same local directory is passed as both the model and the tokenizer.
electra_model_path = "../input/electra-squad2/electra_squad2"

# Question-answering pipeline, called later as model(question=..., context=...).
# NOTE(review): device=0 should select the first GPU per the transformers
# pipeline API, so this presumably needs a GPU runtime — confirm.
model = pipeline("question-answering", model=electra_model_path, tokenizer=electra_model_path, device=0)

In [None]:
all_predictions = []  # one list of raw answer strings per document

# The question is phrased around the first of these keywords found in the
# sentence; "data source" is used when none of them occurs.
question_keywords = ["survey", "study", "questionnaire"]
question_template = "What is the name of the {keyword} being used?"
# Answers scoring at or below this value are discarded.
min_answer_score = 0.75


for sentences in found_sentences:
    file_predictions = []
    unq_preds = set()  # cleaned answers already kept for this document
    for sentence in sentences:
        # The QA pipeline rejects an empty context, so blank sentences (e.g.
        # a trailing empty chunk from the sentence splitter) are skipped
        # instead of aborting the whole run.
        if not sentence.strip():
            continue

        lowered = sentence.lower()
        # Plain substring test; the keywords contain no regex syntax.
        keyword = next((kw for kw in question_keywords if kw in lowered), "data source")
        output = model(question=question_template.format(keyword=keyword), context=sentence)

        if output["score"] > min_answer_score:
            answer = output["answer"]
            # De-duplicate on the cleaned form but keep the raw answer, whose
            # casing is needed by the later filtering step.
            cleaned_answer = clean_text(answer)
            if cleaned_answer not in unq_preds:
                file_predictions.append(answer)
                unq_preds.add(cleaned_answer)

    all_predictions.append(file_predictions)

In [None]:
# The QA pipeline is not used past this point; drop the reference and run a
# garbage-collection pass so its memory can be reclaimed.
del model
gc.collect()

In [None]:
prediction_strings = []  # one "|"-joined string of cleaned answers per document

for file_predictions in all_predictions:
    temp_predictions = []
    for pred in file_predictions:
        words = pred.split()
        # A single all-uppercase word is always kept.
        is_single_upper_word = len(words) == 1 and words[0].isupper()
        if not is_single_upper_word:
            # Explicit length check replaces the former try/except IndexError.
            if len(words) >= 2 and words[0][0].islower() and words[1][0].islower():
                continue  # starts with two lower-case words
            if "et al." in pred:
                continue  # contains a citation marker
            if pred.islower():
                continue  # no upper-case letter anywhere

        cleaned = clean_text(pred)
        # An answer made only of whitespace/punctuation cleans to nothing and
        # would otherwise be emitted as an empty prediction.
        if not cleaned.strip():
            continue
        temp_predictions.append(cleaned)

    prediction_strings.append("|".join(temp_predictions))


In [None]:
# prediction_strings = ["|".join(x) for x in all_predictions]

In [None]:
import pandas as pd
# `ids` and `prediction_strings` are built in the same document order, so
# row i pairs a document id with its "|"-joined predictions.
submission_df = pd.DataFrame(data={"Id":ids, "PredictionString":prediction_strings})

# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv("submission.csv", index=False)
# Bare expression: displays the frame as the cell output in a notebook.
submission_df