In [None]:
# Notebook-wide imports and pandas configuration.
import numpy as np 
import pandas as pd

# Setting this option to None switches off pandas' chained-assignment
# (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None 

import os, re
import json
import matplotlib.pyplot as plt
  
from IPython.display import display, clear_output
    
from tqdm import tqdm
import string

from sklearn.model_selection import train_test_split

# Last expression of the cell: shows the entries of the competition input directory.
os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')

In [None]:
# Directory holding one JSON file per test publication (<Id>.json).
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

# Validation rows; the first CSV column is used as the DataFrame index.
validation = pd.read_csv(
    "../input/colerigde-processed-text/validation.csv",
    index_col=0,
)

In [None]:
# Show the compute devices TensorFlow can see in this session.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

In [None]:
!pip install --no-index --find-links "../input/spacy3" spacy[cuda110]
!pip install --no-index --find-links "../input/spacy3" en_core_web_trf
!pip install --no-index --find-links "../input/spacy3" spacy_transformers

In [None]:
import spacy
from spacy.tokens import DocBin

In [None]:
# Record which spaCy version is active in this kernel.
print(spacy.__version__)

In [None]:
# Load the best model, running on the GPU when spaCy can allocate one.
try:
    spacy.require_gpu()
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell; in this case the
    # pipeline is simply loaded without the GPU having been activated.
    print("GPU not found")

nlp = spacy.load(R"../input/coledridge-challenge-models/model-best")

In [None]:
# Smoke-test the loaded pipeline on a raw (uncleaned) paragraph and print
# every entity it returns together with its label.
doc = nlp('The supply of PCR reagents, trained lab personnel and the availability of laboratories with sufficient biocontainment levels are major challenges of SARS-CoV-2 detection in developing countries, such as Indonesia (Younes et al., 2020) . Therefore, it is not surprising that the tested people per week is still lower than the World Health Organization (WHO) standard (World Health Organization, 2020b) . Recently, SARS-CoV-2 with the D614G mutation became the most frequently detected globally, including South East Asia region (Korber et al., 2020; Nguyen et al., 2020) . Interestingly, SARS-CoV-2 with the G614 variant had significantly higher infectious titers than the original D614 virus, and COVID-19 patients with the G614 variant had a higher viral load than patients without the mutation (Korber et al., 2020) . A recent study showed that the SARS-CoV-2 with the G614 variant revealed increased infectivity, competitive fitness, and transmission than the wild-type D614 virus in human airway epithelial cells and hamster (Hou et al., 2020) . However, this mutation was not associated with the severity of COVID-19 (Korber et al., 2020; Nguyen et al., 2020) . Here, we aimed:\n(1) to report full-length genome sequences of SARS-CoV-2 collected from four COVID-19 patients in the Special Region of Yogyakarta and Central Java provinces, Indonesia;\n(2) to compare the clade distribution of full-length genome sequences from Indonesia (n = 60) from March to September 2020; and (3) to perform phylogenetic analysis of SARS-CoV-2 complete genomes from different countries, including Indonesia.') # input sample text
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Highlight the entities of the most recently processed `doc` with displaCy's "ent" style.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# Preview the first rows of the validation frame.
validation.head()

In [None]:
# Second smoke test, this time on lower-cased text without punctuation,
# printing every entity the pipeline returns together with its label.
doc = nlp("investigated the differences in college access and choice among students of different racialethnic groups using the national educational longitudinal study nels8892 and the beginning postsecondary student longitudinal study bps9092") # input sample text
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Highlight the entities of the most recently processed `doc` with displaCy's "ent" style.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [None]:
# Load the sample submission; later cells read its 'Id' column and write its
# 'PredictionString' column.
submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Preview the first rows.
submission.head()

### Validate model

In [None]:
def clean_text(txt):
    """Normalise a label or entity string.

    ``txt`` is converted with ``str()`` and lower-cased, then every run of
    characters other than ASCII letters and digits is replaced by a single
    space. Leading/trailing spaces that result from this are NOT removed.
    """
    lowered = str(txt).lower()
    non_alphanumeric = re.compile('[^A-Za-z0-9]+')
    return non_alphanumeric.sub(' ', lowered)

In [None]:
def extract_ds_from_text(path):
    """Return the unique cleaned entity strings found in one JSON file.

    Parameters
    ----------
    path : str
        Path of a JSON file that ``pd.read_json`` can load into a frame with
        a ``'text'`` column.

    Returns
    -------
    list of str
        Cleaned, stripped entity texts in order of first appearance, without
        duplicates and without empty strings.
    """
    extract_df = pd.read_json(path)

    dataset = []
    seen = set()

    for text in extract_df['text']:
        doc = nlp(text)

        for ent in doc.ents:
            # Strip BEFORE the emptiness / duplicate checks: clean_text can
            # leave leading or trailing spaces, so checking the unstripped
            # value would let "" and repeated labels through.
            label = clean_text(ent.text).strip()
            if label and label not in seen:
                seen.add(label)
                dataset.append(label)

    return dataset

In [None]:
def jaccard(str1, str2):
    """Jaccard similarity between the whitespace-token sets of two strings.

    Both strings are lower-cased and split on whitespace; the result is
    ``|A & B| / |A | B|`` as a float in ``[0, 1]``.

    Returns ``0.0`` when neither string contains a token (empty or
    whitespace-only inputs), where the ratio would otherwise be ``0 / 0``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())

    union_size = len(a | b)
    if union_size == 0:
        # Nothing to compare; treat as "no overlap" instead of dividing by zero.
        return 0.0

    return len(a & b) / union_size

In [None]:
# Sanity check: the strings share 3 tokens out of 4 distinct tokens, so this shows 0.75.
jaccard("national education longitudinal study","education longitudinal study")

In [None]:
# Run the pipeline over every validation row and store the cleaned entity
# texts of each row as one '|'-joined string in the 'predictions' column.
row_predictions = []

# `total` lets tqdm show a real progress bar; iterrows() has no length.
for index, rows in tqdm(validation.iterrows(), total=len(validation)):
    doc = nlp(rows['Text'])

    preds = []

    for ent in doc.ents:
        # Strip before testing for emptiness: clean_text turns an entity made
        # only of punctuation into " ", which must not become an empty
        # prediction (it would show up as "a||b" in the joined string).
        cleaned = clean_text(ent.text).strip()
        if cleaned:
            preds.append(cleaned)

    row_predictions.append('|'.join(preds))

# Assign the whole column positionally in one step. This also stays correct
# if index labels repeat, where a per-row .loc[index] write would overwrite
# every row sharing that label.
validation['predictions'] = row_predictions

In [None]:
# Normalise the ground-truth labels with the same cleaning applied to predictions.
validation['Label'] = validation['Label'].map(clean_text)

In [None]:
def _join_with_pipe(values):
    """Concatenate an iterable of strings with '|' as separator."""
    return '|'.join(values)


# Every row of an Id receives the '|'-joined labels / predictions of all rows
# with that Id; afterwards only the first row per Id is kept.
validation['truth'] = (
    validation
    .groupby(['Id'])['Label']
    .transform(_join_with_pipe)
)
validation['predictions'] = (
    validation
    .groupby(['Id'])['predictions']
    .transform(_join_with_pipe)
)
validation = validation.drop_duplicates("Id")

In [None]:
# Keep only the columns needed for scoring.
score_columns = ['Id', 'truth', 'predictions']
validation = validation[score_columns]

In [None]:
# Preview the first ten rows of the validation frame.
validation.head(10)

In [None]:
def _split_labels(joined):
    """Split a '|'-joined string into a sorted list of unique, non-empty labels."""
    if pd.isna(joined):
        return []
    return sorted({label.strip() for label in str(joined).split('|') if label.strip()})


def _score_document(predictions, ground_truths, threshold=0.5):
    """Count (tp, fp, fn) for one document.

    Each prediction is compared with the ground truths that are still
    unmatched. If the best Jaccard score reaches ``threshold`` the prediction
    is a true positive and that ground truth is consumed; otherwise the
    prediction is a false positive. Ground truths left unmatched at the end
    are false negatives. Every prediction and every ground truth is therefore
    counted exactly once.
    """
    unmatched = list(ground_truths)
    tp = 0
    fp = 0

    for pred in predictions:
        best_truth = None
        best_score = 0.0
        for truth in unmatched:
            score = jaccard(pred, truth)
            if score >= threshold and score > best_score:
                best_truth, best_score = truth, score

        if best_truth is None:
            fp += 1
        else:
            tp += 1
            unmatched.remove(best_truth)

    return tp, fp, len(unmatched)


# Micro-averaged counts over all validation documents.
# NOTE(review): matching is by Jaccard score only (no substring requirement),
# intended to follow the competition's evaluation description - confirm.
TP = 0

FP = 0
FN = 0

for index, rows in validation.iterrows():
    # Sorted, de-duplicated labels; empty segments (e.g. a document with no
    # predictions) are dropped so they are not scored as labels.
    predictions = _split_labels(rows['predictions'])
    ground_truths = _split_labels(rows['truth'])

    doc_tp, doc_fp, doc_fn = _score_document(predictions, ground_truths)

    TP += doc_tp
    FP += doc_fp
    FN += doc_fn

In [None]:
# Report the accumulated counts; print("\n") separates the true positives
# from the error counts with blank lines.
print(f"True positives: {TP}")
print("\n")
print(f"False positives: {FP}")
print(f"False negatives: {FN}")

In [None]:
# F-beta score from the micro-averaged counts; beta = 0.5 weights precision
# more heavily than recall.
beta = 0.5

# Each ratio falls back to 0.0 when its denominator is zero (no predictions,
# or no ground truths) instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

fbeta_denominator = beta**2 * precision + recall
if fbeta_denominator > 0:
    fbeta = ((1 + beta**2) * precision * recall) / fbeta_denominator
else:
    # Precision and recall are both zero.
    fbeta = 0.0

print("F0.5 score: {}".format(fbeta))

### Create submission file

In [None]:
# Try the extraction on a single test publication before running the full loop.
example = "../input/coleridgeinitiative-show-us-the-data/test/2f392438-e215-4169-bebf-21ac4ff253e1.json"

example_datasets = extract_ds_from_text(example)
print(example_datasets)

In [None]:
# Build one '|'-joined prediction string per submission row, in row order,
# then write the whole 'PredictionString' column at once.
prediction_strings = []
for doc_id in submission["Id"]:
    json_path = test_files_path + "/" + doc_id + ".json"
    prediction_strings.append('|'.join(extract_ds_from_text(json_path)))

submission['PredictionString'] = prediction_strings

In [None]:
# Preview the first rows of the filled-in submission.
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)