In [None]:
# Notebook-wide imports: data handling, display helpers, progress bars, literal parsing, Hugging Face loaders
import numpy as np 
import pandas as pd
  
from IPython.display import display, clear_output
    
from tqdm import tqdm

from ast import literal_eval

from transformers import AutoTokenizer, AutoModel

In [None]:
# Diagnostic cell: ask TensorFlow which local compute devices it can see.
# The returned list is the cell's displayed output; it is not stored anywhere.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

In [None]:
!pip install --no-index --find-links "../input/spacy3" spacy[cuda110]
!pip install --no-index --find-links "../input/spacy3" en_core_web_trf
!pip install --no-index --find-links "../input/spacy3" spacy_transformers

In [None]:
import spacy
from spacy.tokens import DocBin

In [None]:
# Show the version of the spaCy package that was actually imported.
print(spacy.__version__)

In [None]:
# Set the Hugging Face offline-mode environment variables for this kernel process.
# NOTE(review): `transformers` is already imported in the first cell. If the library
# reads TRANSFORMERS_OFFLINE at import time, this assignment comes too late for the
# kernel itself — confirm, and consider moving this cell above the imports.
%env TRANSFORMERS_OFFLINE=1
%env HF_DATASETS_OFFLINE=1

In [None]:
# Load the pre-existing spaCy transformer pipeline, on the GPU when one can be activated.
try:
    spacy.require_gpu()
except Exception as gpu_error:
    # Catch ordinary errors only: a bare `except:` would also trap
    # KeyboardInterrupt/SystemExit. The reason is reported rather than discarded,
    # and the notebook carries on without a GPU as before.
    print(f"GPU not found ({gpu_error})")

# The listed components are switched off at load time; further down this object is
# only used through nlp.make_doc(...) to build Doc objects from raw text.
nlp = spacy.load('en_core_web_trf', disable=["tagger","parser","attribute_ruler","lemmatizer"])

In [None]:
# Re-save the SciBERT model and tokenizer from the input directory into a local
# directory (`local_path`), which is later written into the training config.
name = "../input/scibert-scivocab-uncased"
local_path = "./scibert-scivocab-uncased"
model = AutoModel.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

# Record the maximum sequence length in the tokenizer's init kwargs before saving.
# Author's note: tokenizer.model_max_length can be changed on the fly, but that
# change is reportedly not written out with the saved config, hence init_kwargs.
tokenizer.init_kwargs["model_max_length"] = 512

# Write both artefacts to the same local directory.
tokenizer.save_pretrained(local_path)
model.save_pretrained(local_path)

In [None]:
# Read the pre-processed train/validation splits. The first CSV column is consumed
# as the index and then immediately discarded in favour of a fresh default index.
train = pd.read_csv(
    "../input/colerigde-processed-text/train.csv", index_col=0
).reset_index(drop=True)

validation = pd.read_csv(
    "../input/colerigde-processed-text/validation.csv", index_col=0
).reset_index(drop=True)

In [None]:
# Parse the stringified annotations in `dataset_index` back into Python objects.
# Values that are already lists/tuples are passed through unchanged, so re-running
# this cell after the column has been parsed no longer raises inside literal_eval.
# Anything else still goes through literal_eval and fails exactly as before.
def _parse_annotation(value):
    """Return `value` as a Python literal; already-parsed lists/tuples are returned as-is."""
    if isinstance(value, (list, tuple)):
        return value
    return literal_eval(value)

train['dataset_index'] = train['dataset_index'].apply(_parse_annotation)
validation['dataset_index'] = validation['dataset_index'].apply(_parse_annotation)

In [None]:
# Quick look at the training split: (rows, columns) first, then the leading rows.
print(train.shape)
train.head()

In [None]:
# Quick look at the validation split: (rows, columns) first, then the leading rows.
print(validation.shape)
validation.head()

In [None]:
# Display the `dataset_index` value stored under index label 0 of the training split.
train["dataset_index"][0]

In [None]:
# Turn every row of each split into a (text, {"entities": annotations}) pair.
TRAIN_DATA = [
    (record["Text"], {"entities": record["dataset_index"]})
    for _, record in train.iterrows()
]

VALIDATION_DATA = [
    (record["Text"], {"entities": record["dataset_index"]})
    for _, record in validation.iterrows()
]

# Print one sample pair from each split as a sanity check.
print(TRAIN_DATA[10])
print(VALIDATION_DATA[10])

In [None]:
# Convert the spaCy v2-style (text, annotations) pairs into a spaCy v3 DocBin file.

db = DocBin() # create a DocBin object
skipped_entities = 0 # start at zero so the reported total equals the number of dropped spans

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # character offsets plus label
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # no usable span for these offsets: count it and leave it out
            skipped_entities += 1
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

print(f"Skipped {skipped_entities} entities")

db.to_disk("train.spacy") # save the docbin object

In [None]:
# Convert the validation (text, annotations) pairs into a spaCy v3 DocBin file.
db = DocBin() # create a DocBin object
skipped_entities = 0 # start at zero so the reported total equals the number of dropped spans

for text, annot in tqdm(VALIDATION_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # character offsets plus label
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # no usable span for these offsets: count it and leave it out
            skipped_entities += 1
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

print(f"Skipped {skipped_entities} entities")

db.to_disk("validation.spacy") # save the docbin object

In [None]:
# Generate the starting training config "base_config.cfg" with spaCy's CLI, requesting
# an English pipeline made of "transformer" and "ner", optimised for accuracy, with the
# --gpu and --force flags set.
!python -m spacy init config "base_config.cfg" --lang="en" --pipeline=["transformer","ner"] --optimize="accuracy" --gpu --force

In [None]:
import configparser

# Point the generated config at the locally saved SciBERT weights.
#
# ConfigParser lowercases option names by default, which would rewrite any mixed-case
# option when the file is saved again. The lowercased "no", "l2" and "l2_is_weight_decay"
# entries used to be deleted after the fact, presumably for that reason. Keeping option
# names case-sensitive leaves every other setting exactly as generated, so nothing
# has to be removed.
config_path = "./base_config.cfg"

edit = configparser.ConfigParser()
edit.optionxform = str  # preserve the case of option names on read and write

# ConfigParser.read() silently skips missing files; fail with a clear message instead
# of a KeyError on the section lookup below.
if not edit.read(config_path):
    raise FileNotFoundError(f"Could not read {config_path}; run the 'spacy init config' cell first")

arch = edit['components.transformer.model']
arch["name"] = '"'+local_path+'"'  # the value is written with surrounding double quotes

# Write changes back to file
with open(config_path, 'w') as configfile:
    edit.write(configfile)

In [None]:
# Run spaCy's fill-config command on the edited base_config.cfg, writing the result to config.cfg
# (the file the training command below consumes).
!python -m spacy init fill-config "base_config.cfg" "config.cfg"

In [None]:
# Train from config.cfg on GPU 0, reading the two DocBin files written earlier and
# writing results under ./output (the best model is loaded from there afterwards).
# The dotted flags pass per-run values for the matching config settings:
# batch size 64, dropout 0.2, patience 0, at most 9000 steps, progress bar enabled.
!python -m spacy train "config.cfg" \
    --output "./output" \
    --paths.train "train.spacy" \
    --paths.dev "validation.spacy" \
    --gpu-id 0 \
    --nlp.batch_size 64 \
    --training.dropout 0.2 \
    --training.patience 0 \
    --training.max_steps 9000 \
    --training.logger.progress_bar True

In [None]:
# Load the best model written by the training run.
# Unlike the earlier load cell, require_gpu() is not wrapped in try/except here,
# so any error it raises stops this cell.
spacy.require_gpu()
nlp_best = spacy.load(R"./output/model-best")

In [None]:
# Smoke-test the trained pipeline on one hard-coded sample passage and print every
# predicted entity as a (text, label) pair.
doc = nlp_best('The supply of PCR reagents, trained lab personnel and the availability of laboratories with sufficient biocontainment levels are major challenges of SARS-CoV-2 detection in developing countries, such as Indonesia (Younes et al., 2020) . Therefore, it is not surprising that the tested people per week is still lower than the World Health Organization (WHO) standard (World Health Organization, 2020b) . Recently, SARS-CoV-2 with the D614G mutation became the most frequently detected globally, including South East Asia region (Korber et al., 2020; Nguyen et al., 2020) . Interestingly, SARS-CoV-2 with the G614 variant had significantly higher infectious titers than the original D614 virus, and COVID-19 patients with the G614 variant had a higher viral load than patients without the mutation (Korber et al., 2020) . A recent study showed that the SARS-CoV-2 with the G614 variant revealed increased infectivity, competitive fitness, and transmission than the wild-type D614 virus in human airway epithelial cells and hamster (Hou et al., 2020) . However, this mutation was not associated with the severity of COVID-19 (Korber et al., 2020; Nguyen et al., 2020) . Here, we aimed:\n(1) to report full-length genome sequences of SARS-CoV-2 collected from four COVID-19 patients in the Special Region of Yogyakarta and Central Java provinces, Indonesia;\n(2) to compare the clade distribution of full-length genome sequences from Indonesia (n = 60) from March to September 2020; and (3) to perform phylogenetic analysis of SARS-CoV-2 complete genomes from different countries, including Indonesia.') # input sample text
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Render the entities of the `doc` produced by the smoke-test cell inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter