In [1]:
import spacy
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
import string


In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
news_data_df = pd.read_csv('../../COVID-19/Data/all_documents_till_29th_March.csv')
news_data_df.head()

Unnamed: 0,text
0,i sincerely hope the uk public unite boycott ...
1,Imagine if doctors or nurses withheld their s...
2,Garrett Fedl govt is blocking shipments from ...
3,langer today we discharged our th patient fro...
4,peloton races berocca doses calling fans how e...


In [4]:
news_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6658 entries, 0 to 6657
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6653 non-null   object
dtypes: object(1)
memory usage: 52.1+ KB


In [5]:
tweets_df = pd.read_csv('../../COVID-19/Data/tweets_with_location_raw_16th_April.csv')
tweets_df.head()

Unnamed: 0,geo,geo_enabled,tweet,user_handle,location
0,,True,ବିନାଶ କାଳେ ବିପରୀତ ବୁଦ୍ଧି ।\nPeople order as mu...,SanjitaPanigra2,"Bhubaneshwar, India"
1,,False,"In the harsh times of this #Pandemic, Atom urg...",AtomTech_India,"Mumbai, Maharashtra"
2,,True,"@barandbench @MoHFW_INDIA Next, a petition dem...",kochattil,Hyderabad
3,,True,9 more #COVID19 cases reported in Haryana toda...,ANI,India
4,,True,Self-discipline and self-motivation is the key...,ecosmob,India


In [6]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   geo          1 non-null      object
 1   geo_enabled  206 non-null    bool  
 2   tweet        206 non-null    object
 3   user_handle  206 non-null    object
 4   location     206 non-null    object
dtypes: bool(1), object(4)
memory usage: 6.8+ KB


In [7]:
# Drop the tweet metadata columns. Rebinding with errors='ignore' (instead of
# an in-place drop) keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
tweets_df = tweets_df.drop(columns=['geo','user_handle','location'], errors='ignore')
# Stack the news texts and the tweets into one 'text' column with a fresh
# 0..n-1 index.
documents = pd.concat(
    [news_data_df.text, tweets_df.tweet], axis=0, ignore_index=True
).to_frame(name='text')
documents.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6864 entries, 0 to 6863
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6859 non-null   object
dtypes: object(1)
memory usage: 53.8+ KB


In [8]:
# Shuffle all Data
# documents = documents.sample(frac=1).reset_index(drop=True)

In [9]:
# documents.reset_index(inplace=True)

In [10]:
# Remove HTML Tags
def remove_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed by BeautifulSoup with the ``lxml`` parser and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

# Remove Punctuations
def punctuation_remover(text):
    """Join the tokens of ``text`` with single spaces, dropping punctuation.

    A token is dropped when every character in it belongs to
    ``string.punctuation``; empty tokens are dropped as well.  The previous
    test (``word not in string.punctuation``) was a substring check, so it
    only caught tokens that occur contiguously inside that constant and let
    tokenizer output such as ``...`` or ``!!`` through.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.  A plain string is iterated character by character.

    Returns
    -------
    str
        The surviving tokens joined by a single space.
    """
    punctuation_chars = set(string.punctuation)
    kept_tokens = [
        word for word in text
        # all() over an empty token is True, so empty strings are removed too.
        if not all(char in punctuation_chars for char in word)
    ]
    return " ".join(kept_tokens)


# Stop Word Removal
cached_stop_words = stopwords.words('english') # Provides 70 X Speedup
# Hash-based view of the same words: each membership test is O(1) instead of
# a linear scan over the list. Rebuild it if cached_stop_words is modified.
_stop_word_lookup = frozenset(cached_stop_words)

def stop_words_remover(text):
    """Return the items of ``text`` that are not in the cached stop-word list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.  Matching is exact (case-sensitive), so callers
        lower-case the tokens first if that is what they need.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    return [word for word in text if word not in _stop_word_lookup]

# Convert to lower case
def convert_to_lowercase(tokens):
    """Return a new list with every token lower-cased and whitespace-stripped."""
    return [token.lower().strip() for token in tokens]

# Lemmatization
def lemmatize_words(text):
    """Lemmatize ``text`` with the notebook-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str or list/tuple of str
        Raw text, or an already tokenized sequence.  A sequence is joined
        with spaces before parsing; previously ``str(text)`` turned a token
        list into its Python repr (``"['a', 'b']"``), so brackets, quotes and
        commas were fed to spaCy as if they were part of the document.

    Returns
    -------
    list of str
        The lemma of each spaCy token, skipping tokens whose lemma is
        ``'-PRON-'`` (presumably spaCy's pronoun placeholder lemma - confirm
        for the installed spaCy version).
    """
    if isinstance(text, (list, tuple)):
        text = " ".join(str(token) for token in text)
    words = nlp(str(text))
    return [word.lemma_ for word in words if word.lemma_ != '-PRON-']

def replace_urls(tokens):
    """Return ``tokens`` with every URL-like substring replaced by a space.

    Parameters
    ----------
    tokens : str
        The text to scan.  Despite the parameter name this is a single
        string, not a token list (it is handed straight to ``re.sub``).

    Returns
    -------
    str
        The substituted text.  The original version discarded the result of
        ``re.sub`` and returned its input untouched, so URLs were never
        removed.
    """
    return re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", tokens)


def remove_short_strings(text, length):
    """Keep only the items of ``text`` whose ``len`` is strictly greater than ``length``."""
    return [item for item in text if len(item) > length]

In [11]:
def clean_text_for_tasks(text, for_pos_tagging = False):
    """Run the notebook's cleaning pipeline on one raw document.

    The text goes through ``remove_html``, ``replace_urls`` and
    ``word_tokenize``.  Unless ``for_pos_tagging`` is truthy, the tokens are
    then passed through ``convert_to_lowercase``, ``stop_words_remover`` and
    ``lemmatize_words``.  Finally ``remove_short_strings(..., 2)`` and
    ``punctuation_remover`` are applied.

    Parameters
    ----------
    text : str
        Raw document text.
    for_pos_tagging : bool, default False
        When truthy, casing, stop words and surface forms are preserved
        (the three normalisation steps are skipped).

    Returns
    -------
    The value returned by ``punctuation_remover`` for the remaining tokens.
    """
    tokens = word_tokenize(replace_urls(remove_html(text)))

    # Truthiness instead of ``is False``: the identity test silently skipped
    # normalisation for falsy non-bool flags such as 0, None or
    # numpy.bool_(False).
    if not for_pos_tagging:
        tokens = convert_to_lowercase(tokens)
        tokens = stop_words_remover(tokens)
        tokens = lemmatize_words(tokens)

    tokens = remove_short_strings(tokens, 2)
    return punctuation_remover(tokens)

In [12]:
from tqdm.notebook import tqdm_notebook
# Older tqdm releases: from tqdm._tqdm_notebook import tqdm_notebook
# Registers the progress_apply method on pandas objects.
tqdm_notebook.pandas()
# Rows without text cannot be cleaned.
documents.dropna(inplace=True)
# Use progress_apply so the tqdm integration registered above actually shows
# a progress bar; plain .apply ran silently.
documents['cleaned_text']  = documents.text.progress_apply(lambda x : clean_text_for_tasks(x ,for_pos_tagging=True))

In [13]:
documents.head()

Unnamed: 0,text,cleaned_text
0,i sincerely hope the uk public unite boycott ...,sincerely hope the public unite boycott are mo...
1,Imagine if doctors or nurses withheld their s...,Imagine doctors nurses withheld their services...
2,Garrett Fedl govt is blocking shipments from ...,Garrett Fedl govt blocking shipments from Chin...
3,langer today we discharged our th patient fro...,langer today discharged our patient from our c...
4,peloton races berocca doses calling fans how e...,peloton races berocca doses calling fans how e...


In [14]:
# NOTE(review): the shuffled frame is only displayed here; it is not assigned,
# so `documents` itself keeps its original row order (see the next .head()).
documents.sample(frac=1).reset_index(drop=True)

Unnamed: 0,text,cleaned_text
0,looking at the trend of covid cases in malays...,looking the trend covid cases malaysia and wfh...
1,kr shahi coz journos may ask y u did nt put p...,shahi coz journos may ask did put preventive mea
2,Today I told a year old that he needs intubat...,Today told year old that needs intubation was ...
3,eng to protect yourself and others stay away ...,eng protect yourself and others stay away from...
4,astonished that doctors in ireland have recei...,astonished that doctors ireland have received ...
...,...,...
6854,In Spain things are so bad docs are killing t...,Spain things are bad docs are killing the elde...
6855,NISHTHA Be Socially Responsible Follow HomeQu...,NISHTHA Socially Responsible Follow HomeQuaran...
6856,the washoe county health district repoed new ...,the washoe county health district repoed new c...
6857,breaking actor gave lakhs for fefsi workers w...,breaking actor gave lakhs for fefsi workers wh...


In [15]:
documents.head()

Unnamed: 0,text,cleaned_text
0,i sincerely hope the uk public unite boycott ...,sincerely hope the public unite boycott are mo...
1,Imagine if doctors or nurses withheld their s...,Imagine doctors nurses withheld their services...
2,Garrett Fedl govt is blocking shipments from ...,Garrett Fedl govt blocking shipments from Chin...
3,langer today we discharged our th patient fro...,langer today discharged our patient from our c...
4,peloton races berocca doses calling fans how e...,peloton races berocca doses calling fans how e...


In [16]:
from spacy.matcher import PhraseMatcher

phrase_matcher = PhraseMatcher(nlp.vocab,attr='LOWER')
phrases = [ 'cure' ,'isolation', 'lockdown','quarantine','vaccine', 'positive','tests' ]
# phrases = ['mutation']
label = 'COVIDACTION'

patterns = [nlp(text) for text in phrases]

In [17]:
phrase_matcher.add(label,None,*patterns)

In [18]:
test_doc = "A leading public health expert said Friday that Britain likely has the highest coronavirus death rate in Europe due to what he described as “system errors,” while the government defended its record in responding to the pandemic.Anthony Costello, director of the Institute for Global Health at University College London, said the U.K. “could see 40,000 deaths” by the time the first wave of the country's outbreak is over. This virus has been in India for 3 months, the mutation doesn’t happen too quickly. Whatever vaccine comes out now, it will work in the future as well"

In [19]:
test_match = nlp(test_doc)
matches = phrase_matcher(test_match)
[match for match in matches]

[(1621438952046511751, 98, 99)]

In [20]:
match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], test_match[start:end])

COVIDACTION vaccine


In [21]:
# Cleaned sentences that mention any of the three keywords (case-insensitive).
sample_sentences = documents.loc[
    lambda d: (
        d['cleaned_text'].str.lower().str.contains("isolation")
        | d['cleaned_text'].str.lower().str.contains("vaccine")
        | d['cleaned_text'].str.lower().str.contains("hospital")
    ),
    'cleaned_text',
].tolist()

In [22]:
len(sample_sentences)

312

In [23]:
import random

sample_sentences_for_training = random.choices(sample_sentences,k = 4)

In [24]:
sample_sentences_for_training

['this isolation period making realise that need work fkn hard and buy myself house where can live',
 'lying was sent the covid isolation ward room major hospital from separate urgent care facility',
 'during the third pandemic the bubonic plague savitribai carried patient her back the hospital the process',
 'context overburdened hospital medical workers will have quarantined for possible infection while the deliv']

In [25]:
# with open('sample_training.txt', 'w') as f:
#     for item in sample_sentences_for_training:
#         f.write("%s\n" % item)
# f.close()       

### spaCy Training Data Annotator
 https://medium.com/@manivannan_data/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6

In [26]:
training_data = [('Germany has accepted nearly Italian COVID patients far Most them were airlifted hospitals speci\r', {'entities': [(70, 79, 'COVID_ACTION')]}),
('Kashmir people who came contact with Covid-19 patient put quarantineForty-eight people who had come contact with the first coronavirus fatality Kashmir are quarantine and authorities are touch with the states visited the man track those who may have been infected him official said Thursday The 65-year-old man hailing from Hyderpora area the city here died Thursday morning becoming the first fatality Jammu and Kashmir due coronavirus Four his contacts the valley all from north Kashmir Bandipora district also tested positive Wednesday The person attended religious gatherings New Delhi Deoband Uttar Pradesh Jammu Samba Sopore and then came back here Srinagar and was admitted chest diseases hospital where expired Director Health Services Kashmir Samir Matoo told PTI\r', {'entities': [(513, 528, 'COVID_TESTS'), (681, 695, 'COVID_TESTS'), (696, 704, 'COVID_TREATMENT')]}),
('rapid access neutronscience instruments resources for researchers working medicines vaccine for', {'entities': [(84,91, 'COVID_TREATMENT')]}),('update last evening the executive vice chairperson apollo hospitals preetha reddy met with health officials', {'entities': [(58, 67, 'COVID_TREATMENT'), (91, 107, 'COVID_ACTION')]}),
('Nashik District administration Ahmednagar releases first positive person after his test turned out negative following the treatment given isolation ward Ahmednagar The patients now have remain home quarantine for next days', {'entities': [(138, 147, 'COVID_ACTION'), (198, 208, 'COVID_ACTION'), (99, 107, 'COVID_TESTS'), (57, 65, 'COVID_TESTS')]}),
('India sends first shipment of 5.5 million tablets of hydroxychloroquine to UAE, another consignment enroute to Nepal', {'entities': [(53, 71, 'COVID_TREATMENT'), (42, 49, 'COVID_TREATMENT')]}),
('Ambassador of India to Afghanistan Vinay Kumar today handed over the consignments of 300,000 tablets of hydroxychloroquine and 70,000 tablets of paracetamol tablets to Minister of Public Health Ferozuddin Feroz: Embassy of India, Kabul', {'entities': [(104, 122, 'COVID_TREATMENT'), (145, 156, 'COVID_TREATMENT')]})]

In [27]:
from __future__ import unicode_literals, print_function
import random
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse 

def train_spacy_model(training_data,n_iter,model=None):
    """Train a spaCy NER component on ``training_data`` and return the pipeline.

    Parameters
    ----------
    training_data : iterable of (str, dict)
        ``(text, {'entities': [(start_char, end_char, label), ...]})`` pairs.
        The caller's object is left untouched; a private copy is shuffled.
    n_iter : int
        Number of passes over the training examples.
    model : str or None, default None
        Name or path handed to ``spacy.load``.  When ``None`` a blank English
        pipeline is created instead.

    Returns
    -------
    The spaCy pipeline object after training.

    Side effects
    ------------
    Prints the loss after every pass, then the entities predicted for a fixed
    test sentence and for every training text.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner,last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Private copy: random.shuffle below must not reorder the caller's list,
    # and a one-shot iterable must survive being traversed several times.
    examples = list(training_data)

    # Always register the three project labels, then any extra label that
    # appears in the annotations.
    for entity_label in ('COVID_TREATMENT', 'COVID_ACTION', 'COVID_TESTS'):
        ner.add_label(entity_label)
    for _, annotations in examples:
        for ent in annotations.get('entities', ()):
            ner.add_label(ent[2])

    # Following the spaCy v2 training examples: begin_training() is for
    # freshly created components, while a loaded pipeline that already has
    # an NER component continues with resume_training() so its existing
    # weights are not re-initialised.
    if model is None or ner_is_new:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(examples)
            batches = minibatch(examples, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # Quick qualitative check on one unseen sentence...
    test_sentence = 'zara owner offers make scrubs for spain hospitals covid esp covid'
    doc = nlp(test_sentence)
    print('Entities',[(ent.text,ent.label_) for ent in doc.ents])

    # ...and on the training texts themselves.
    for text, _ in examples:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    return nlp




In [28]:
new_trained_model = train_spacy_model(training_data=training_data,n_iter = 20)

Created blank 'en' model
Losses {'ner': 204.56548881530762}
Losses {'ner': 99.23289455380291}
Losses {'ner': 29.79086407656081}
Losses {'ner': 28.943086289350504}
Losses {'ner': 28.276060064439662}
Losses {'ner': 25.10290589954093}
Losses {'ner': 22.19757759727395}
Losses {'ner': 19.064093712849882}
Losses {'ner': 24.205701709985345}
Losses {'ner': 18.075559342823478}
Losses {'ner': 14.761023444001152}
Losses {'ner': 12.725316102060505}
Losses {'ner': 14.0310277220158}
Losses {'ner': 11.202549027475477}
Losses {'ner': 29.00807052020705}
Losses {'ner': 11.506148487423761}
Losses {'ner': 6.346608908398621}
Losses {'ner': 8.513994474139569}
Losses {'ner': 7.23219952023791}
Losses {'ner': 4.860264432078506}
Entities [('scrubs', 'COVID_TREATMENT')]
Entities [('hydroxychloroquine', 'COVID_TREATMENT'), ('paracetamol', 'COVID_TREATMENT')]
Entities [('hospitals', 'COVID_TREATMENT'), ('health officials', 'COVID_ACTION')]
Entities [('positive', 'COVID_TESTS'), ('negative', 'COVID_TESTS'), ('isola

In [29]:
random.choices(sample_sentences)

['innovators help beat social isolation deliver remote care and tackle mental health issues launched tech challenge']

In [30]:
doc = new_trained_model('Nashik District administration Ahmednagar releases first positive person after his test turned out negative following the treatment given isolation ward Ahmednagar The patients now have remain home quarantine for next days')

print('Entities',[(ent.text,ent.label_) for ent in doc.ents])

Entities [('positive', 'COVID_TESTS'), ('negative', 'COVID_TESTS'), ('isolation', 'COVID_ACTION'), ('quarantine', 'COVID_ACTION')]


In [40]:
from spacy import displacy
doc2 = new_trained_model('India sends first shipment of 5.5 million tablets of hydroxychloroquine to UAE, another consignment enroute to Nepal')
print('Entities',[(ent.text,ent.label_) for ent in doc2.ents])


# colors = {"COVID_TESTS": "#FFC300"}
# options = {"ents": ["COVID_TESTS"], "colors": colors}
# displacy.serve(doc2, style="ent")

Entities [('tablets', 'COVID_TREATMENT'), ('hydroxychloroquine', 'COVID_TREATMENT')]


### Save the model

In [45]:
def save_model(model,new_model_name,output_dir):
    """Rename ``model`` and serialise it to ``output_dir``.

    Parameters
    ----------
    model
        Object exposing a ``meta`` mapping and a ``to_disk(path)`` method
        (the trained spaCy pipeline in this notebook).
    new_model_name : str
        Value stored under ``model.meta['name']``.
    output_dir : str or path-like
        Target directory.  It is created, including missing parent
        directories, before the model is written, so nested paths do not
        depend on the serializer creating them.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    model.meta['name'] = new_model_name  # rename model
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    model.to_disk(output_dir)
    print("Saved model to", output_dir)

In [46]:
save_model(new_trained_model,'en_covid','./model/')

Saved model to ./model/


In [47]:
# Test the saved model
test_sentence = 'India sends first shipment of 5.5 million tablets of hydroxychloroquine to UAE, another consignment enroute to Nepal'
print(test_sentence)
print("Loading from", './model/')
nlp2 = spacy.load('./model/')
doc2 = nlp2(test_sentence)
print('Entities',[(ent.text,ent.label_) for ent in doc2.ents])

India sends first shipment of 5.5 million tablets of hydroxychloroquine to UAE, another consignment enroute to Nepal
Loading from ./model/
Entities [('tablets', 'COVID_TREATMENT'), ('hydroxychloroquine', 'COVID_TREATMENT')]


### Creating training data using Matcher

In [105]:
def covid_action_pattern():
    """Return the token patterns used for the COVID_ACTION matcher rule.

    Seven single-token patterns that match on the lower-cased token text,
    followed by one multi-token pattern for "social distancing" that allows
    an optional punctuation token between the two words.
    """
    single_terms = ('cure', 'isolation', 'lockdown', 'quarantine',
                    'positive', 'tests', 'curfew')
    patterns = [[{'LOWER' : term}] for term in single_terms]
    patterns.append([
        {'LOWER' : 'social'},
        {'IS_PUNCT':True, 'OP': '?'},
        {'LOWER' : 'distancing'},
    ])
    return patterns

In [106]:
def covid_treatment_pattern():
    """Return the single-token patterns for the COVID_TREATMENT matcher rule.

    Each pattern matches one token by its lower-cased text.
    """
    treatment_terms = ('paracetamol', 'vaccine', 'tablets', 'medicine')
    return [[{'LOWER' : term}] for term in treatment_terms]

In [129]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab, validate=True)
matcher.add("COVID_ACTION", None, *covid_action_pattern())
# matcher.add("COVID_TREATMENT", None, *covid_treatment_pattern())

In [130]:
def parse_train_data(doc,entity_to_tag):
    """Turn the matches of the notebook-level ``matcher`` on ``doc`` into a training example.

    Every match is converted to a ``(start_char, end_char, entity_to_tag)``
    tuple; the same label is used for all matches.  Returns
    ``(doc.text, {'entities': [...]})``.
    """
    detections = []
    for _match_id, start, end in matcher(doc):
        span = doc[start:end]
        detections.append((span.start_char, span.end_char, entity_to_tag))
    return (doc.text, {'entities': detections})

text = 'Germany has accepted nearly Italian COVID patients far Most them were airlifted hospitals  came contact with Covid-19 patient put quarantine Forty-eight people who had come contact with thes are touch with the states visited the man track those who may have been infected are being given medicine him official saity here died Thursday morning becoming the first fatality Jammu and Kashmir due coronavirus Four also tested positive Wednesday The person attended religious gatherings New Delhi admitted chest diseases hospital where expired Director Health Services Kashmir'
parse_train_data(nlp(text),'COVID_ACTION')

('Germany has accepted nearly Italian COVID patients far Most them were airlifted hospitals  came contact with Covid-19 patient put quarantine Forty-eight people who had come contact with thes are touch with the states visited the man track those who may have been infected are being given medicine him official saity here died Thursday morning becoming the first fatality Jammu and Kashmir due coronavirus Four also tested positive Wednesday The person attended religious gatherings New Delhi admitted chest diseases hospital where expired Director Health Services Kashmir',
 {'entities': [(130, 140, 'COVID_ACTION'), (422, 430, 'COVID_ACTION')]})

In [112]:
text[288:296]

'medicine'

In [126]:
import spacy
from spacy.pipeline import EntityRuler

# NOTE(review): this rebinds the notebook-wide `nlp`, which lemmatize_words
# and other cells that call nlp(...) also use; cells run after this one see
# the pipeline with the ruler attached.
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)
# One single-token rule: a token whose lower-cased text is any of the listed
# words gets the label COVID_ACTION.
patterns = [{"label": "COVID_ACTION", 
            "pattern": [{"LOWER": {"IN": ["quarantine", "curfew", "positive"]}}]          
            }]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

# `text` is the sample string defined in the cell that calls parse_train_data.
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])

[('Germany', 'GPE'), ('Italian', 'NORP'), ('COVID', 'ORG'), ('quarantine', 'COVID_ACTION'), ('Forty-eight', 'CARDINAL'), ('Thursday', 'DATE'), ('morning', 'TIME'), ('first', 'ORDINAL'), ('Jammu', 'GPE'), ('Kashmir', 'LOC'), ('Four', 'CARDINAL'), ('positive', 'COVID_ACTION'), ('Wednesday', 'DATE'), ('New Delhi', 'GPE')]
