In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from nltk.tokenize import sent_tokenize

In [2]:
# Load the spaCy pipeline shipped in the installed `en_core_web_md` package;
# `nlp` is the callable used below to parse raw text.
import en_core_web_md

nlp = en_core_web_md.load()

In [3]:
def seg2sent(seg):
    """Render a sequence of tokens as a single string.

    Every token's ``text`` is followed by one space, so a non-empty result
    always ends with a trailing space and an empty sequence yields ``""``.

    Args:
        seg: Iterable of objects exposing a string ``text`` attribute.

    Returns:
        The concatenated, space-terminated token texts.
    """
    return "".join(tok.text + " " for tok in seg)

In [4]:
def make_list_spo(doc):
    """Extract source / predicate / opinion triples from parsed spaCy tokens.

    Each token whose dependency label is ``ccomp`` is taken as the root of an
    opinion clause. Its head is the principal predicate, and the ``nsubj`` of
    that predicate is the source.

    Args:
        doc: Iterable of parsed spaCy tokens, e.g. a ``Doc`` or a sentence
            ``Span``.

    Returns:
        A list with one dict per ``ccomp`` token, in iteration order. Each
        dict maps ``'opinion'``, ``'predicat'`` and ``'source'`` to lists of
        tokens:

        * ``'opinion'``: the full subtree of the ``ccomp`` token;
        * ``'predicat'``: the head of the ``ccomp`` surrounded by its
          ``prt`` / ``aux`` / ``neg`` / ``advmod`` children, left ones first;
        * ``'source'``: the subtree(s) of the predicate's ``nsubj``; may be
          empty when no subject is found.

        The list is empty when there is no ``ccomp``; in that case the
        message ``"No opinion found !"`` is printed as well.
    """
    # Dependency labels of the particles/auxiliaries kept with the main verb.
    modifier_deps = ('prt', 'aux', 'neg', 'advmod')

    def subject_tokens(verb):
        # All tokens of every nominal subject attached directly to `verb`.
        return [x for tok in verb.children if tok.dep_ == 'nsubj' for x in tok.subtree]

    list_spo = []
    for ccomp in [token for token in doc if token.dep_ == 'ccomp']:
        verb = ccomp.head  # principal predicate governing the opinion clause

        predicat = [tok for tok in verb.lefts if tok.dep_ in modifier_deps]
        predicat.append(verb)
        predicat.extend(tok for tok in verb.rights if tok.dep_ in modifier_deps)

        # Prefer the predicate's own subject. Only when it has none and is a
        # coordinated verb ("X said ... and claimed ...") borrow the subject
        # of the conjunct it is attached to, climbing the whole conj chain.
        # The `head != holder` guard stops at a self-headed root token.
        holder = verb
        source = subject_tokens(holder)
        while not source and holder.dep_ == 'conj' and holder.head != holder:
            holder = holder.head
            source = subject_tokens(holder)

        list_spo.append({
            'opinion': list(ccomp.subtree),
            'predicat': predicat,
            'source': source,
        })

    if not list_spo:
        print("No opinion found !")

    return list_spo

In [5]:
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
# The "Unnamed: 0" column is discarded right after loading
# (presumably a saved index column — confirm against the CSV).
df = pd.read_csv(
    "/Users/rubing/Documents/Donnees/covax_content_sample_2000_medium_sized.csv"
).drop(columns="Unnamed: 0")

In [6]:
# Article under study: the `extracted_text` value at row label 2.
TEXT = df.loc[2, "extracted_text"]
print(TEXT)

People who have gotten the new coronavirus and recovered can get it again in the future, health authorities say — the body does not become immune after infection.
On Wednesday, Japanese authorities reported the first confirmed case of reinfection. A tour guide in Osaka first tested positive for the coronavirus in late January, then was discharged from the hospital three weeks ago after showing signs of recovery. But she returned to the hospital after developing a sore throat and chest pain and tested positive for the coronavirus once again.
Zhan Qingyuan, director of pneumonia prevention and treatment at the China-Japan Friendship Hospital, warned last month that this could  happen.
"For those patients who have been cured, there is a likelihood of a relapse," Zhan said in a briefing on January 31 . "The antibody will be generated; however, in certain individuals, the antibody cannot last that long."
Reinfections among patients in China have been reported as well.
In total, the coronavi

Text 2: sentences 0, 4, 5, 6, 17, 20, 22, 23, 28

In [9]:
# Parse the selected article and keep its sentence spans in a list so that
# individual sentences can be picked by position.
DOC = nlp(TEXT)
sents = list(DOC.sents)

In [10]:
# Position (within `sents`) of the sentence to inspect.
i = 0

# `sent` is reused by the following cells (extraction and parse rendering).
sent = sents[i]
print(sent)

People who have gotten the new coronavirus and recovered can get it again in the future, health authorities say — the body does not become immune after infection.



In [11]:
# Show the sentence under inspection, then every extracted triple.
print(sent)
print('\n')


list_spo = make_list_spo(sent)

# Iterate directly over the triples: the enumerate index was never used, and
# binding it to `i` silently overwrote the notebook-level sentence index `i`.
for spo in list_spo:
    print(f"Source: {seg2sent(spo['source'])}")
    print(f"Predicat: {seg2sent(spo['predicat'])}")
    print(f"Opinion: {seg2sent(spo['opinion'])}")
    print("\n")

People who have gotten the new coronavirus and recovered can get it again in the future, health authorities say — the body does not become immune after infection.



Source: health authorities 
Predicat: say 
Opinion: People who have gotten the new coronavirus and recovered can get it again in the future 


Source: health authorities 
Predicat: say 
Opinion: the body does not become immune after infection 




In [12]:
# Draw the dependency parse of the selected sentence using the compact layout.
displacy.render(sent,style='dep',options={'compact':True})

In [321]:
# Run the extraction on every sentence of the article, printing each sentence
# (prefixed with its position) followed by the triples found in it.
DOC = nlp(TEXT)
for i,sent in enumerate(DOC.sents):
    print('\n')
    print(f"{i}: {sent.text}")
    print('\n')
    list_spo = make_list_spo(sent)
    # No enumerate here: the inner index was unused, and rebinding `i`
    # shadowed the sentence index of the enclosing loop.
    for spo in list_spo:
        print(f"Source: {seg2sent(spo['source'])}")
        print(f"Predicat: {seg2sent(spo['predicat'])}")
        print(f"Opinion: {seg2sent(spo['opinion'])}")
        print("\n")



0: People who have gotten the new coronavirus and recovered can get it again in the future, health authorities say — the body does not become immune after infection.



Source: health authorities 
Predicat: say 
Opinion: People who have gotten the new coronavirus and recovered can get it again in the future 


Source: health authorities 
Predicat: say 
Opinion: the body does not become immune after infection 




1: On Wednesday, Japanese authorities reported the first confirmed case of reinfection.


No opinion found !


2: A tour guide in Osaka first tested positive for the coronavirus in late January, then was discharged from the hospital three weeks ago after showing signs of recovery.


No opinion found !


3: But she returned to the hospital after developing a sore throat and chest pain and tested positive for the coronavirus once again.



No opinion found !


4: Zhan Qingyuan, director of pneumonia prevention and treatment at the China-Japan Friendship Hospital, warned last m