In [121]:
import re
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [122]:
# get text from html and parse it
response = requests.get('http://www.bbc.com/news')
doc = BeautifulSoup(response.text, 'html.parser')
headlines = doc.find_all('h3')
stories_list = []

for headline in headlines:
    print(headline.text)
    stories_list.append(headline.text)

# Now that we're done, convert to a CSV and save.
# If you don't use index=False, you'll get an ugly dataframe!
import pandas as pd
df = pd.DataFrame(stories_list)
df.to_csv("bbc.csv", index=False)

FBI took top secret documents from Trump's home
FBI took top secret documents from Trump's home
Rushdie on ventilator and may lose eye after attack
Salman Rushdie: The writer who emerged from hiding
'We have lost a bright light' - Anne Heche family
Drought highlights dangers for electricity supply
One killed as high winds hit Spanish festival
Gunman kills 11 after family dispute in Montenegro
US probes Southern Baptist sex abuse claims
England's drought could last into next year
Kenya elections 2022: Live result updates
King Goat removed from 'throne' as Irish heat soars
Kenya elections 2022: Live result updates
King Goat removed from 'throne' as Irish heat soars
Suspected bank robber rescued from Rome tunnel
The man caring for an island loved by Star Wars fans
Ibiza: The birth of the 'party island'
BBC World News TV
BBC World Service Radio
We fled the Taliban for our Olympic dream
Defying jihadists to go to school in Mozambique
Volcanoes and supermoons: Photos of the week
The ravers c

In [127]:
# import wikipedia sentences, they have 4318 rows
headlines = pd.read_csv("bbc.csv")
headlines.shape

(49, 1)

In [128]:
headlines

Unnamed: 0,0
0,FBI took top secret documents from Trump's home
1,FBI took top secret documents from Trump's home
2,Rushdie on ventilator and may lose eye after attack
3,Salman Rushdie: The writer who emerged from hiding
4,'We have lost a bright light' - Anne Heche family
5,Drought highlights dangers for electricity supply
6,One killed as high winds hit Spanish festival
7,Gunman kills 11 after family dispute in Montenegro
8,US probes Southern Baptist sex abuse claims
9,England's drought could last into next year


In [129]:
# exact sentence example
nlp = spacy.load('en_core_web_sm')

doc = nlp("Salman Rushdie: The writer who emerged from hiding")

for tok in doc:
  print(tok.text, "...", tok.dep_)

Salman ... compound
Rushdie ... ROOT
: ... punct
The ... det
writer ... appos
who ... nsubj
emerged ... relcl
from ... prep
hiding ... pobj


In [130]:
def get_entities(sent):

  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
      
   #############################################################

  return [ent1.strip(), ent2.strip()]

In [None]:
get_entities("Salman Rushdie: The writer who emerged from hiding.")

['Salman who', 'hiding']

In [None]:
entity_pairs = []

In [None]:
entity_pairs[10:20]

[]

In [131]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", [pattern], on_match=None) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [132]:
# verb
get_relation("Salman Rushdie: The writer who emerged from hiding.")

'Rushdie'

In [133]:
# the column name is 0
relations = [get_relation(i) for i in tqdm(headlines['0'])]

100%|██████████| 49/49 [00:00<00:00, 97.96it/s]


In [134]:
# verb count
pd.Series(relations).value_counts()[:50]

took top           2
updates            2
removed from       2
city of            1
patients in        1
visit              1
Meet               1
language           1
problem for        1
villages           1
puppet             1
Is dark            1
staying at         1
follow             1
have               1
register new       1
cut at             1
Originals on       1
leaves             1
revealed           1
knocks             1
newsletter         1
app                1
Predicting         1
condemn            1
lose               1
rescued from       1
Rushdie            1
lost               1
highlights         1
hit Spanish        1
kills              1
probes Southern    1
last into next     1
man                1
know about         1
Ibiza              1
TV                 1
Radio              1
fled               1
go to              1
Volcanoes          1
ravers             1
be                 1
Get in             1
dtype: int64

In [None]:
import nltk  

textsample ="Kenya elections 2022: Live result updates. King Goat removed from 'throne' as Irish heat soars. Kenya elections 2022: Live result updates. Drought on the Rhine: 'We have 30cm of water left'. Hacking not behind Kenya’s slow count - commission. Nuclear warning and clean-up raves - Ukraine round-up."  

sentences = nltk.sent_tokenize(textsample)  
words = nltk.word_tokenize(textsample)  
sentences 
[w for w in words if w.isalpha()]

In [None]:
sentences

['Kenya elections 2022: Live result updates.',
 "King Goat removed from 'throne' as Irish heat soars.",
 'Kenya elections 2022: Live result updates.',
 "Drought on the Rhine: 'We have 30cm of water left'.",
 'Hacking not behind Kenya’s slow count - commission.',

In [None]:
a_list = nltk.tokenize.sent_tokenize(textsample)

In [None]:
print(a_list)



In [None]:
# extract subject
source = [i[0] for i in entity_pairs]
# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

ValueError: ignored

In [None]:
# Make a search with the Wikipedia API

In [None]:
# create a directed-graph from a dataframe
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

NameError: ignored

In [None]:
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='tan', edge_cmap=plt.cm.Greys, pos = pos)
plt.show()

NameError: ignored

<Figure size 864x864 with 0 Axes>

In [None]:
H=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="composed by"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(H, k = 0.9) # k regulates the distance between nodes
nx.draw(H, with_labels=True, node_color='grey', node_size=1500, edge_cmap=plt.cm.Greys, pos = pos)
plt.show()

NameError: ignored

In [None]:
I=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="written by"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(I, k = 0.5)
nx.draw(I, with_labels=True, node_color='tan', node_size=1500, edge_cmap=plt.cm.Greys, pos = pos)
plt.show()

NameError: ignored

In [None]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="released in"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='silver', node_size=1500, edge_cmap=plt.cm.Greys, pos = pos)
plt.show()

NameError: ignored