In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
# Load spaCy's small English pipeline once; get_entities() and get_relation()
# below both parse sentences through this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
# Load the candidate sentences from a local CSV export.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
SENTENCES_CSV = r"E:\Work\workbackups\llm\ayurveda_two.csv"
candidate_sentences = pd.read_csv(SENTENCES_CSV)
candidate_sentences.shape

(15, 2)

In [3]:
candidate_sentences

Unnamed: 0.1,Unnamed: 0,sentence
0,0,Ayurveda is an ancient healing system
1,1,It means the science of life
2,2,It helps people to stay healthy
3,3,"It uses natural methods like herbs, diet, yoga, and massage"
4,4,"Ayurveda considers the mind, body, and spirit of each person"
5,5,It says that everyone has a different type of energy
6,6,"There are three types of energy called vata, pitta, and kapha"
7,7,"They affect how a person thinks, feels, and acts"
8,8,They also affect the health and balance of the body
9,9,Ayurveda helps people to find out their type of energy and how to balance it


In [4]:
candidate_sentences.columns

Index(['Unnamed: 0', 'sentence'], dtype='object')

In [5]:
# Keep only the text column (as a one-column DataFrame); the CSV's saved
# index column is not needed downstream.
candidate_sentences = candidate_sentences[['sentence']]
candidate_sentences

Unnamed: 0,sentence
0,Ayurveda is an ancient healing system
1,It means the science of life
2,It helps people to stay healthy
3,"It uses natural methods like herbs, diet, yoga, and massage"
4,"Ayurveda considers the mind, body, and spirit of each person"
5,It says that everyone has a different type of energy
6,"There are three types of energy called vata, pitta, and kapha"
7,"They affect how a person thinks, feels, and acts"
8,They also affect the health and balance of the body
9,Ayurveda helps people to find out their type of energy and how to balance it


In [6]:
candidate_sentences.shape

(15, 1)

In [7]:
def get_entities(sent):
  """Extract a (subject, object) entity pair from one sentence.

  The sentence is parsed with the module-level spaCy pipeline `nlp` and its
  tokens are scanned left to right:

  * ``compound`` tokens are remembered as a prefix, and tokens whose
    dependency label ends in ``mod`` as a modifier (each is merged with the
    previous token when that one was a ``compound``);
  * a token whose dependency label contains ``subj`` becomes the subject,
    joined with the pending modifier/prefix, which are then cleared;
  * a token whose dependency label contains ``obj`` becomes the object,
    joined with the pending modifier/prefix.  These are NOT cleared, so a
    later object token overwrites an earlier one and may reuse them.

  Parameters
  ----------
  sent : str
      A single sentence.

  Returns
  -------
  list of str
      ``[subject, object]``; either element is ``""`` when no matching token
      was found.
  """
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous (non-punctuation) token
  prv_tok_text = ""   # text of previous (non-punctuation) token

  prefix = ""
  modifier = ""

  for tok in nlp(sent):
    # punctuation carries no entity information; skip it entirely
    if tok.dep_ == "punct":
      continue

    # compound word: remember it (merged with a preceding compound, if any)
    if tok.dep_ == "compound":
      prefix = tok.text
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text

    # modifier (amod, nummod, advmod, ...): remember it the same way
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text

    # subject: substring test instead of `find(...) == True`, which only
    # matched when "subj" happened to start at index 1 of the label
    if "subj" in tok.dep_:
      # join only the non-empty parts so no double spaces end up inside the entity
      ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      prefix = ""
      modifier = ""

    # object: same matching and joining rules as the subject
    if "obj" in tok.dep_:
      ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

    # remember this token for the compound checks on the next iteration
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [8]:
# Smoke test on a short sentence: the result is [subject, object], with the
# numeric modifier attached to the object.
get_entities("the film had 200 patents")

['film', '200  patents']

In [9]:
# One [subject, object] pair per sentence, in the same order as the DataFrame rows.
entity_pairs = [
  get_entities(sentence)
  for sentence in tqdm(candidate_sentences["sentence"])
]

100%|██████████| 15/15 [00:00<00:00, 139.44it/s]


In [10]:
def get_relation(sent):
  """Extract the relation (predicate) phrase of one sentence.

  The sentence is parsed with the module-level spaCy pipeline `nlp`, and a
  rule-based `Matcher` looks for the ROOT token optionally followed by a
  preposition, an agent and an adjective (in that order).

  Parameters
  ----------
  sent : str
      A single sentence.

  Returns
  -------
  str
      Text of the last match reported by the matcher, or ``""`` when nothing
      matches (e.g. an empty sentence), so callers can filter such rows out
      instead of hitting an IndexError.
  """
  doc = nlp(sent)

  # Matcher class object
  matcher = Matcher(nlp.vocab)

  # ROOT, then optional preposition / agent / adjective
  pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  matcher.add("matching_1", [pattern], on_match = None)

  matches = matcher(doc)
  if not matches:
    # no ROOT-anchored span found: report "no relation" rather than crash
    return ""

  # each match is (match_id, start, end); use the last one returned
  _, start, end = matches[-1]

  return doc[start:end].text

In [11]:
# Smoke test: for this sentence the extracted relation is just the verb.
get_relation("John completed the task")

'completed'

In [12]:
# One relation string per sentence, aligned with entity_pairs.
relations = list(map(get_relation, tqdm(candidate_sentences['sentence'])))

100%|██████████| 15/15 [00:00<00:00, 140.81it/s]


In [13]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [14]:
kg_df

Unnamed: 0,source,target,edge
0,Ayurveda,,is
1,It,life,means
2,people,,helps
3,It,natural herbs,uses natural
4,Ayurveda,person,considers
5,everyone,different energy,says
6,,three energy,are
7,how person,,affect
8,They,also body,affect
9,people,how it,helps


In [15]:
# Replace the pronoun subject "It"/"it" with the hard-coded topic 'Ayurveda'.
# NOTE(review): assumes every such pronoun refers to Ayurveda -- confirm per dataset.
# A single .loc assignment avoids the chained `kg_df['source'][i] = ...` form,
# which raises SettingWithCopyWarning and does nothing under Copy-on-Write.
kg_df.loc[kg_df['source'].isin(['It', 'it']), 'source'] = 'Ayurveda'

In [16]:
kg_df

Unnamed: 0,source,target,edge
0,Ayurveda,,is
1,Ayurveda,life,means
2,people,,helps
3,Ayurveda,natural herbs,uses natural
4,Ayurveda,person,considers
5,everyone,different energy,says
6,,three energy,are
7,how person,,affect
8,They,also body,affect
9,people,how it,helps


In [17]:
# Keep only complete triples: rows where source, target and edge are all
# non-empty.  A boolean mask (rather than dropping labels inside a
# range(len(...)) loop) keeps the original index labels, does not depend on
# the index being 0..n-1, and can be re-run safely.
has_all_parts = (kg_df[['source', 'target', 'edge']] != '').all(axis=1)
kg_df = kg_df[has_all_parts].copy()

kg_df

Unnamed: 0,source,target,edge
1,Ayurveda,life,means
3,Ayurveda,natural herbs,uses natural
4,Ayurveda,person,considers
5,everyone,different energy,says
8,They,also body,affect
9,people,how it,helps
10,Ayurveda,also diseases,helps
12,Ayurveda,beliefs,based on
14,Ayurveda,ayurveda,is important


In [18]:
# Write the filtered triples to a CSV in the current working directory.
# The DataFrame index is written too (to_csv default), so reading the file
# back with read_csv yields an extra "Unnamed: 0" column.
kg_df.to_csv("ayurveda_usable.csv")