# Information Extraction with NLTK

Author : Shubhajit Basak <br/>

In [1]:
import nltk
import re
from statistics import mode

In [2]:
inputfile='football_players.txt' #Location of the file
buf=open(inputfile, encoding="UTF-8")
list_of_doc=buf.read().split('\n')

# Remove the Blank Lines from the Document Provided
list_of_doc = [doc for doc in list_of_doc if len(doc)>0]

# Task 1
A function that takes each document and performs:
1) sentence segmentation 2) tokenization 3) part-of-speech tagging

In [3]:
def ie_preprocess(document):
    #code goes here
    sentences = nltk.sent_tokenize(document)  #step 1 Sentence Segmentation
    sentences = [nltk.word_tokenize(sent) for sent in sentences]  #step 2 Word Tokenization
    pos_sentences = [nltk.pos_tag(sent) for sent in sentences]  # step 3 POS Tagging
    return pos_sentences


Run the following code to check the result for the first document (Ronaldo).

In [4]:
first_doc=list_of_doc[0]
pos_sent=ie_preprocess(first_doc)
pos_sent

[[('Cristiano', 'NNP'),
  ('Ronaldo', 'NNP'),
  ('dos', 'NN'),
  ('Santos', 'NNP'),
  ('Aveiro', 'NNP'),
  (',', ','),
  ('ComM', 'NNP'),
  (',', ','),
  ('GOIH', 'NNP'),
  ('(', '('),
  ('born', 'VBN'),
  ('5', 'CD'),
  ('February', 'NNP'),
  ('1985', 'CD'),
  (')', ')'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('Portuguese', 'JJ'),
  ('professional', 'JJ'),
  ('footballer', 'NN'),
  ('who', 'WP'),
  ('plays', 'VBZ'),
  ('for', 'IN'),
  ('Spanish', 'JJ'),
  ('club', 'NN'),
  ('Real', 'NNP'),
  ('Madrid', 'NNP'),
  ('and', 'CC'),
  ('the', 'DT'),
  ('Portugal', 'NNP'),
  ('national', 'JJ'),
  ('team', 'NN'),
  ('.', '.')],
 [('He', 'PRP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('forward', 'NN'),
  ('and', 'CC'),
  ('serves', 'NNS'),
  ('as', 'IN'),
  ('captain', 'NN'),
  ('for', 'IN'),
  ('Portugal', 'NNP'),
  ('.', '.')],
 [('In', 'IN'),
  ('2008', 'CD'),
  (',', ','),
  ('he', 'PRP'),
  ('won', 'VBD'),
  ('his', 'PRP$'),
  ('first', 'JJ'),
  ('Ballon', 'NNP'),
  ("d'Or", 'NN'),
  ('and', 'CC')

Expected output
 [...[('He', 'PRP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('forward', 'NN'),
  ('and', 'CC'),
  ('serves', 'NNS'),
  ('as', 'IN'),
  ('captain', 'NN'),
  ('for', 'IN'),
  ('Portugal', 'NNP'),
  ('.', '.')], ...]

# Task 2
A function that will take the list of tokens with POS tags for each sentence and returns the named entities (NE). 

In [5]:
def named_entity_finding(pos_sent):
    #Code goes here
    tree = nltk.ne_chunk(pos_sent, binary=True)
    #print(tree)
    named_entities = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NE':
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + leaf[0] + " "
            named_entities.append(entity.strip())
            #print(named_entities)
    return named_entities

pos_sents=ie_preprocess(list_of_doc[0])
print(named_entity_finding(pos_sents[0]))

['Cristiano Ronaldo', 'Santos Aveiro', 'ComM', 'GOIH', 'Portuguese', 'Spanish', 'Real Madrid', 'Portugal']


Expected output ['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

# Task 3

Use the named_entity_finding() function to extract all NEs for each document.

In [6]:
def NE_flat_list_fn(pos_sents): 
    NE=[]
    for pos_sent in pos_sents:
        #Single line code here. Call the funtion named_entity_finding(pos_sent) and 
        #append the result to the NE list
        NE.append(named_entity_finding(pos_sent))
    #Single line code here. Flatten the list of lists to the single list NE_flat_list
    NE_flat_list =[val for sublist in NE for val in sublist]
    return NE_flat_list

print(NE_flat_list_fn(pos_sents))

['Cristiano Ronaldo', 'Santos Aveiro', 'ComM', 'GOIH', 'Portuguese', 'Spanish', 'Real Madrid', 'Portugal', 'Portugal', 'Ballon', 'FIFA', 'FIFA Ballon', 'Ronaldo', 'Ronaldo', 'Portuguese', 'Portuguese Football Federation', 'European Golden Shoe', 'ESPN', 'Ronaldo', 'Manchester United', 'England', 'United', 'UEFA Champions League', 'FIFA Club', 'Ballon', 'FIFA', 'Manchester United', 'Madrid', 'Spain', 'Ronaldo', 'UEFA Champions League', 'Ronaldo', 'La Liga', 'Ronaldo', 'UEFA Champions League', 'Real Madrid', 'La Liga', 'Lionel Messi', 'Ronaldo', 'Portugal', 'Portugal', 'European', 'FIFA World Cups', 'Portuguese', 'Portugal', 'Portugal', 'Portugal', 'Ronaldo', 'UEFA European', 'European', 'Michel Platini', 'Ronaldo', 'Portugal', 'France', 'Silver Boot']


# Task 4

Functions to extract the name of the player, country of origin and date of birth as well as the following relations: team(s) of the player and position(s) of the player.

Reference: https://docs.python.org/3/howto/regex.html

In [7]:
# Defining search function which search for a text and return n words before the search term
# This will be used toget the Country of Origin
def search(text,n,search):
    '''Searches for text, and retrieves n words before the search term, which are retuned seperatly'''
    word = r"\W*([\w]+)"
    groups = re.search(r'{}\W*{}'.format(word*n,search), text).groups()
    return groups[:n]

In [8]:
def name_of_the_player(doc):
    #code goes here
    # Hint: Use the named_entity_finding() function
    # Remove all the words with in double quotes as Name can't have double quotes
    for phrase in re.findall('"([^"]*)"', doc):
        doc = doc.replace('"{}"'.format(phrase), '')
    # POS Tag the sentence
    pos_sents=ie_preprocess(doc)
    # Define Grammer Rule that contains consecutive Nouns (Proper Noun or Nouns or Foreign Words)
    grammar = "NP:{<NNP|NN|FW>+}"
    cp = nltk.RegexpParser(grammar)
    # Create the Tree with the Grammer Rule for the First Sentence only as we know it contains the name
    tree = cp.parse(pos_sents[0])  
    for subtree in tree.subtrees():
        if subtree.label() == 'NP': # Get the first match
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + leaf[0] + " "
            return entity.strip() # Return the first match

def country_of_origin(doc):
    #code goes here
    # Get the word that occurs before the phrase 'national team'
    country = search(doc,1,'national team')[0]
    return country

def date_of_birth(doc):
    #code goes here
    sentences = ie_preprocess(doc)
    for sent in sentences:
        # Define the Grammer Rule to get the DOB and search for the first occurance
        grammar = "CHUNK: {<VB.*|JJ> <CD> <NNP> <CD>}" 
        cp = nltk.RegexpParser(grammar)
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            # Search and Return the First Occurance
            if subtree.label() == 'CHUNK': #or 'VP'
                entity = ""
                for leaf in subtree.leaves()[1:4]:
                    entity = entity + leaf[0] + " "
                #print(entity)
                return entity.strip()

def team_of_the_player(doc):
    #code goes here
    team=[]
    sentences = ie_preprocess(doc)
    # Phrase a grammer tule to get the related sentence
    grammar = "TEAM:{<IN><JJ|NN|NNP>+}" 
    cp = nltk.RegexpParser(grammar)
    for sent in sentences:
        sent1 = nltk.ne_chunk(sent, binary = False)
        tree = cp.parse(sent)
        #print(tree)
        for subtree in tree.subtrees():
            # Filter Out the prepositions like behind, by
            if (subtree.label() == 'TEAM') & ('NNP' in str(subtree)) & (subtree.leaves()[0][0] not in ['behind','by']) :
                teamSen = ' '.join([w for w, t in subtree.leaves()])
                #print(teamSen)
                for sub in sent1.subtrees():
                    # Check if the Noun Phrases are Organization and Present in the Grammer Rule Sentence
                    if(sub.label() in ['ORGANIZATION']):
                        orgSen = ' '.join([w for w, t in sub.leaves()])
                        if((orgSen in teamSen) & (sub.leaves()[0][1] =='NNP')):
                            # Construct the Team Name
                            team.append(orgSen)
    # Append the Country of origin with the Club Team Names
    team.append(country_of_origin(doc))
    return list(set(team))

def position_of_the_player(doc):
    #code goes here
    # Check for common football positions
    position = re.findall("forward|midfield*|striker|winger", doc)
    pos1= list(set(position))
    #pos = ','.join(pos1)
    return pos1

Execute the below command to check your fuction


In [9]:
date_of_birth(list_of_doc[2])

'5 February 1992'

Expected output '5 February 1992'

In [10]:
name_of_the_player(list_of_doc[5])

'Zlatan Ibrahimović'

In [11]:
country_of_origin(list_of_doc[5])

'Sweden'

In [12]:
team_of_the_player(list_of_doc[4])

['UEFA', 'England', 'Manchester United']

In [13]:
position_of_the_player(list_of_doc[5])

['striker']

# Task 5

Create a function using the outputs from the previous functions to generate JSON-LD output as follows.

Reference: https://json-ld.org/primer/latest/

{ "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",

    "name": "",
    "born": "",
    "country": "",
    "position": [
        { "@id": "http://my-soccer-ontology.com/position",
            "type": ""
        }
     ]   
     "team": [
        { "@id": "http://my-soccer-ontology.com/team",
            "name": ""
        }   
     ]
}


In [14]:
from pyld import jsonld
import json
import unidecode

In [15]:
# arg1=name
# arg2=country
# arg3=born
# arg4=team
# arg5=position

def generate_jsonld(arg1,arg2,arg3,arg4,arg5):
    #Code goes here
    #Hint: arg1,arg2,..... are the arguments you will be passing to the function
    #json data
    doc = {
        

        "@id": "http://my-soccer-ontology.com/footballer/",
        "http://schema.org/name": unidecode.unidecode(arg1),
        "http://schema.org/country": arg2,    
        "http://schema.org/born": arg3,
        "http://schema.org/team":{"@id":"http://schema.org/team", "@type": list(map(unidecode.unidecode,arg4))},
        "http://schema.org/position":{"@id":"http://schema.org/position", "@type":arg5}
    }
    
    #Interpretation of doc (context)
    context = {

        "name": "http://schema.org/name",
        "country": "http://schema.org/country",
        "born": "http://schema.org/born",
        "team": {"@id": "http://schema.org/team", 
                 "@type": "@id"},
        "position":{"@id":"http://schema.org/position",
                    "@type":"@id"}

        }
    #On the basis of context compacting the document
    compacted = jsonld.compact(doc, context)
    compacted.pop("@context")
    return json.dumps(compacted, indent=2)


In [16]:
for i in range(0,len(list_of_doc)):

    #extracting player information like name,nationality,date of birth,team,position, and awards.
    arg1=name_of_the_player(list_of_doc[i])
    arg2=country_of_origin(list_of_doc[i])
    arg3=date_of_birth(list_of_doc[i])
    arg4=team_of_the_player(list_of_doc[i])
    arg5=position_of_the_player(list_of_doc[i])
    
    #printing the generated json-ld
    print(generate_jsonld(arg1,arg2,arg3,arg4,arg5))

{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "born": "5 February 1985",
  "country": "Portugal",
  "name": "Cristiano Ronaldo dos Santos Aveiro",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/forward"
  },
  "team": {
    "@id": "http://schema.org/team",
    "@type": [
      "/Portugal",
      "/Euro",
      "/Manchester United",
      "/Real Madrid"
    ]
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "born": "24 June 1987",
  "country": "Argentina",
  "name": "Lionel Andres Messi",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/forward"
  },
  "team": {
    "@id": "http://schema.org/team",
    "@type": [
      "/FC Barcelona",
      "/Argentina"
    ]
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "born": "5 February 1992",
  "country": "Brazil",
  "name": "Neymar da Silva Santos Junior",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/forward"
  },
  "team": {
  

# Task 6
Identify one other relation (besides team and player) and write a function to extract this. Also extend the JSON-LD output accordingly.

The other relation can be award won by the players

In [17]:
import re
def awards(doc):#this function returns a list containing (if at all) awards won by the player.
    list1=[]
    list2=[]
    #searches for texts containing awards
    awards_sent = re.findall(r'[^.]* award[^.]', doc)
    g = "FOUND:{(<VBD>|(<TO><VB>))<.*>*}" # Define the Grammer Rule
    for a in awards_sent:
        pos_tagged_words =ie_preprocess(a)
        chunkParser = nltk.RegexpParser(g)
        tree = chunkParser.parse(pos_tagged_words[0])
        for subtree in tree.subtrees():
            if (subtree.label() == 'FOUND'):
                award = nltk.RegexpParser("AWARD:{<NNP><.*>*}").parse(subtree) # Find all consecutive Noun Phrases
                for sub in award.subtrees():
                    if(sub.label() in ['AWARD']):
                        l = len(sub.leaves())
                        if(sub.leaves()[l-2][1] in ['NNP','NN']):
                            awardSen = ' '.join([w for w, t in sub.leaves()][:-1])
                            awardSen = awardSen.split('and')
                            list1.append(awardSen)
    list2 = [val for sublist in list1 for val in sublist]
    return list(set(list2))

In [18]:
awards(list_of_doc[0])

[' FIFA World Player of the Year', 'Golden Shoe', "Ballon d'Or "]

In [19]:
# arg1=name
# arg2=country
# arg3=born
# arg4=team
# arg5=position
# arg6=awards

def generate_jsonld_extended(arg1,arg2,arg3,arg4,arg5,arg6):
    #Code goes here
    #Hint: arg1,arg2,..... are the arguments you will be passing to the function
    #json data
    doc = {
        

        "@id": "http://my-soccer-ontology.com/footballer/",
        "http://schema.org/name": unidecode.unidecode(arg1),
        "http://schema.org/country": arg2,    
        "http://schema.org/born": arg3,
        "http://schema.org/team":{"@id":"http://schema.org/team", "@type": list(map(unidecode.unidecode,arg4))},
        "http://schema.org/position":{"@id":"http://schema.org/position", "@type":arg5},
        "http://schema.org/award":{"@id":"http://schema.org/award", "@type":arg6}
    }
    
    #Interpretation of doc (context)
    context = {

        "name": "http://schema.org/name",
        "country": "http://schema.org/country",
        "born": "http://schema.org/born",
        "team": {"@id": "http://schema.org/team", 
                 "@type": "@id"},
        "position":{"@id":"http://schema.org/position",
                    "@type":"@id"},
        "award":{"@id":"http://schema.org/award",
                    "@type":"@id"}

        }
    #On the basis of context compacting the document
    compacted = jsonld.compact(doc, context)
    compacted.pop("@context")
    return json.dumps(compacted, indent=2)


In [20]:
for i in range(0,len(list_of_doc)):

    #extracting player information like name,nationality,date of birth,team,position, and awards.
    arg1=name_of_the_player(list_of_doc[i])
    arg2=country_of_origin(list_of_doc[i])
    arg3=date_of_birth(list_of_doc[i])
    arg4=team_of_the_player(list_of_doc[i])
    arg5=position_of_the_player(list_of_doc[i])
    arg6=awards(list_of_doc[i])
    
    #printing the generated json-ld
    print(generate_jsonld_extended(arg1,arg2,arg3,arg4,arg5,arg6))

{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "award": {
    "@id": "http://schema.org/award",
    "@type": [
      "/ FIFA World Player of the Year",
      "/Golden Shoe",
      "/Ballon d'Or "
    ]
  },
  "born": "5 February 1985",
  "country": "Portugal",
  "name": "Cristiano Ronaldo dos Santos Aveiro",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/forward"
  },
  "team": {
    "@id": "http://schema.org/team",
    "@type": [
      "/Portugal",
      "/Euro",
      "/Manchester United",
      "/Real Madrid"
    ]
  }
}
{
  "@id": "http://my-soccer-ontology.com/footballer/",
  "award": {
    "@id": "http://schema.org/award",
    "@type": [
      "/ FIFA World Player of the Year",
      "/ FIFA World Player of the Year award",
      "/Ballon d'Or "
    ]
  },
  "born": "24 June 1987",
  "country": "Argentina",
  "name": "Lionel Andres Messi",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/forward"
  },
  "team": {
    "@id