In [347]:
# Project Github link - https://github.com/somya-anchalia/book-discovery-app

In [348]:
# Import libraries from NLTK to Tokenize a sentence
import nltk 
from nltk.tokenize import word_tokenize

# Import libraries and punctuation data
import re
from string import punctuation

# Import libraries for Lemmatization
from nltk.stem import WordNetLemmatizer

# POS Tagging
from nltk.tag import pos_tag
import stanza
from pattern.text.en import singularize
from SPARQLWrapper import SPARQLWrapper2
from tabulate import tabulate

In [349]:
# Method to normalize and lemmatize

def normalize_and_lemmatize(text):
    """Strip punctuation, collapse whitespace and verb-lemmatize every token.

    Args:
        text: Raw sentence to clean up.

    Returns:
        A single string made of the NLTK tokens of the cleaned sentence,
        each lemmatized with the verb part of speech ('v') and joined by
        single spaces.
    """
    punctuation_pattern = f"[{re.escape(punctuation)}]"
    without_punctuation = re.sub(punctuation_pattern, "", text)
    # split() + join collapses every run of whitespace into one space and
    # trims both ends.
    squeezed = " ".join(without_punctuation.split())

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(squeezed):
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return " ".join(lemmas)

# Method to process text
def text_processing(text):
    """Pre-process a user query.

    Thin entry point that delegates entirely to normalize_and_lemmatize.

    Args:
        text: Raw query string, passed through unchanged.

    Returns:
        Whatever normalize_and_lemmatize(text) returns.
    """
    return normalize_and_lemmatize(text)

# Method for POS tagging
def pos_tags_creation(sent):
    """Tokenize a sentence with NLTK and tag each token with its part of speech.

    Args:
        sent: Sentence to tag.

    Returns:
        The result of nltk.pos_tag on the tokens, i.e. (word, tag) pairs.
    """
    words = nltk.word_tokenize(sent)
    return nltk.pos_tag(words)



In [350]:
# Bag of keywords

# Keyword groups for ontology properties. Within each group the first
# element is the name that the mapping helpers return; the remaining
# elements are alternative phrasings that resolve to it.
properties_list = [
    ['has_genre', 'has genres', 'with genre', 'genre'],
    ['written_by', 'written by', 'by author', 'author'],
    ['receive_award', 'receive award', 'received', 'receive', 'awarded', 'award'],
    ['first_published_country', 'first published country', 'first', 'published', 'country'],
    ['has_category', 'has category', 'category'],
]

# Keyword groups for ontology classes, following the same convention.
classes_list = [
    ['book', 'books'],
    ['author', 'authors', 'writer'],
    ['person', 'people'],
    ['award', 'awards'],
    ['country', 'countries'],
    ['category', 'categories'],
]

In [351]:
# Recognise named entities with Stanza

# Lazily-built Stanza pipeline shared by every call to fetch_named_entities.
_NER_PIPELINE = None


def fetch_named_entities(text):
    """Run Stanza named-entity recognition on a piece of text.

    The pipeline (tokenize + ner, English, no model download) is built on
    the first call and reused afterwards, so the models are not reloaded
    every time the function is invoked.

    Args:
        text: Text to analyse.

    Returns:
        The `entities` attribute of the Stanza document produced for `text`.
    """
    global _NER_PIPELINE
    if _NER_PIPELINE is None:
        _NER_PIPELINE = stanza.Pipeline(
            lang='en',
            processors="tokenize,ner",
            download_method=stanza.DownloadMethod.NONE,
        )
    return _NER_PIPELINE(text).entities

# Mapping the bag of keywords

def property_mappings(word, properties_list):
    """Resolve a keyword to the name of the property group containing it.

    Args:
        word: Keyword to look up.
        properties_list: List of keyword groups; the first element of each
            group is the name returned for any member of that group.

    Returns:
        The first element of the first group that contains `word`, or ""
        when no group contains it.
    """
    for keyword_group in properties_list:
        if word in keyword_group:
            return keyword_group[0]
    return ""

def classes_mappings(word, classes_list):
    """Resolve a keyword to the name of the class group containing it.

    Args:
        word: Keyword to look up.
        classes_list: List of keyword groups; the first element of each
            group is the name returned for any member of that group.

    Returns:
        The first element of the first group that contains `word`, or ""
        when no group contains it.
    """
    matching_names = (group[0] for group in classes_list if word in group)
    return next(matching_names, "")

In [352]:
# Queries
# `text` is the natural-language question fed to the rest of the notebook.
# Exactly one assignment should be active; the commented lines are the other
# sample questions whose result tables appear further down.

text = "all books with Thriller genre"
# text = "all books by author Chetan Bhagat"
# text = "all books awarded with Goodreads Choice Awards"
# text = "all books with published country as India"
# text = "all books Top Rated category"

In [353]:
# Process User Text
# Normalise/lemmatize the query, then POS-tag the cleaned sentence.
# The bare `tags` expression at the end displays the (word, tag) pairs.

processed_text = text_processing(text)
tags = pos_tags_creation(processed_text)
tags

[('all', 'DT'),
 ('book', 'NN'),
 ('with', 'IN'),
 ('Thriller', 'NNP'),
 ('genre', 'NN')]

In [354]:
# Map and create Classes and Properties

def map_class_or_props(entity, classes, properties, classes_list, properties_list):
    """Classify one chunked item as an ontology class or property.

    Items exposing a `label` attribute are only printed. Any other item is
    read as a (word, tag) pair and handled by tag:

    * "NNS": the singularized word is looked up as a class.
    * "VBP": the word is looked up as a property.
    * "NN":  the word is looked up as a property first and, only if that
      fails, its singularized form is looked up as a class.

    Matches are appended in place to `classes` (capitalized) or `properties`.

    Args:
        entity: One element of the chunked, POS-tagged sentence.
        classes: List collecting matched class names; mutated in place.
        properties: List collecting matched property names; mutated in place.
        classes_list: Keyword groups used for class lookup.
        properties_list: Keyword groups used for property lookup.

    Returns:
        The (classes, properties) pair that was passed in.
    """
    if hasattr(entity, "label"):
        print("check entity: ",entity)
        return classes, properties

    tag = entity[1]
    if tag not in ("NNS", "VBP", "NN"):
        return classes, properties

    word = entity[0]
    if tag == "NNS":
        class_name = classes_mappings(singularize(word), classes_list)
        if class_name != "":
            classes.append(class_name.capitalize())
        return classes, properties

    property_name = property_mappings(word, properties_list)
    if tag == "VBP":
        if property_name != "":
            properties.append(property_name)
        return classes, properties

    # tag == "NN": a property match takes precedence over a class match.
    class_name = classes_mappings(singularize(word), classes_list)
    if property_name != "":
        properties.append(property_name)
    elif class_name != "":
        classes.append(class_name.capitalize())
    return classes, properties

def create_classes_properties(tagged_chuncks, classes, properties, classes_list, properties_list):
    """Feed every chunked item through map_class_or_props.

    Args:
        tagged_chuncks: Iterable of chunked, POS-tagged items.
        classes: Initial list of class names.
        properties: Initial list of property names.
        classes_list: Keyword groups used for class lookup.
        properties_list: Keyword groups used for property lookup.

    Returns:
        The (classes, properties) pair after all items were processed.
    """
    found_classes, found_properties = classes, properties
    for chunk in tagged_chuncks:
        found_classes, found_properties = map_class_or_props(
            chunk, found_classes, found_properties, classes_list, properties_list
        )
    return found_classes, found_properties

In [355]:
# Create Individuals

def create_individuals(text, individuals, classes_list):
    """Record every named entity found in `text` with its mapped class.

    Each entity is printed, then its lower-cased, singularized type is looked
    up in `classes_list`. The entity text becomes a key of `individuals`
    whose value is the capitalized class name, or "not_mapped" when the
    lookup finds nothing.

    Args:
        text: Text in which to look for named entities.
        individuals: Dict to fill; mutated in place.
        classes_list: Keyword groups used for class lookup.

    Returns:
        The same `individuals` dict.
    """
    for entity in fetch_named_entities(text):
        print(entity)
        entity_class = classes_mappings(singularize(entity.type.lower()), classes_list)
        if entity_class == "":
            individuals[entity.text] = "not_mapped"
        else:
            individuals[entity.text] = entity_class.capitalize()
    return individuals

In [356]:
# Chunk the POS tags, then derive classes, properties and individuals
# from the query and print what was found.

# nltk.ne_chunk groups the tagged tokens; items that carry a label are only
# printed ("check entity: ...") by map_class_or_props, not mapped.
tagged_chuncks = nltk.ne_chunk(tags) 
classes = []
properties = []
individuals = {}

# CREATE CLASSES AND PROPERTIES
classes, properties = create_classes_properties(tagged_chuncks, classes, properties, classes_list, properties_list)

# CREATE INDIVIDUALS
# Uses the raw `text` (not the lemmatized sentence) for entity recognition.
individuals = create_individuals(text, individuals, classes_list)

print("Classes: ",classes)
print("Properties: ",properties)
print("Individuals: ",individuals)

2023-11-27 11:44:36 INFO: Loading these models for language: en (English):
| Processor | Package          |
--------------------------------
| tokenize  | combined         |
| ner       | ontonotes_charlm |

2023-11-27 11:44:36 INFO: Using device: cpu
2023-11-27 11:44:36 INFO: Loading: tokenize
2023-11-27 11:44:36 INFO: Loading: ner


check entity:  (PERSON Thriller/NNP)


2023-11-27 11:44:37 INFO: Done loading processors!


{
  "text": "Thriller",
  "type": "WORK_OF_ART",
  "start_char": 15,
  "end_char": 23
}
Classes:  ['Book']
Properties:  ['has_genre']
Individuals:  {'Thriller': 'not_mapped'}


In [357]:
# Create classes triple

def make_classes_triple(classes, individuals, class_triple):
    """Build the class part of the SPARQL WHERE clause.

    Args:
        classes: Recognised class names; only the first one is used.
        individuals: Recognised individuals; only its emptiness matters here.
        class_triple: Fallback text used when `classes` is empty.

    Returns:
        "{?y a book:<class>}" for the first class (or `class_triple` when
        there is none), followed by the ?x-to-?y path pattern whenever at
        least one individual exists.
    """
    if len(classes) > 0:
        triple = "{?y a book:" + classes[0] + "}"
    else:
        triple = class_triple

    if len(individuals) == 0:
        return triple
    return triple + "{?x (book:|!book:)|^(book:|!book:)* ?y}"

In [358]:
# Create Properties triple

def make_properties_triple(properties, prop_triple):
    """Build the property part of the SPARQL WHERE clause.

    Args:
        properties: Recognised property names; only the first one is used.
        prop_triple: Fallback text returned when no property was recognised.

    Returns:
        A pattern linking ?x and ?y through the first property in either
        direction, or `prop_triple` unchanged when `properties` is empty.
    """
    # A query may contain no property keyword at all; indexing an empty list
    # here would raise IndexError, so fall back to the caller's default.
    if not properties:
        return prop_triple
    prop_name = properties[0]
    return "{?x book:" + prop_name + "|^book:" + prop_name + " ?y}"

In [359]:
# Create Individuals triple

def make_individual_triple(properties, individuals, individual_triple):
    """Build the individual (name filter) part of the SPARQL WHERE clause.

    Only the first key of `individuals` is used. Its text is matched against
    book:name through a FILTER, and the leading pattern depends on whether
    the individual was mapped to a class:

    * mapped: "{?x a book:<class>}" precedes the name patterns;
    * "not_mapped" with no properties: "{?x a ?y}" precedes them;
    * "not_mapped" with properties: only the name patterns are emitted.

    Args:
        properties: Recognised property names; only emptiness matters here.
        individuals: Dict of entity text -> class name or "not_mapped".
        individual_triple: Fallback text returned when `individuals` is empty.

    Returns:
        The SPARQL fragment described above, or `individual_triple`
        unchanged when there is no individual.
    """
    if not individuals:
        return individual_triple

    name = next(iter(individuals))
    # The name lands inside a single-quoted SPARQL literal, so escape the
    # characters that would otherwise terminate or corrupt it (backslash
    # first, so the escapes added afterwards are not doubled).
    literal = (
        name.replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    name_patterns = "{?y book:name ?bname} {?x book:name ?name} FILTER(?name='" + literal + "')"

    mapped_class = individuals[name]
    if mapped_class != "not_mapped":
        return "{?x a book:" + mapped_class + "} " + name_patterns
    if not properties:
        return "{?x a ?y} " + name_patterns
    return name_patterns


In [360]:
# Create SPARQL Query

def make_sparql_query(class_triple, prop_triple, individual_triple):
    """Assemble the full SPARQL SELECT query from its three fragments.

    Args:
        class_triple: Class part of the WHERE clause.
        prop_triple: Property part of the WHERE clause.
        individual_triple: Individual/filter part of the WHERE clause.

    Returns:
        The query text: the `book:` PREFIX declaration, "SELECT *", and a
        WHERE block containing the three fragments concatenated in order.
    """
    # The header is kept verbatim (leading newline and indentation included)
    # because it is part of the emitted query text.
    header = """
                    PREFIX book:<http://www.book-discovery.com/ontologies#>
                    SELECT *
                    WHERE{
                    """
    where_body = "".join((class_triple, prop_triple, individual_triple))
    return header + where_body + "}"

In [361]:
# Build the SPARQL query from the extracted classes, properties and
# individuals, then run it against the local SPARQL endpoint.
sparql = SPARQLWrapper2("http://localhost:3030/BookDiscoveryAppQueries/query")

# Each fragment starts empty and is passed to its builder as the fallback.
class_triple = ""
prop_triple = ""
individual_triple = ""

# Create classes triple
class_triple = make_classes_triple(classes, individuals, class_triple)

# Create Properties triple
# NOTE(review): called unconditionally, so `properties` must not be empty
# here — confirm every supported query yields at least one property.
prop_triple = make_properties_triple(properties, prop_triple)

# Create Individuals triple
if len(individuals)>0:
    individual_triple = make_individual_triple(properties, individuals, individual_triple)


query = make_sparql_query(class_triple, prop_triple, individual_triple)
print(query)

# `results` (the query bindings) is what the table cells below render.
sparql.setQuery(query)
results = sparql.query().bindings


                    PREFIX book:<http://www.book-discovery.com/ontologies#>
                    SELECT *
                    WHERE{
                    {?y a book:Book}{?x (book:|!book:)|^(book:|!book:)* ?y}{?x book:has_genre|^book:has_genre ?y}{?y book:name ?bname} {?x book:name ?name} FILTER(?name='Thriller')}


In [167]:
# Function to create 2D array to show results in Tabular format

def get_query_results(table_columns, results):
    """Turn query bindings into a 2D list with a header row.

    Args:
        table_columns: Header row; used as the first row of the table.
        results: Iterable of bindings, each indexable by "bname", "name"
            and "y", where every entry exposes a `.value` attribute.

    Returns:
        A list whose first element is `table_columns` and whose remaining
        elements are [bname, name, y] value rows, one per binding.
    """
    value_rows = [
        [binding[key].value for key in ("bname", "name", "y")]
        for binding in results
    ]
    return [table_columns, *value_rows]

In [168]:
# Query1: All books with Thriller genre
# Renders the bindings currently held in `results` as a text table; the
# column labels below are display headers only.

table_columns = ['Book', 'Genre', 'Book URI']
table_values = get_query_results(table_columns, results)

print(tabulate(table_values, headers="firstrow"))

Book          Genre     Book URI
------------  --------  -----------------------------------------------------
Life Of Pi    Thriller  http://www.book-discovery.com/ontologies#life_of_pi
Sacred Games  Thriller  http://www.book-discovery.com/ontologies#sacred_games
400 days      Thriller  http://www.book-discovery.com/ontologies#400_days


In [97]:
# Query2: All books by author Chetan Bhagat
# NOTE(review): renders whatever is in `results`; presumably the query cells
# were re-run by hand with the matching `text` first — confirm, since this
# will not reproduce under Restart & Run All.

table_columns = ['Book', 'Author', 'Book URI']
table_values = get_query_results(table_columns, results)

print(tabulate(table_values, headers="firstrow"))

Book             Author         Book URI
---------------  -------------  --------------------------------------------------------
Revolution 2020  Chetan Bhagat  http://www.book-discovery.com/ontologies#revolution_2020
400 days         Chetan Bhagat  http://www.book-discovery.com/ontologies#400_days


In [109]:
# Query3: All books awarded with Goodreads Choice Awards
# NOTE(review): renders whatever is in `results`; presumably the query cells
# were re-run by hand with the matching `text` first — confirm, since this
# will not reproduce under Restart & Run All.

table_columns = ['Book', 'Award', 'Book URI']
table_values = get_query_results(table_columns, results)

print(tabulate(table_values, headers="firstrow"))

Book             Award                    Book URI
---------------  -----------------------  --------------------------------------------------------
A Promised Land  Goodreads Choice Awards  http://www.book-discovery.com/ontologies#a_promised_land
Revolution 2020  Goodreads Choice Awards  http://www.book-discovery.com/ontologies#revolution_2020
400 days         Goodreads Choice Awards  http://www.book-discovery.com/ontologies#400_days


In [121]:
# Query4: All books with published country as India
# NOTE(review): renders whatever is in `results`; presumably the query cells
# were re-run by hand with the matching `text` first — confirm, since this
# will not reproduce under Restart & Run All.

table_columns = ['Book', 'Country', 'Book URI']
table_values = get_query_results(table_columns, results)

print(tabulate(table_values, headers="firstrow"))

Book             Country    Book URI
---------------  ---------  --------------------------------------------------------
Revolution 2020  India      http://www.book-discovery.com/ontologies#revolution_2020
Sacred Games     India      http://www.book-discovery.com/ontologies#sacred_games
400 days         India      http://www.book-discovery.com/ontologies#400_days


In [133]:
# Query5: All books Top Rated category
# NOTE(review): renders whatever is in `results`; presumably the query cells
# were re-run by hand with the matching `text` first — confirm, since this
# will not reproduce under Restart & Run All.

table_columns = ['Book', 'Category', 'Book URI']
table_values = get_query_results(table_columns, results)

print(tabulate(table_values, headers="firstrow"))

Book             Category    Book URI
---------------  ----------  --------------------------------------------------------
Harry Potter     Top Rated   http://www.book-discovery.com/ontologies#harry_potter
Revolution 2020  Top Rated   http://www.book-discovery.com/ontologies#revolution_2020
400 days         Top Rated   http://www.book-discovery.com/ontologies#400_days


In [None]:
# Python code end #