In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.metrics.pairwise import cosine_similarity

from configparser import ConfigParser
from mysql.connector import MySQLConnection,Error

import gzip
import json

In [2]:
# Load the two persisted artifacts that are later passed to search() as its
# `vectorizer` and `tfidf_matrix` arguments.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# files that come from a trusted source.
vectorizer = joblib.load("../Memory/vectorizer_uni.pkl")
tfidf = joblib.load("../Memory/tfidf_matrix_uni.pkl")

In [3]:
def search(query,vectorizer,tfidf_matrix,top_n=30,threshold=0.5):
    """Return the row positions of the entries most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : object
        Anything exposing ``transform(list_of_str)``; its output is handed to
        ``cosine_similarity`` together with ``tfidf_matrix``.
    tfidf_matrix : array-like
        Matrix the query vector is compared against, one row per entry.
    top_n : int, optional
        Maximum number of candidates considered (default 30).
    threshold : float, optional
        Minimum rounded cosine similarity a candidate needs (default 0.5).

    Returns
    -------
    numpy.ndarray or str
        0-based row positions ordered from most to least similar, or the
        literal string ``"Noting Relevant Found"`` when no candidate reaches
        ``threshold``. The spelling of that sentinel is kept unchanged on
        purpose because callers may compare against it.
    """
    # Normalise the query: lower-case, keep only ASCII letters, digits and
    # spaces, then collapse runs of whitespace. Raw strings avoid the invalid
    # "\s" escape-sequence warning of the non-raw literal.
    cleaned = re.sub(r"[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned)

    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten().round(5)

    # argsort is ascending, so reverse it and keep the best `top_n` positions.
    indices = np.argsort(similarity)[::-1][:top_n]

    top_similarities = similarity[indices]
    above_threshold = np.flatnonzero(top_similarities >= threshold)

    if above_threshold.size < 1:
        return "Noting Relevant Found"

    # flatnonzero returns ascending positions, so the last one is the furthest
    # candidate that still passes; keep everything up to and including it.
    return indices[:above_threshold[-1] + 1]

In [4]:
# reading database credentials from config.ini file

def read(filename='config.ini',section='mysql'):
    """Return the options of one section of an INI file as a dict.

    Parameters
    ----------
    filename : str
        Path of the INI file to parse.
    section : str
        Name of the section whose options are returned.

    Returns
    -------
    dict
        Option name -> value (both strings, as produced by ConfigParser).

    Raises
    ------
    FileNotFoundError
        If ``filename`` could not be read at all.
    Exception
        If the file was read but does not contain ``section``.
    """
    parser=ConfigParser()

    # ConfigParser.read() silently skips unreadable files and returns the list
    # of files it actually parsed; without this check a missing file would be
    # reported as a missing section.
    if not parser.read(filename):
        raise FileNotFoundError(f'config file {filename} could not be read')

    if not parser.has_section(section):
        raise Exception(f'{section} not found in file {filename}')

    return dict(parser.items(section))


# Sanity-check the parsed settings, masking the password so the secret is not
# written into the saved notebook output.
print({key: ('***' if key == 'password' else value) for key, value in read(filename="config.ini",section="mysql").items()})

{'host': '127.0.0.1', 'port': '99966', 'database': 'gr_search_engine', 'user': 'root', 'password': '***'}


In [5]:
# connecting with MySQL/MariaDB database server and getting the connection and cursor object

def connect(creds):
    """Open a MySQL connection and a buffered cursor on it.

    Parameters
    ----------
    creds : dict
        Keyword arguments forwarded to ``MySQLConnection``.

    Returns
    -------
    tuple
        ``(connection, cursor)``. ``cursor`` is ``None`` when the connection
        is not established; both are ``None`` when ``MySQLConnection`` itself
        raised a connector ``Error`` (which is printed, not re-raised).
    """
    con=None
    # Initialise the cursor too, so the failure paths return None instead of
    # hitting an unbound local variable.
    cus=None
    try:
        print('Connecting to MySQL database...')
        con=MySQLConnection(**creds)

        if con.is_connected():
            print('Connection established')
            cus = con.cursor(buffered=True)
        else:
            print('Connection failed')

    except Error as e:
        print(e)

    # Returning after the try block (not from `finally`) lets exceptions other
    # than the connector's Error propagate instead of being swallowed.
    return con,cus
    
# Open the notebook-wide connection (cn) and cursor (cs); get_all_recommendation()
# below binds both as default arguments at definition time.
cn,cs=connect(creds=read(filename="config.ini",section="mysql"))

Connecting to MySQL database...
Connection established


In [6]:
# Example query; the bare last expression displays what search() returned.
# NOTE(review): search() returns a string instead of an array when nothing passes
# its similarity threshold, in which case the cells below would fail.
indices = search("goblet of fire",vectorizer,tfidf)
indices

array([1000185,  834624,  480231,  594212,  954525,  299461,  808331,
        970302,  655035, 1231519, 1136518,  239132,  993994,  708204,
        396219, 1243886, 1012102, 1185652,  382013, 1093048, 1017043,
       1175300,  778512, 1027396,   19069,  796287,  340717,  415608,
       1235398, 1005034], dtype=int64)

In [7]:
# array positions start at 0, but the database primary key (book_id) starts at 1,
# so we add 1 to each index
# and then join them into a comma-separated string for the SQL query

def indices_to_str(indices):
    """Render 0-based positions as a comma-separated string of 1-based ids.

    Each element of ``indices`` is incremented by one and converted with
    ``str``; the pieces are joined with commas (no spaces, no trailing
    comma). An empty input yields an empty string.
    """
    # join() puts commas only between items, so nothing has to be trimmed
    shifted = (str(position + 1) for position in indices)
    return ",".join(shifted)

# Show the raw 0-based positions next to the 1-based id string built from them.
print(indices)
print(indices_to_str(indices))

[1000185  834624  480231  594212  954525  299461  808331  970302  655035
 1231519 1136518  239132  993994  708204  396219 1243886 1012102 1185652
  382013 1093048 1017043 1175300  778512 1027396   19069  796287  340717
  415608 1235398 1005034]
1000186,834625,480232,594213,954526,299462,808332,970303,655036,1231520,1136519,239133,993995,708205,396220,1243887,1012103,1185653,382014,1093049,1017044,1175301,778513,1027397,19070,796288,340718,415609,1235399,1005035


In [8]:
# SQL get query => fetch all the records in a single query by book_id
# the indices must be passed to the function; the other parameters have defaults

# name of the table queried by get_all_recommendation(); it is bound as that
# function's default `table` argument when the function is defined
table = "book"

def get_all_recommendation(indices, table=table, cn=cn, cs=cs):
    """Execute one SELECT on ``cs`` that fetches the rows for ``indices``.

    The function only executes the query; the caller reads the rows from the
    cursor afterwards (e.g. ``cs.fetchall()``). Rows come back in the same
    order as ``indices`` thanks to ``ORDER BY FIELD``.

    Parameters
    ----------
    indices : iterable of int
        0-based row positions; each one is shifted by +1 to obtain the
        ``book_id`` to look up.
    table : str
        Table to query. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so it must be a trusted name.
    cn : connection
        Accepted for interface compatibility; not used by this function.
    cs : cursor
        Cursor the query is executed on.

    Raises
    ------
    ValueError
        If ``indices`` is a string (e.g. a "nothing found" message) or empty.
    Exception
        If the connector raises ``Error`` while executing the query.
    """
    columns_string = """
    book_id,
    gr_book_id,
    title,
    mod_title,
    ratings_count,
    average_rating,
    link,
    url,
    image_url,
    publication_day,
    publication_month,
    publication_year,
    num_pages,
    isbn,
    isbn13,
    description,
    publisher"""

    # A string would otherwise be iterated character by character and fail
    # with an obscure error further down.
    if isinstance(indices, str):
        raise ValueError(f"expected an iterable of row positions, got: {indices!r}")

    # Positions are 0-based while book_id is 1-based (per this notebook's
    # notes); int() also turns numpy integers into plain ints for the driver.
    book_ids = [int(index) + 1 for index in indices]
    if not book_ids:
        # "IN ()" is not valid SQL, so fail early with a clear message.
        raise ValueError("indices is empty; nothing to look up")

    # One %s placeholder per id; the values are bound by the driver instead of
    # being formatted into the SQL text.
    placeholders = ", ".join(["%s"] * len(book_ids))

    # maintain the arrangement of ids as given instead of ascending/descending order
    sql_query = f"""
    SELECT {columns_string} FROM {table}
    WHERE book_id IN ({placeholders})
    ORDER BY FIELD(book_id, {placeholders});
    """

    try:
        # The id list is needed twice: once for IN (...), once for FIELD(...).
        cs.execute(sql_query, tuple(book_ids + book_ids))
    except Error as e:
        # Keep the original exception type/message for callers, but chain the
        # connector error so the root cause stays visible in the traceback.
        raise Exception(f"{e}") from e

In [9]:
# Run the lookup on the shared cursor, then pull every row of the result set
# from that cursor; the bare last expression displays the fetched rows.
get_all_recommendation(indices=indices)

results = cs.fetchall()
results

[(1000186,
  17125270,
  'The Goblet',
  'the goblet',
  19,
  Decimal('3.160'),
  'https://www.goodreads.com/book/show/17125270-the-goblet',
  'https://www.goodreads.com/book/show/17125270-the-goblet',
  'https://images.gr-assets.com/books/1356112197m/17125270.jpg',
  27,
  11,
  2012,
  196,
  1621478734,
  9781621478737,
  "While on a family trip to Israel, Sarah hasn't discovered just any goblet; she has come into possession of something not of this world. Something of Jesus's very own. The trip had been rocky so far. Sarah's husband, Bill, was distant as ever, and her son, Alex, is disagreeable as usual. Once Sarah gets the goblet, though, everything starts to Change. All of a sudden, Alex is enthusiastic about family events. Sarah soon comes to learn that this is no coincidence. Life starts to change back home as well. Sarah and her children go to church the first Sunday at home-which is a feat in itself-only to discover that Sarah's singing voice is inspiring everyone. The churc