In [1]:
# module imports

from configparser import ConfigParser
from mysql.connector import MySQLConnection,Error
from datetime import datetime

import gzip
import json

In [2]:
# reading database credentials from config.ini file

def read(filename='config.ini',section='mysql'):
    """Load one section of an INI file as a plain dictionary.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned.

    Returns:
        dict mapping every option name of ``section`` to its string value.

    Raises:
        FileNotFoundError: if ``filename`` could not be read at all.
        Exception: if the file was read but contains no ``section``.
    """
    parser = ConfigParser()

    # ConfigParser.read() skips unreadable files without complaint and returns
    # the list of files it actually parsed; an empty list means nothing was read.
    if not parser.read(filename):
        raise FileNotFoundError(f'config file {filename} not found or unreadable')

    if not parser.has_section(section):
        raise Exception(f'{section} not found in file {filename}')

    # items() yields (option, value) pairs, which dict() consumes directly
    return dict(parser.items(section))


# show the loaded settings, masking the password so the secret is not stored in the notebook output
print({key: ('***' if key == 'password' else value)
       for key, value in read(filename="config.ini", section="mysql").items()})

{'host': '127.0.0.1', 'port': '99966', 'database': 'gr_search_engine', 'user': 'root', 'password': '***'}


In [3]:
# connecting with MySQL/MariaDB database server and getting the connection and cursor object

def connect(creds):
    """Open a MySQL/MariaDB connection and a buffered cursor on it.

    Args:
        creds: Mapping of keyword arguments forwarded to ``MySQLConnection``.

    Returns:
        Tuple ``(con, cus)``. ``con`` is ``None`` when creating the connection
        raised a connector ``Error``; ``cus`` is ``None`` whenever no cursor
        was created (connection error or ``is_connected()`` returned False).
    """
    con = None
    # cus must be bound up front: on a failed connection the cursor is never
    # created, and returning an unbound name would raise UnboundLocalError.
    cus = None
    try:
        print('Connecting to MySQL database...')
        con = MySQLConnection(**creds)

        if con.is_connected():
            print('Connection established')
            cus = con.cursor(buffered=True)
        else:
            print('Connection failed')

    except Error as e:
        # connector errors are reported, not raised; the caller gets None values
        print(e)

    # Return after the try statement rather than from a `finally` clause:
    # a return inside `finally` discards any in-flight exception, including
    # KeyboardInterrupt and errors unrelated to the connector.
    return con, cus
    
# open the notebook-wide connection (cn) and cursor (cs) from the [mysql] section of config.ini
cn,cs=connect(creds=read(filename="config.ini",section="mysql"))

Connecting to MySQL database...
Connection established


In [4]:
# required columns

# names of the required columns, written as one whitespace-separated string
# and split into a list of strings
required_columns = """
    book_id gr_book_id title mod_title
    ratings_count average_rating
    link url image_url
    publication_day publication_month publication_year
    num_pages isbn isbn13
    description publisher
""".split()

In [9]:
# function that parses one JSON record into a Python dictionary

def parse_fields(line):
    """Turn one JSON record into a dictionary of book fields.

    Args:
        line: A single JSON document (anything ``json.loads`` accepts).

    Returns:
        dict whose first key is ``book_id`` (always ``None``) followed by the
        selected record fields in a fixed order. The publication date parts,
        ``num_pages``, ``isbn`` and ``isbn13`` are converted with ``int``;
        all other fields are copied unchanged.

    Raises:
        KeyError: if the record lacks one of the selected fields.
    """
    record = json.loads(line)

    # fields passed through int(); everything else is copied as-is
    integer_fields = {
        'publication_day', 'publication_month', 'publication_year',
        'num_pages', 'isbn', 'isbn13',
    }

    # output order of the keys that follow book_id
    field_order = (
        'gr_book_id', 'title', 'mod_title', 'ratings_count', 'average_rating',
        'link', 'url', 'image_url',
        'publication_day', 'publication_month', 'publication_year',
        'num_pages', 'isbn', 'isbn13',
        'description', 'publisher',
    )

    parsed = {"book_id": None}
    for field in field_order:
        value = record[field]
        parsed[field] = int(value) if field in integer_fields else value

    return parsed

In [12]:
# read only the first line (one JSON record, returned as bytes) from the
# gzip-compressed dataset file; nothing after that line is read in this cell

with gzip.open("../../Datasets/Processed/SE/books_ratingcount_gt15_p2.json.gz") as f:
    line = f.readline()

# last expression of the cell: display the raw record
line

b'{"isbn":0.0,"average_rating":4.03,"similar_books":["19997","828466","1569323","425389","1176674","262740","3743837","880461","2292726","1883810","1808197","625150","1988046","390170","2620131","383106","1597281"],"description":"Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.","link":"https:\\/\\/www.goodreads.com\\/book\\/show\\/7327624-the-unschooled-wizard","authors":[{"author_id":"10333","role":""}],"publisher":"Nelson Doubleday, Inc.","num_pages":600.0,"publication_day":0.0,"publication_month":0.0,"publication_year":1987.0,"isbn13":0.0,"url":"https:\\/\\/www.goodreads.com\\/book\\/show\\/7327624-the-unschooled-wizard","image_url":"https:\\/\\/images.gr-assets.com\\/books\\/1304100136m\\/7327624.jpg","gr_book_id":7327624,"ratings_count":140,"title":"The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)","mod_title":"the unschooled wizard sun wolf and starhawk 12"}\r\n'

In [15]:
# parse the sample JSON record into a dictionary, keep it as test_record and display it

test_record = parse_fields(line)
test_record

{'book_id': None,
 'gr_book_id': 7327624,
 'title': 'The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)',
 'mod_title': 'the unschooled wizard sun wolf and starhawk 12',
 'ratings_count': 140,
 'average_rating': 4.03,
 'link': 'https://www.goodreads.com/book/show/7327624-the-unschooled-wizard',
 'url': 'https://www.goodreads.com/book/show/7327624-the-unschooled-wizard',
 'image_url': 'https://images.gr-assets.com/books/1304100136m/7327624.jpg',
 'publication_day': 0,
 'publication_month': 0,
 'publication_year': 1987,
 'num_pages': 600,
 'isbn': 0,
 'isbn13': 0,
 'description': 'Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.',
 'publisher': 'Nelson Doubleday, Inc.'}

In [5]:
# SQL insert query => insert one record at a time

# the insert function takes the following parameters:
# the raw, untransformed record as a JSON line (it is parsed into a dictionary inside the function)
# the connection and cursor objects

# name of the table
table = "book"

def insert_one(record_json, table=table, cn=cn, cs=cs):
    """Parse one raw JSON record, insert it as a single row and commit.

    Args:
        record_json: One raw JSON record; it is handed to ``parse_fields``.
        table: Name of the target table. It is interpolated into the SQL
            text as-is, so it must be a trusted identifier.
        cn: Connection object; used to commit the insert.
        cs: Cursor object; used to execute the insert.

    Raises:
        Exception: if executing or committing raises a connector ``Error``;
            the original error is attached as ``__cause__``.
    """
    # columns filled from the parsed record
    book_columns = (
        "book_id",
        "gr_book_id",
        "title",
        "mod_title",
        "ratings_count",
        "average_rating",
        "link",
        "url",
        "image_url",
        "publication_day",
        "publication_month",
        "publication_year",
        "num_pages",
        "isbn",
        "isbn13",
        "description",
        "publisher",
    )
    # the 5 standard columns appended to every row
    standard_columns = (
        "status",
        "created_id",
        "created_dtm",
        "modified_id",
        "modified_dtm",
    )
    all_columns = book_columns + standard_columns

    # derive both lists from the same tuple so columns and placeholders cannot drift apart
    columns_string = ", ".join(all_columns)
    place_holders = ", ".join(["%s"] * len(all_columns))

    record_dict = parse_fields(record_json)

    # pick the values by column name instead of relying on the dict's key order
    book_values = [record_dict[column] for column in book_columns]

    # adding the values of the last 5 standard columns; one timestamp is shared
    # so created_dtm and modified_dtm are identical for a new row
    now = datetime.now()
    data_tuple = tuple(book_values + [1, 1, now, 1, now])

    # SQL query to insert the data
    sql_query = f"""
    INSERT INTO {table} ({columns_string})
    VALUES ({place_holders});
    """

    # trying to execute the query
    try:
        cs.execute(sql_query, data_tuple)
        cn.commit()
    # re-raising as a generic Exception, chained to the connector error
    except Error as e:
        raise Exception(f"{e}") from e

In [19]:
# testing the data insert function

# insert_one(line)

In [10]:
# we will go through all the books from the dataset and insert them one after the other
# runtime 40 m

books = []  # nothing in this cell appends to this list

with gzip.open("../../Datasets/Processed/SE/books_ratingcount_gt15_p2.json.gz") as f:
    # iterating the file object yields one line at a time and stops at the end of the file
    for line in f:
        # skip blank lines (e.g. a trailing newline), which are not valid JSON records
        if not line.strip():
            continue

        # insert function to upload the data into database
        insert_one(line)