In [1]:
import pandas as pd
import json
import gzip

**Streaming the data without decompressing the whole file into memory**
- Reading the first line or record

In [6]:
# Peek at the first record only; reading a single line avoids pulling the
# whole archive into memory.
with gzip.open("../../1_Source/books.json.gz", mode="rb") as gz_stream:
    line = gz_stream.readline()

# Bare last expression so the notebook displays the raw bytes of the record.
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

**JSON to dict**

In [3]:
# `line` is the raw bytes record read above; json.loads accepts it directly
# and returns a plain dict.
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

**List of all columns or keys**

In [4]:
# Parse the sample record once and reuse its keys for both printouts
# (iterating a dict yields its keys in insertion order).
record_keys = list(json.loads(line))
print(record_keys)

print(len(record_keys))

['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code', 'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin', 'similar_books', 'description', 'format', 'link', 'authors', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'book_id', 'ratings_count', 'work_id', 'title', 'title_without_series']
29


**We will consider the following columns for the Search Engine**
- `['isbn', 'country_code', 'language_code', 'average_rating', 'description', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'publication_year', 'url', 'image_url', 'book_id', 'ratings_count', 'title', 'title_without_series']`

In [6]:
# Columns kept for the search engine.
cols = [
    'isbn',
    'country_code',
    'language_code',
    'average_rating',
    'description',
    'link',
    'publisher',
    'num_pages',
    'publication_day',
    'isbn13',
    'publication_month',
    'publication_year',
    'url',
    'image_url',
    'book_id',
    'ratings_count',
    'title',
    'title_without_series',
]

print(cols)
print(len(cols))

# Keys present in the sample record but not selected, in alphabetical order.
remaining_cols = sorted(set(json.loads(line).keys()) - set(cols))

print(remaining_cols)
print(len(remaining_cols))

['isbn', 'country_code', 'language_code', 'average_rating', 'description', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'publication_year', 'url', 'image_url', 'book_id', 'ratings_count', 'title', 'title_without_series']
18
['asin', 'authors', 'edition_information', 'format', 'is_ebook', 'kindle_asin', 'popular_shelves', 'series', 'similar_books', 'text_reviews_count', 'work_id']
11


**Creating dictionary structure**

In [7]:
# Map every selected column to the source text of its lookup expression
# (presumably scaffolding for drafting the parser's return dict -- confirm).
temp_dict = {col: f"dict_data['{col}']" for col in cols}

temp_dict

{'isbn': "dict_data['isbn']",
 'country_code': "dict_data['country_code']",
 'language_code': "dict_data['language_code']",
 'average_rating': "dict_data['average_rating']",
 'description': "dict_data['description']",
 'link': "dict_data['link']",
 'publisher': "dict_data['publisher']",
 'num_pages': "dict_data['num_pages']",
 'publication_day': "dict_data['publication_day']",
 'isbn13': "dict_data['isbn13']",
 'publication_month': "dict_data['publication_month']",
 'publication_year': "dict_data['publication_year']",
 'url': "dict_data['url']",
 'image_url': "dict_data['image_url']",
 'book_id': "dict_data['book_id']",
 'ratings_count': "dict_data['ratings_count']",
 'title': "dict_data['title']",
 'title_without_series': "dict_data['title_without_series']"}

**Parser function from json to dictionary**

In [9]:
# Default projection: the 18 keys kept for the search engine, in output order.
_SEARCH_FIELDS = (
    'isbn',
    'country_code',
    'language_code',
    'average_rating',
    'description',
    'link',
    'publisher',
    'num_pages',
    'publication_day',
    'isbn13',
    'publication_month',
    'publication_year',
    'url',
    'image_url',
    'book_id',
    'ratings_count',
    'title',
    'title_without_series',
)


def parse_fields(json_line, fields=_SEARCH_FIELDS):
    """Parse one JSON record and keep only the requested fields.

    Parameters
    ----------
    json_line : str, bytes or bytearray
        A single JSON object, e.g. one line of the source file.
    fields : iterable of str, optional
        Keys to copy from the parsed record. Defaults to the 18
        search-engine columns, so existing one-argument calls are unchanged.

    Returns
    -------
    dict
        ``{field: value}`` in the order given by ``fields``. Values are
        copied as-is from the parsed record (no type conversion).

    Raises
    ------
    json.JSONDecodeError
        If ``json_line`` is not valid JSON.
    KeyError
        If a requested field is missing from the record.
    """
    dict_data = json.loads(json_line)

    # Plain indexing (not .get) so a record missing a field fails loudly.
    return {field: dict_data[field] for field in fields}

In [10]:
# Sanity check: run the parser on the sample `line` read at the top of the notebook.
parse_fields(line)

{'isbn': '0312853122',
 'country_code': 'US',
 'language_code': '',
 'average_rating': '4.00',
 'description': '',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'title': 'W.C. Fields: A Life on Film',
 'title_without_series': 'W.C. Fields: A Life on Film'}

**Reading the whole file in streaming fashion**

In [11]:
books = []

with gzip.open("../../1_Source/books.json.gz") as f:
    # Iterating the file object yields one line at a time and stops at EOF,
    # so no manual readline()/break is needed. A dedicated loop variable
    # keeps the sample `line` from the earlier cell intact; the readline loop
    # left it as b'', which broke re-running `parse_fields(line)`.
    for raw_record in f:
        books.append(parse_fields(raw_record))

**Total number of books**

In [12]:
# Number of parsed records (one dict was appended per line read).
len(books)

2360655

In [13]:
# Spot-check the first parsed record.
print(books[0])

{'isbn': '0312853122', 'country_code': 'US', 'language_code': '', 'average_rating': '4.00', 'description': '', 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields', 'publisher': "St. Martin's Press", 'num_pages': '256', 'publication_day': '1', 'isbn13': '9780312853129', 'publication_month': '9', 'publication_year': '1984', 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields', 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg', 'book_id': '5333265', 'ratings_count': '3', 'title': 'W.C. Fields: A Life on Film', 'title_without_series': 'W.C. Fields: A Life on Film'}


**Creating a DataFrame**

In [14]:
# `books` is a list of per-record dicts: one row per dict, one column per key.
items = pd.DataFrame(books)

**Rows and Columns of the DataFrame**

In [15]:
# DataFrame.shape is a (rows, columns) tuple.
n_rows, n_cols = items.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

Rows: 2360655
Columns: 18


**We don't need the `books` object anymore**

In [None]:
# Drop the name binding for the record list now that `items` has been built.
del books

**Exporting the dataframe as parquet**

In [20]:
# Persist the frame (index included) as a snappy-compressed parquet file.
items.to_parquet(
    '../../3_ProcessedData/books_SE_v1.parquet',
    compression="snappy",
    index=True,
)

**Exporting the DataFrame as gzip-compressed JSON Lines (one JSON object per line)**

In [21]:
with gzip.open('../../3_ProcessedData/books_SE_v1.json.gz', mode='wt', encoding='utf-8') as gz_out:
    # orient='records' with lines=True serialises each row as its own JSON
    # object on a separate line. The string is passed straight to write() so
    # it is not kept alive under a notebook-level name.
    gz_out.write(items.to_json(lines=True, orient='records'))