**Modules**

In [7]:
import pandas as pd
import gzip
import json

**Reading data sample - json.gz**

In [13]:
# Peek at the first record only; the full file is parsed further down.
# The encoding is named explicitly (JSON is normally UTF-8) so that text decoding
# does not fall back to the platform's locale default.
with gzip.open("../1_DataSource/goodreads_book_genres_initial.json.gz", 'rt', encoding="utf-8") as f:
    line = f.readline()

line

'{"book_id": "5333265", "genres": {"history, historical fiction, biography": 1}}\n'

In [14]:
json.loads(line)

{'book_id': '5333265', 'genres': {'history, historical fiction, biography': 1}}

**Parsing function**

In [15]:
def parse_fields(json_line):
    """Decode one JSON line and keep only the fields this notebook uses.

    Parameters
    ----------
    json_line : str
        A single JSON object serialized as text.

    Returns
    -------
    dict
        ``{'book_id': ..., 'genres': ...}`` copied from the decoded object.

    Raises
    ------
    KeyError
        If the decoded object lacks ``'book_id'`` or ``'genres'``.
    """
    record = json.loads(json_line)
    wanted = ('book_id', 'genres')
    # Key order follows `wanted`, independent of the order in the source line.
    return {key: record[key] for key in wanted}

**Reading complete data**

In [81]:
# Parse every line of the dump into a {'book_id', 'genres'} dict.
book_genres = []

# The encoding is named explicitly (JSON is normally UTF-8) so that text decoding
# does not fall back to the platform's locale default.
with gzip.open("../1_DataSource/goodreads_book_genres_initial.json.gz", 'rt', encoding="utf-8") as f:
    # Iterating the handle yields one line at a time and stops at end-of-file,
    # so no manual readline()/break bookkeeping is needed.
    # A dedicated loop variable leaves the sample `line` from the earlier cell
    # untouched, so re-running `json.loads(line)` afterwards still works.
    for raw_line in f:
        book_genres.append(parse_fields(raw_line))

In [82]:
len(book_genres)

2360655

**Looking at genres**

In [83]:
# Show the raw genre mapping (combined label -> count) for records 10-14.
for record in book_genres[10:15]:
    print(record["genres"])

{'fiction': 19, 'history, historical fiction, biography': 38, 'mystery, thriller, crime': 38}
{'fantasy, paranormal': 11, 'mystery, thriller, crime': 12, 'fiction': 3, 'history, historical fiction, biography': 4}
{'non-fiction': 266, 'fiction': 33, 'young-adult': 3, 'poetry': 3, 'children': 3}
{}
{'fiction': 1760, 'mystery, thriller, crime': 31, 'history, historical fiction, biography': 12}


In [84]:
# Show only the genre labels (the dict keys) for the same five records.
for record in book_genres[10:15]:
    labels = [*record["genres"].keys()]
    print(labels)

['fiction', 'history, historical fiction, biography', 'mystery, thriller, crime']
['fantasy, paranormal', 'mystery, thriller, crime', 'fiction', 'history, historical fiction, biography']
['non-fiction', 'fiction', 'young-adult', 'poetry', 'children']
[]
['fiction', 'mystery, thriller, crime', 'history, historical fiction, biography']


- In the item at index 10 of `book_genres`, it looks as though there are 7 genres, but the dictionary actually holds only 3 entries.
  - This is because `'history, historical fiction, biography'` and `'mystery, thriller, crime'` are each stored as a single combined key.
  - We need to split these combined keys to rebuild the list of individual genres.

**Restructuring genres**

In [85]:
# Split each combined label (e.g. 'mystery, thriller, crime') into its parts
# and replace the record's genre container with the resulting flat list.
#
# Iterating the container directly yields the keys of the raw dict on the first
# run and the entries of the already-converted list on a re-run, so this cell can
# be executed more than once without raising AttributeError on `.keys()`.
# Whitespace around the parts is left untouched here.
for record in book_genres:
    record["genres"] = [
        part
        for label in record["genres"]
        for part in label.split(",")
    ]

**Genres structure updated - crosscheck**

In [86]:
# Re-inspect records 10-14 now that the combined labels have been split.
for record in book_genres[10:15]:
    print(record["genres"])

['fiction', 'history', ' historical fiction', ' biography', 'mystery', ' thriller', ' crime']
['fantasy', ' paranormal', 'mystery', ' thriller', ' crime', 'fiction', 'history', ' historical fiction', ' biography']
['non-fiction', 'fiction', 'young-adult', 'poetry', 'children']
[]
['fiction', 'mystery', ' thriller', ' crime', 'history', ' historical fiction', ' biography']


**Removing unwanted space from genres**

In [87]:
# Splitting on ',' left a leading space on some parts (' thriller'); trim the
# space characters from both ends of every genre name.
for record in book_genres:
    record["genres"] = [name.strip(" ") for name in record["genres"]]

**Unwanted space removed - Crosscheck**

In [88]:
# Re-inspect records 10-14 to confirm the stray spaces are gone.
for record in book_genres[10:15]:
    print(record["genres"])

['fiction', 'history', 'historical fiction', 'biography', 'mystery', 'thriller', 'crime']
['fantasy', 'paranormal', 'mystery', 'thriller', 'crime', 'fiction', 'history', 'historical fiction', 'biography']
['non-fiction', 'fiction', 'young-adult', 'poetry', 'children']
[]
['fiction', 'mystery', 'thriller', 'crime', 'history', 'historical fiction', 'biography']


**Extracting unique genres**

In [89]:
# Flatten every book's genre list into one long list.
# NOTE: despite its name, this list still contains duplicates (one entry per
# book/genre pair); the de-duplication happens in a later cell.
unique_genres_list = [
    genre
    for record in book_genres
    for genre in record["genres"]
]

In [90]:
len(unique_genres_list)

8126034

In [91]:
unique_genres_list[:10]

['history',
 'historical fiction',
 'biography',
 'fiction',
 'history',
 'historical fiction',
 'biography',
 'fantasy',
 'paranormal',
 'fiction']

In [94]:
# Collapse the flattened list to its distinct values, in alphabetical order.
unique_genres = sorted(set(unique_genres_list))

In [97]:
unique_genres

['biography',
 'children',
 'comics',
 'crime',
 'fantasy',
 'fiction',
 'graphic',
 'historical fiction',
 'history',
 'mystery',
 'non-fiction',
 'paranormal',
 'poetry',
 'romance',
 'thriller',
 'young-adult']

In [96]:
len(unique_genres)

16

**Create dataframe**

In [99]:
# Build the frame from the list of records: one row per dict, one column per key.
bookid_genre = pd.DataFrame(book_genres)

In [100]:
bookid_genre.shape

(2360655, 2)

In [101]:
bookid_genre.head()

Unnamed: 0,book_id,genres
0,5333265,"[history, historical fiction, biography]"
1,1333909,"[fiction, history, historical fiction, biography]"
2,7327624,"[fantasy, paranormal, fiction, mystery, thrill..."
3,6066819,"[fiction, romance, mystery, thriller, crime]"
4,287140,[non-fiction]


In [102]:
bookid_genre.tail()

Unnamed: 0,book_id,genres
2360650,3084038,"[non-fiction, history, historical fiction, bio..."
2360651,26168430,"[mystery, thriller, crime, children, fiction]"
2360652,2342551,"[poetry, children, young-adult, non-fiction]"
2360653,22017381,"[romance, mystery, thriller, crime]"
2360654,11419866,"[romance, fiction]"


In [106]:
# Check that a cell of the 'genres' column holds a real list by printing the
# first row's value together with its length.
first_row_genres = bookid_genre.loc[0, "genres"]
print(first_row_genres)
print(len(first_row_genres))

['history', 'historical fiction', 'biography']
3


**List of books without genres**

In [111]:
# Rows whose genre list is empty, i.e. books that carry no genre at all.
bookid_genre[bookid_genre['genres'].map(len) < 1]

Unnamed: 0,book_id,genres
13,28575155,[]
20,30227122,[]
26,287142,[]
27,16037548,[]
36,24994796,[]
...,...,...
2360615,35213446,[]
2360617,33640526,[]
2360625,31248182,[]
2360635,31624372,[]


**We will capture the index**

In [112]:
# Keep the row labels of the genre-less books so they can be dropped next.
lacks_genre = bookid_genre['genres'].map(len) < 1
no_genre_indices = bookid_genre.loc[lacks_genre].index
no_genre_indices

Index([     13,      20,      26,      27,      36,      39,      46,      47,
            53,      62,
       ...
       2360596, 2360597, 2360606, 2360613, 2360614, 2360615, 2360617, 2360625,
       2360635, 2360646],
      dtype='int64', length=409513)

**Dropping the records without genre**

In [113]:
# Remove the genre-less rows; this returns a new frame and leaves `bookid_genre`
# untouched. The surviving rows keep their original index labels.
books_with_genre = bookid_genre.drop(labels=no_genre_indices, axis="index")

In [114]:
books_with_genre.shape

(1951142, 2)

In [115]:
books_with_genre.head()

Unnamed: 0,book_id,genres
0,5333265,"[history, historical fiction, biography]"
1,1333909,"[fiction, history, historical fiction, biography]"
2,7327624,"[fantasy, paranormal, fiction, mystery, thrill..."
3,6066819,"[fiction, romance, mystery, thriller, crime]"
4,287140,[non-fiction]


In [116]:
books_with_genre.tail()

Unnamed: 0,book_id,genres
2360650,3084038,"[non-fiction, history, historical fiction, bio..."
2360651,26168430,"[mystery, thriller, crime, children, fiction]"
2360652,2342551,"[poetry, children, young-adult, non-fiction]"
2360653,22017381,"[romance, mystery, thriller, crime]"
2360654,11419866,"[romance, fiction]"


**Exporting the dataframe**

In [117]:
# Persist the cleaned frame; index=True writes the original row labels to the file.
books_with_genre.to_parquet(
    '../3_ProcessedData/books_with_genre_RE_v1.parquet',
    compression="snappy",
    index=True,
)