**Modules**

In [1]:
import pandas as pd
import numpy as np

**Reading all books data**

In [2]:
# Only the identifier and the title are needed here, so restrict the read
# to those two columns instead of loading the whole processed books file.
all_books = pd.read_parquet(
    '../../Datasets/Processed/books_SE_v5.parquet',
    columns=["book_id", "title_without_series"],
)

In [3]:
all_books.shape

(2113033, 2)

In [4]:
all_books.head()

Unnamed: 0,book_id,title_without_series
0,5333265,W.C. Fields: A Life on Film
1,1333909,Good Harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,6066819,Best Friends Forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...


**Reading genres data**

In [5]:
# Load the genre table (one `genres` entry per `book_id`).
book_genres = pd.read_parquet(
    "../../Datasets/Processed/books_with_genre_RE_v1.parquet"
)

In [6]:
book_genres.shape

(1951142, 2)

In [7]:
book_genres.head()

Unnamed: 0,book_id,genres
0,5333265,"[history, historical fiction, biography]"
1,1333909,"[fiction, history, historical fiction, biography]"
2,7327624,"[fantasy, paranormal, fiction, mystery, thrill..."
3,6066819,"[fiction, romance, mystery, thriller, crime]"
4,287140,[non-fiction]


**Left join of all books with their genres**

In [8]:
# Left join: every row of `all_books` is kept; books without a match in
# `book_genres` end up with a null `genres` value.
all_book_genres = all_books.merge(book_genres, on="book_id", how="left")

In [9]:
all_book_genres.shape

(2113033, 3)

In [10]:
all_book_genres.head()

Unnamed: 0,book_id,title_without_series,genres
0,5333265,W.C. Fields: A Life on Film,"[history, historical fiction, biography]"
1,1333909,Good Harbor,"[fiction, history, historical fiction, biography]"
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[fantasy, paranormal, fiction, mystery, thrill..."
3,6066819,Best Friends Forever,"[fiction, romance, mystery, thriller, crime]"
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,[non-fiction]


In [11]:
all_book_genres.isnull().sum()

book_id                      0
title_without_series         0
genres                  332756
dtype: int64

**Enumerated genres list**

In [12]:
# Genre vocabulary. The position in this list determines the 1-based id each
# genre receives when the list is enumerated, so the order must not change.
genres = [
    'unknown',  # first entry, so it is enumerated as id 1
    'biography', 'children', 'comics', 'crime',
    'fantasy', 'fiction', 'graphic', 'historical fiction',
    'history', 'mystery', 'non-fiction', 'paranormal',
    'poetry', 'romance', 'thriller', 'young-adult',
]

In [13]:
# Pair every genre with its 1-based id: [(1, 'unknown'), (2, 'biography'), ...]
enumerated_genres = [
    (genre_id, genre_name) for genre_id, genre_name in enumerate(genres, 1)
]

In [14]:
enumerated_genres

[(1, 'unknown'),
 (2, 'biography'),
 (3, 'children'),
 (4, 'comics'),
 (5, 'crime'),
 (6, 'fantasy'),
 (7, 'fiction'),
 (8, 'graphic'),
 (9, 'historical fiction'),
 (10, 'history'),
 (11, 'mystery'),
 (12, 'non-fiction'),
 (13, 'paranormal'),
 (14, 'poetry'),
 (15, 'romance'),
 (16, 'thriller'),
 (17, 'young-adult')]

### Preparing final data by mapping dataframes

**Adding a separate 1-based `database_index` column**

In [15]:
# `database_index` is the row position shifted by one (row label 0 -> 1, ...).
all_book_genres = all_book_genres.assign(
    database_index=all_book_genres.index + 1
)

In [16]:
all_book_genres.head()

Unnamed: 0,book_id,title_without_series,genres,database_index
0,5333265,W.C. Fields: A Life on Film,"[history, historical fiction, biography]",1
1,1333909,Good Harbor,"[fiction, history, historical fiction, biography]",2
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[fantasy, paranormal, fiction, mystery, thrill...",3
3,6066819,Best Friends Forever,"[fiction, romance, mystery, thriller, crime]",4
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,[non-fiction],5


**Preparing mapping dictionary**

In [17]:
# Invert the (id, genre) pairs into a genre -> id lookup table.
genre_map_dict = {
    genre_name: genre_id for genre_id, genre_name in enumerated_genres
}

In [18]:
genre_map_dict

{'unknown': 1,
 'biography': 2,
 'children': 3,
 'comics': 4,
 'crime': 5,
 'fantasy': 6,
 'fiction': 7,
 'graphic': 8,
 'historical fiction': 9,
 'history': 10,
 'mystery': 11,
 'non-fiction': 12,
 'paranormal': 13,
 'poetry': 14,
 'romance': 15,
 'thriller': 16,
 'young-adult': 17}

**Datatype of `genres` column values**

In [19]:
type(all_book_genres["genres"][0])

numpy.ndarray

**Performing the mapping**

In [53]:
def map_genre(genre_list, mapping=None, unknown_label="unknown"):
    """Translate one book's genre labels into their integer ids.

    Parameters
    ----------
    genre_list : numpy.ndarray, list, tuple, or any other value
        Genre labels for a single book. Any value that is not a sequence of
        labels (for example the NaN left behind by the left join) is treated
        as "no genre information".
    mapping : dict, optional
        Label -> id lookup. Defaults to the notebook-level ``genre_map_dict``.
    unknown_label : str, optional
        Key in ``mapping`` whose id stands for a missing or unrecognised genre.

    Returns
    -------
    list of int
        Ids in the same order as the input labels. Labels absent from
        ``mapping`` become the unknown id instead of ``None``; a missing or
        empty input yields ``[unknown_id]``.
    """
    if mapping is None:
        mapping = genre_map_dict
    unknown_id = mapping[unknown_label]

    # Anything that is not a sequence of labels (NaN, None, ...) means the
    # book has no genre information at all.
    if not isinstance(genre_list, (np.ndarray, list, tuple)):
        return [unknown_id]

    # Fall back to the unknown id so the result never contains None.
    mapped_genre_list = [mapping.get(label, unknown_id) for label in genre_list]

    # An empty label sequence is equivalent to "no genre information".
    return mapped_genre_list or [unknown_id]

In [54]:
# Convert each book's genre labels to ids; rows without genres are handled
# inside `map_genre`.
genre_label_column = all_book_genres["genres"]
all_book_genres["genres_mapped"] = genre_label_column.map(map_genre)

In [55]:
all_book_genres.head()

Unnamed: 0,book_id,title_without_series,genres,database_index,genres_mapped
0,5333265,W.C. Fields: A Life on Film,"[history, historical fiction, biography]",1,"[10, 9, 2]"
1,1333909,Good Harbor,"[fiction, history, historical fiction, biography]",2,"[7, 10, 9, 2]"
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[fantasy, paranormal, fiction, mystery, thrill...",3,"[6, 13, 7, 11, 16, 5, 14]"
3,6066819,Best Friends Forever,"[fiction, romance, mystery, thriller, crime]",4,"[7, 15, 11, 16, 5]"
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,[non-fiction],5,[12]


In [56]:
all_book_genres.isnull().sum()

book_id                      0
title_without_series         0
genres                  332756
database_index               0
genres_mapped                0
dtype: int64

**Exporting the dataframe**

In [57]:
# Write the full frame (book_id, title_without_series, genres, database_index,
# genres_mapped) to a parquet file in the notebook's working directory.
all_book_genres.to_parquet("./all_books_with_genres.parquet")