In [1]:
import pandas as pd

In [2]:
#read in csv files
ratings = pd.read_csv('ratings.csv')
books = pd.read_csv('books.csv')
tags_xwalk = pd.read_csv('tags.csv')
book_tags = pd.read_csv('book_tags.csv')

In [3]:
#merge book titles and ids to the book tags dataframe, then merge in tag names
book_names = books[['book_id', 'goodreads_book_id', 'title']]
book_tags_w_names = pd.merge(book_tags, tags_xwalk, how='left', on=['tag_id'])
book_tags_w_names = pd.merge(book_tags_w_names, book_names, how='left', on=['goodreads_book_id'])

In [4]:
#sort the tags by how often they appear for each book
book_tag = book_tags_w_names[['book_id', 'title','tag_id', 'count']]
book_tag = book_tag.sort_values(by='count')
book_tag.head(25)

Unnamed: 0,book_id,title,tag_id,count
922053,1935,Kindle Paperwhite User's Guide,17246,-1
922054,1935,Kindle Paperwhite User's Guide,6552,-1
922055,1935,Kindle Paperwhite User's Guide,2272,-1
959611,7803,Kindle User's Guide,9221,-1
922052,1935,Kindle Paperwhite User's Guide,21619,-1
922051,1935,Kindle Paperwhite User's Guide,10197,-1
29176,6663,The Taste of Home Cookbook,10188,1
29190,6663,The Taste of Home Cookbook,25491,1
29171,6663,The Taste of Home Cookbook,5732,1
29172,6663,The Taste of Home Cookbook,21676,1


Removing books with negative tags:

The books with negative tags are just Kindle Paperwhite User's Guide, and Kindle User's Guide

In [5]:
book_tags_w_names = book_tags_w_names[book_tags_w_names['count'] > 0] 

## Genres 

We want to keep the most popular genre for each book. 
We will go through and find the most popular genre tags first.

In [6]:
book_tag = book_tags_w_names[['tag_id', 'tag_name','count']]
book_tag = book_tag.groupby(['tag_id', 'tag_name'], as_index = False).sum()
book_tag = book_tag.sort_values(by='count', ascending=False)
book_tag.head(50)

Unnamed: 0,tag_id,tag_name,count
30572,30574,to-read,140718761
8715,8717,currently-reading,7507958
11555,11557,favorites,4503173
11741,11743,fiction,3688819
11303,11305,fantasy,3548157
33112,33114,young-adult,1848306
7455,7457,classics,1756920
5206,5207,books-i-own,1317235
26136,26138,romance,1231926
22741,22743,owned,1224279


Most popular genres:
1. fiction, adult-fiction, general-fiction
2. fantasy, magic
3. young-adult, ya, teen 
4. classics, literature
5. romance
6. mystery, thriller, crime, suspense
7. non-fiction, nonfiction
8. memoir, biography
9. historical, history
10. historical-fiction 
11. science-fiction, sci-fi, sci-fi-fantasy
12. horror
13. childrens, children, children-s, kids, children-s-books 
14. graphic-novels, comics, graphic-novel
15. adventure
16. dystopian, dystopia
17. humor
18. chick-lit

In [7]:
#Create a genre column
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "fiction", 'genre'] = 'Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "adult-fiction", 'genre'] = 'Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "general-fiction", 'genre'] = 'Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "fantasy", 'genre'] = 'Fantasy'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "magic", 'genre'] = 'Fantasy'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "young-adult", 'genre'] = 'Young Adult'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "ya", 'genre'] = 'Young Adult'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "teen", 'genre'] = 'Young Adult'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "classics", 'genre'] = 'Classics'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "literature", 'genre'] = 'Classics'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "romance", 'genre'] = 'Romance'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "mystery", 'genre'] = 'Mystery'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "thriller", 'genre'] = 'Mystery'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "crime", 'genre'] = 'Mystery'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "suspense", 'genre'] = 'Mystery'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "non-fiction", 'genre'] = 'Non-fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "nonfiction", 'genre'] = 'Non-fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "memoir", 'genre'] = 'Biography'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "biography", 'genre'] = 'Biography'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "historical", 'genre'] = 'History'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "history", 'genre'] = 'History'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "historical-fiction", 'genre'] = 'Historical Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "science-ficiton", 'genre'] = 'Science Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "sci-fi", 'genre'] = 'Science Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "sci-fi-fantasy", 'genre'] = 'Science Fiction'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "horror", 'genre'] = 'Fantasy'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "childrens", 'genre'] = 'Childrens'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "children", 'genre'] = 'Childrens'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "children-s", 'genre'] = 'Childrens'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "kids", 'genre'] = 'Childrens'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "children-s-books", 'genre'] = 'Childrens'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "graphic-novels", 'genre'] = 'Graphic Novel'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "comics", 'genre'] = 'Graphic Novel'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "graphic-novel", 'genre'] = 'Graphic Novel'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "adventure", 'genre'] = 'Adventure'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "dystopian", 'genre'] = 'Dystopian'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "dystopia", 'genre'] = 'Dystopian'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "chick-lit", 'genre'] = 'Chick-lit'
book_tags_w_names.loc[book_tags_w_names['tag_name'] == "humor", 'genre'] = 'Humor'

book_tags_w_names.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name,book_id,title,genre
0,1,30574,167697,to-read,27,Harry Potter and the Half-Blood Prince (Harry ...,
1,1,11305,37174,fantasy,27,Harry Potter and the Half-Blood Prince (Harry ...,Fantasy
2,1,11557,34173,favorites,27,Harry Potter and the Half-Blood Prince (Harry ...,
3,1,8717,12986,currently-reading,27,Harry Potter and the Half-Blood Prince (Harry ...,
4,1,33114,12716,young-adult,27,Harry Potter and the Half-Blood Prince (Harry ...,Young Adult


In [8]:
#count number of unique books
count = book_tags_w_names.groupby('book_id').nunique()
count.shape

(10000, 7)

In [9]:
#now drop books that dont have genres
book_tags_w_names2 = book_tags_w_names[book_tags_w_names.genre.notnull()]
book_tags_w_names2.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name,book_id,title,genre
1,1,11305,37174,fantasy,27,Harry Potter and the Half-Blood Prince (Harry ...,Fantasy
4,1,33114,12716,young-adult,27,Harry Potter and the Half-Blood Prince (Harry ...,Young Adult
5,1,11743,9954,fiction,27,Harry Potter and the Half-Blood Prince (Harry ...,Fiction
9,1,32989,4364,ya,27,Harry Potter and the Half-Blood Prince (Harry ...,Young Adult
12,1,18886,3374,magic,27,Harry Potter and the Half-Blood Prince (Harry ...,Fantasy


We still have 10,000 books, meaning all of our books mapped to one of these genres

In [10]:
count = book_tags_w_names2.groupby('book_id').nunique()
count.shape

(10000, 7)

In [11]:
book_tags_w_names2 =book_tags_w_names2.groupby(['book_id','title','goodreads_book_id','genre'], as_index = False).sum()

In [12]:
book_tags_w_names2 = book_tags_w_names2.sort_values(['book_id', 'count'], ascending=[True, False])
book_tags_w_names2.head(50)

Unnamed: 0,book_id,title,goodreads_book_id,genre,tag_id,count
7,1,"The Hunger Games (The Hunger Games, #1)",2767052,Young Adult,95555,38567
1,1,"The Hunger Games (The Hunger Games, #1)",2767052,Dystopian,20123,24050
3,1,"The Hunger Games (The Hunger Games, #1)",2767052,Fiction,11743,13819
2,1,"The Hunger Games (The Hunger Games, #1)",2767052,Fantasy,11305,10836
6,1,"The Hunger Games (The Hunger Games, #1)",2767052,Science Fiction,53556,7330
5,1,"The Hunger Games (The Hunger Games, #1)",2767052,Romance,26138,3341
0,1,"The Hunger Games (The Hunger Games, #1)",2767052,Adventure,1691,3190
4,1,"The Hunger Games (The Hunger Games, #1)",2767052,Mystery,59434,1441
11,2,Harry Potter and the Sorcerer's Stone (Harry P...,3,Fantasy,30191,51780
15,2,Harry Potter and the Sorcerer's Stone (Harry P...,3,Young Adult,95555,20927


In [13]:
#take the top genre for each book
books_w_genres = book_tags_w_names2.groupby('book_id').first().reset_index()
books_w_genres.head(10)

Unnamed: 0,book_id,title,goodreads_book_id,genre,tag_id,count
0,1,"The Hunger Games (The Hunger Games, #1)",2767052,Young Adult,95555,38567
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,3,Fantasy,30191,51780
2,3,"Twilight (Twilight, #1)",41865,Young Adult,95555,24398
3,4,To Kill a Mockingbird,2657,Classics,25824,33573
4,5,The Great Gatsby,4671,Classics,25824,34758
5,6,The Fault in Our Stars,11870085,Young Adult,95555,26277
6,7,The Hobbit,5907,Fantasy,30191,42613
7,8,The Catcher in the Rye,5107,Classics,25824,24304
8,9,"Angels & Demons (Robert Langdon, #1)",960,Mystery,88890,12397
9,10,Pride and Prejudice,1885,Classics,25824,37947


In [14]:
books_w_genres.shape

(10000, 6)

In [15]:
books_w_genres['genre'].value_counts().to_dict()

{'Fiction': 1698,
 'Fantasy': 1449,
 'Non-fiction': 1311,
 'Mystery': 1165,
 'Young Adult': 1001,
 'Romance': 761,
 'Childrens': 571,
 'Classics': 538,
 'Historical Fiction': 375,
 'Graphic Novel': 365,
 'Science Fiction': 311,
 'History': 121,
 'Chick-lit': 119,
 'Biography': 105,
 'Dystopian': 67,
 'Humor': 39,
 'Adventure': 4}

In [16]:
books_w_genres[books_w_genres['genre']=="Adventure"]

Unnamed: 0,book_id,title,goodreads_book_id,genre,tag_id,count
3350,3351,"The Mysterious Island (Extraordinary Voyages, ...",32831,Adventure,1691,372
5763,5764,"Valhalla Rising (Dirk Pitt, #16)",198331,Adventure,1691,14
6307,6308,"Wolf Brother (Chronicles of Ancient Darkness, #1)",295305,Adventure,1691,150
8122,8123,Trust No One (The 39 Clues: Cahills vs. Vesper...,12107927,Adventure,1691,68


Dropping "Adventure" as a genre, as there are only 4 books in this category

In [17]:
books_w_genres = books_w_genres[books_w_genres['genre']!="Adventure"]
books_w_genres.shape

(9996, 6)

In [18]:
books_w_genres[books_w_genres['genre']=="Humor"]

Unnamed: 0,book_id,title,goodreads_book_id,genre,tag_id,count
742,743,"Lamb: The Gospel According to Biff, Christ's C...",28881,Humor,15048,1912
942,943,Holidays on Ice,4136,Humor,15048,1582
1095,1096,America (The Book): A Citizen's Guide to Democ...,706,Humor,15048,1583
1152,1153,"A Dirty Job (Grim Reaper, #1)",33456,Humor,15048,1281
1330,1331,Go the Fuck to Sleep,11192642,Humor,15048,901
1922,1923,Barrel Fever: Stories and Essays,4143,Humor,15048,816
2033,2034,"Bloodsucking Fiends (A Love Story, #1)",33454,Humor,15048,847
2578,2579,SantaLand Diaries,178357,Humor,15048,381
2695,2696,Put Me in the Zoo,413158,Humor,15048,19
2922,2923,"Practical Demonkeeping (Pine Cove, #1)",33457,Humor,15048,606


In [19]:
books_w_genres = books_w_genres[['book_id', 'goodreads_book_id', 'title', 'genre']]
books_w_genres.head()

Unnamed: 0,book_id,goodreads_book_id,title,genre
0,1,2767052,"The Hunger Games (The Hunger Games, #1)",Young Adult
1,2,3,Harry Potter and the Sorcerer's Stone (Harry P...,Fantasy
2,3,41865,"Twilight (Twilight, #1)",Young Adult
3,4,2657,To Kill a Mockingbird,Classics
4,5,4671,The Great Gatsby,Classics


In [20]:
genres = books_w_genres['genre'].unique()
genres

array(['Young Adult', 'Fantasy', 'Classics', 'Mystery', 'Fiction',
       'Non-fiction', 'Romance', 'Historical Fiction', 'Childrens',
       'Science Fiction', 'Chick-lit', 'Dystopian', 'Biography',
       'History', 'Graphic Novel', 'Humor'], dtype=object)

In [21]:
for g in genres:
    print(g)
len(genres)

Young Adult
Fantasy
Classics
Mystery
Fiction
Non-fiction
Romance
Historical Fiction
Childrens
Science Fiction
Chick-lit
Dystopian
Biography
History
Graphic Novel
Humor


16

In [22]:
df_dict = {}
for genre in genres:
    df_dict[genre] = pd.DataFrame()

for key in df_dict.keys():
    df_dict[key] = books_w_genres[:][books_w_genres.genre == key]

In [23]:
df_dict['Young Adult'].head()

Unnamed: 0,book_id,goodreads_book_id,title,genre
0,1,2767052,"The Hunger Games (The Hunger Games, #1)",Young Adult
2,3,41865,"Twilight (Twilight, #1)",Young Adult
5,6,11870085,The Fault in Our Stars,Young Adult
11,12,13335037,"Divergent (Divergent, #1)",Young Adult
16,17,6148028,"Catching Fire (The Hunger Games, #2)",Young Adult


In [24]:
for key in df_dict.keys():
    df_dict[key] = df_dict[key].merge(ratings, how='left', on='book_id')
    print(df_dict[key].head())

   book_id  goodreads_book_id                                    title  \
0        1            2767052  The Hunger Games (The Hunger Games, #1)   
1        1            2767052  The Hunger Games (The Hunger Games, #1)   
2        1            2767052  The Hunger Games (The Hunger Games, #1)   
3        1            2767052  The Hunger Games (The Hunger Games, #1)   
4        1            2767052  The Hunger Games (The Hunger Games, #1)   

         genre  user_id  rating  
0  Young Adult     2886       5  
1  Young Adult     6158       5  
2  Young Adult     3991       4  
3  Young Adult     5281       5  
4  Young Adult     5721       5  
   book_id  goodreads_book_id  \
0        2                  3   
1        2                  3   
2        2                  3   
3        2                  3   
4        2                  3   

                                               title    genre  user_id  rating  
0  Harry Potter and the Sorcerer's Stone (Harry P...  Fantasy        4 

   book_id  goodreads_book_id  \
0      743              28881   
1      743              28881   
2      743              28881   
3      743              28881   
4      743              28881   

                                               title  genre  user_id  rating  
0  Lamb: The Gospel According to Biff, Christ's C...  Humor       31       4  
1  Lamb: The Gospel According to Biff, Christ's C...  Humor       75       3  
2  Lamb: The Gospel According to Biff, Christ's C...  Humor      245       5  
3  Lamb: The Gospel According to Biff, Christ's C...  Humor      228       5  
4  Lamb: The Gospel According to Biff, Christ's C...  Humor      325       3  


In [26]:
for key in df_dict.keys():
    print(key)
    print(df_dict[key].shape)
    df_dict[key].to_pickle("dataframes/" + key + ".pkl")

Young Adult
(661833, 6)
Fantasy
(1013987, 6)
Classics
(766126, 6)
Mystery
(538687, 6)
Fiction
(1140144, 6)
Non-fiction
(517476, 6)
Romance
(247597, 6)
Historical Fiction
(230654, 6)
Childrens
(373396, 6)
Science Fiction
(189795, 6)
Chick-lit
(58408, 6)
Dystopian
(39097, 6)
Biography
(33352, 6)
History
(45146, 6)
Graphic Novel
(104264, 6)
Humor
(15638, 6)
