# import data

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # charts/plots
%matplotlib inline
import re


# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Personal Goodreads library export: the books on my shelves.
books = pd.read_csv(
    '../input/goodreads-data-778-books/goodreads_library_export.csv'
)
# Goodreads book database with genres, read as line-delimited JSON
# (one record per line, hence lines=True).
genres = pd.read_json(
    '../input/books-with-genres/goodreads_book_genres_initial.json',
    lines=True,
)


In [2]:
# Preview the first five rows of my library export.
books.head(n=5)

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,Binding,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Recommended For,Recommended By,Owned Copies,Original Purchase Date,Original Purchase Location,Condition,Condition Description,BCID
0,16793,Stardust,Neil Gaiman,"Gaiman, Neil",,0061142026,9780061000000.0,0,4.08,Harper Perennial,Paperback,248.0,2006.0,1998.0,,5/26/2021,to-read,to-read (#189),to-read,,,,0,,,0,,,,,
1,6294,"Howl’s Moving Castle (Howl’s Moving Castle, #1)",Diana Wynne Jones,"Jones, Diana Wynne",,006441034X,9780064000000.0,0,4.25,Harper Trophy,Mass Market Paperback,329.0,2001.0,1986.0,,5/26/2021,to-read,to-read (#188),to-read,,,,0,,,0,,,,,
2,49392855,The Two Towers,J.R.R. Tolkien,"Tolkien, J.R.R.",Alan Lee,0008376131,9780008000000.0,0,4.45,HarperCollins,Hardcover,385.0,2020.0,1954.0,,10/8/2020,"owned, currently-reading","owned (#36), currently-reading (#3)",currently-reading,,,,2,,,1,,,unspecified,,
3,6568440,Notes from Underground,Fyodor Dostoyevsky,"Dostoyevsky, Fyodor",,,,4,4.15,,Kindle Edition,,,1864.0,5/26/2021,2/18/2021,"owned, reread","owned (#189), reread (#37)",read,,,,1,,,1,,,unspecified,,
4,54419239,Meet Me in Paradise,Libby Hubscher,"Hubscher, Libby",,0593199421,9780593000000.0,0,4.04,Berkley Books,Paperback,352.0,2021.0,2021.0,,5/15/2021,"owned, currently-reading","owned (#119), currently-reading (#2)",currently-reading,,,,1,,,1,,,unspecified,,


In [3]:
# Preview the first five rows of the raw genre data.
genres.head(n=5)

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


# clean genre data

In [4]:
def convert_genres_to_list(genres):
    """Return the genre labels of one book as a list of cleaned strings.

    Iterates over ``genres`` and, for every item, drops quote, brace, colon
    and digit characters. Iterating a dict yields its keys, so for a
    ``{label: count}`` mapping only the labels survive and the counts are
    discarded.

    Parameters
    ----------
    genres : iterable of str
        Genre labels for a single book. Presumably a dict mapping label to
        vote count, as the raw ``genres`` column preview suggests -- confirm
        against the input file. NOTE: inside this function the parameter
        shadows the module-level ``genres`` DataFrame.

    Returns
    -------
    list of str
        One cleaned string per item, in iteration order. A label that itself
        contains commas (e.g. ``'fantasy, paranormal'``) stays a single list
        entry; it is not split into separate genres.
    """
    # Characters treated as serialization filler and removed from each label.
    # Digits are included, so a label such as 'top100' would become 'top'.
    filler_characters = "'{}:1234567890"
    return [
        ''.join(c for c in genre_array if c not in filler_characters)
        for genre_array in genres
    ]

# Replace each book's raw genre value with a plain list of genre labels,
# then preview the first five rows of the result.
genres['genres'] = genres['genres'].apply(convert_genres_to_list)
genres.head(n=5)

Unnamed: 0,book_id,genres
0,5333265,"[history, historical fiction, biography]"
1,1333909,"[fiction, history, historical fiction, biography]"
2,7327624,"[fantasy, paranormal, fiction, mystery, thrill..."
3,6066819,"[fiction, romance, mystery, thriller, crime]"
4,287140,[non-fiction]


# what genres do I read?

In [10]:
# Join my books to their genre lists on the Goodreads id
# ('Book Id' in the library export, 'book_id' in the genre data).
# The merge uses the default inner join, so books without a matching
# genre record do not appear in the result.
#
# TODO (not implemented yet):
#   - make a new row for each genre
#   - group by genre
#   - pick out the top 5 genres (that aren't fiction)
#   - create pie chart
books_with_genres = books.merge(genres, left_on='Book Id', right_on='book_id')
books_with_genres

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,Binding,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Recommended For,Recommended By,Owned Copies,Original Purchase Date,Original Purchase Location,Condition,Condition Description,BCID,book_id,genres
0,16793,Stardust,Neil Gaiman,"Gaiman, Neil",,0061142026,9.780061e+12,0,4.08,Harper Perennial,Paperback,248.0,2006.0,1998.0,,5/26/2021,to-read,to-read (#189),to-read,,,,0,,,0,,,,,,16793,"[fantasy, paranormal, fiction, young-adult, ro..."
1,6294,"Howl’s Moving Castle (Howl’s Moving Castle, #1)",Diana Wynne Jones,"Jones, Diana Wynne",,006441034X,9.780064e+12,0,4.25,Harper Trophy,Mass Market Paperback,329.0,2001.0,1986.0,,5/26/2021,to-read,to-read (#188),to-read,,,,0,,,0,,,,,,6294,"[fantasy, paranormal, young-adult, fiction, ro..."
2,6568440,Notes from Underground,Fyodor Dostoyevsky,"Dostoyevsky, Fyodor",,,,4,4.15,,Kindle Edition,,,1864.0,5/26/2021,2/18/2021,"owned, reread","owned (#189), reread (#37)",read,,,,1,,,1,,,unspecified,,,6568440,"[fiction, history, historical fiction, biography]"
3,20493888,"Guards! Guards! (Discworld, #8)",Terry Pratchett,"Pratchett, Terry",,1473200180,9.781473e+12,0,4.29,Gollancz,Hardcover,344.0,2014.0,1989.0,,5/25/2021,"to-read, tbr","to-read (#187), tbr (#28)",to-read,,,,0,,,0,,,,,,20493888,"[fantasy, paranormal, fiction, comics, graphic..."
4,3577453,Golem XIV,Stanisław Lem,"Lem, Stanisław",Jozef Marušiak,8088965713,,0,4.02,Drewo a srd,,165.0,2003.0,1973.0,,5/25/2021,to-read,to-read (#186),to-read,,,,0,,,0,,,,,,3577453,"[fiction, fantasy, paranormal, young-adult]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,1934,Little Women,Louisa May Alcott,"Alcott, Louisa May",,0451529308,9.780452e+12,3,4.10,Signet Classics,Paperback,449.0,2004.0,1868.0,,10/6/2017,,,read,,,,1,,,0,,,,,,1934,"[fiction, history, historical fiction, biograp..."
600,5107,The Catcher in the Rye,J.D. Salinger,"Salinger, J.D.",,0316769177,9.780317e+12,3,3.81,Back Bay Books,Paperback,277.0,2001.0,1951.0,,10/6/2017,,,read,,,,1,,,0,,,,,,5107,"[fiction, young-adult, history, historical fic..."
601,15751404,"David and Goliath: Underdogs, Misfits, and the...",Malcolm Gladwell,"Gladwell, Malcolm",,0316204366,9.780316e+12,5,3.95,"Little, Brown and Company",Hardcover,305.0,2013.0,2013.0,,10/6/2017,,,read,,,,1,,,0,,,,,,15751404,"[non-fiction, history, historical fiction, bio..."
602,6452796,Drive: The Surprising Truth About What Motivat...,Daniel H. Pink,"Pink, Daniel H.",,1594488843,9.781594e+12,4,3.94,Riverhead Books,Hardcover,242.0,2009.0,2009.0,,10/6/2017,,,read,,,,1,,,0,,,,,,6452796,[non-fiction]


# how popular are the books I read?

In [6]:
# sort by rating
# put top 10 in a bar chart

# what are the worst books I read?

In [7]:
# sort by rating
# put bottom 10 in a bar chart

# sources

data source: https://github.com/MengtingWan/goodreads