# Elise Hu: 2020 books

Quick analysis of the books she read last year.

### Load Python tools 

In [2]:
import pandas as pd
import geopandas as gpd
from urllib.request import urlopen 
import jenkspy
import matplotlib.pyplot as plt
%matplotlib inline
import json
import numpy as np
from altair import datum
import altair as alt

In [3]:
# Use Altair's 'vox' theme for the charts in this notebook.
alt.themes.enable('vox')

ThemeRegistry.enable('vox')

In [4]:
# Default matplotlib figure size as (width, height).
plt.rcParams['figure.figsize'] = (16,8)

In [5]:
# Let pandas show up to 50 columns / 1,000 rows, and lift Altair's row cap on chart data.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 1000)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
# Render floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

### Read book data from Elise

In [7]:
# Google Sheets CSV-export link for the book list.
# NOTE(review): only the disabled read_csv loader below references this; the data is read from a local Excel file.
url = 'https://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=1Y2lvWJE1pRjvG6e-mJcpXDeANh9EK0nWHJEQIoJrOHM&exportFormat=csv'

In [8]:
# Disabled alternative loader: read the sheet straight from `url` instead of the local Excel copy.
# src = pd.read_csv(url, parse_dates=True, infer_datetime_format=True)

In [9]:
# Load the book list from the local Excel file (path is relative to the working directory).
src = pd.read_excel('input/elise-2020-books.xls')

In [10]:
# Check the inferred column types before cleaning.
src.dtypes

Order                            int64
Title                           object
Author                          object
Pages                          float64
URL                             object
gender                          object
type                            object
Subgenre                        object
Avg GoodReads Rating           float64
Started                 datetime64[ns]
date                    datetime64[ns]
Faves                           object
dtype: object

### Clean up field headers, data types

In [11]:
# Normalise headers to snake_case: trim, lowercase, spaces and hyphens -> underscores,
# parentheses dropped. regex=False makes every pattern a literal string, so '(' and ')'
# are never interpreted as regular-expression syntax and pandas emits no FutureWarning
# about the changing `regex` default.
src.columns = (
    src.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
)

  src.columns = src.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace('-','_')


In [12]:
# Year finished, stored as text; a trailing '.0' (present when the year is a float) is stripped.
year_text = src['date'].dt.year.astype(str)
src['year'] = year_text.str.replace('.0', '', regex=False)

In [13]:
# Hyphenate the book type labels (spaces become '-').
src['type'] = src['type'].str.replace(pat=' ', repl='-')

In [14]:
# Elapsed time between starting and finishing each book (finish date minus start date).
src['days_to_read'] = src['date'].sub(src['started'])

### Make gender more descriptive, calculate number of reading days

In [15]:
# Spell out the single-letter gender codes; any other value passes through unchanged.
gender_labels = {'M': 'Male', 'F': 'Female'}
src['gender_description'] = src['gender'].replace(gender_labels)

In [16]:
# Whole days spent on each book. Recomputed from the date columns rather than from the
# existing 'days_to_read' column, so re-running this cell is safe: the previous form
# called .dt on a column that is already integer after the first run and raised.
src['days_to_read'] = (src['date'] - src['started']).dt.days

### Now start fresh with a dataframe called 'books'

In [17]:
# Independent copy of the cleaned frame. pd.DataFrame(src) does not copy by default,
# so `books` would share data with `src` and edits to one could show up in the other.
books = src.copy()

In [20]:
# Preview the first rows of the cleaned data.
books.head()

Unnamed: 0,order,title,author,pages,url,gender,type,subgenre,avg_goodreads_rating,started,date,faves,year,days_to_read,gender_description
0,1,How to Do Nothing,Jenny Odell,240.0,https://www.goodreads.com/author/show/18614151...,F,Non-Fiction,Self Help,3.97,2019-12-25,2020-01-06,,2020,12,Female
1,2,Becoming Wise,Krista Tippett,288.0,https://www.goodreads.com/book/show/25894085-b...,F,Non-Fiction,Spirituality,4.15,2020-01-01,2020-01-12,Y,2020,11,Female
2,3,Rework,"Jason Fried, David Hansson",279.0,https://www.goodreads.com/book/show/6732019-re...,M,Non-Fiction,Business,3.94,2020-01-12,2020-01-17,,2020,5,Male
3,4,Minor Feelings,Cathy Park Hong,224.0,https://www.goodreads.com/book/show/47544177-m...,F,Non-Fiction,Essays,4.53,2020-01-14,2020-01-26,,2020,12,Female
4,5,The Idiot,Elif Batuman,423.0,https://www.goodreads.com/book/show/30962053-t...,F,Fiction,historical,3.63,2020-01-25,2020-02-02,,2020,8,Female


### When did Elise finish her books?

In [18]:
# DESKTOP: one tick per book along a month axis, in a single row for the year.
(
    alt.Chart(books)
    .mark_tick(thickness=2, size=20)
    .encode(
        x=alt.X('date', title='', axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')),
        y=alt.Y('year', title=' ', axis=alt.Axis(tickCount=0, tickColor='#ffffff')),
    )
    .configure_axis(grid=False)
    .properties(height=40, width=700)
)

In [19]:
# MOBILE: same finish-date strip as the desktop version, 320px wide.
(
    alt.Chart(books)
    .mark_tick(thickness=2, size=20)
    .encode(
        x=alt.X('date', title='', axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')),
        y=alt.Y('year', title=' ', axis=alt.Axis(tickCount=0, tickColor='#ffffff')),
    )
    .configure_axis(grid=False)
    .properties(height=40, width=320)
)

### Tinkering with more designs for books read by subgenre and month

In [None]:
# DESKTOP: finish dates as ticks, one row per subgenre.
month_axis = alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')
row_axis = alt.Axis(tickCount=0, tickColor='#ffffff')

(
    alt.Chart(books)
    .mark_tick(thickness=3, size=10)
    .encode(
        x=alt.X('date', axis=month_axis, title=''),
        y=alt.Y('subgenre', axis=row_axis, title=' '),
    )
    .configure_axis(grid=False)
    .properties(height=400, width=700)
)

In [None]:
# MOBILE: finish dates as ticks, one row per subgenre, 320px wide.
month_axis = alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')
row_axis = alt.Axis(tickCount=0, tickColor='#ffffff')

(
    alt.Chart(books)
    .mark_tick(thickness=2, size=10)
    .encode(
        x=alt.X('date', axis=month_axis, title=''),
        y=alt.Y('subgenre', axis=row_axis, title=' '),
    )
    .configure_axis(grid=False)
    .properties(height=400, width=320)
)

---

### Let's try books by author gender and month

In [None]:
# DESKTOP: finish dates as ticks, one colored row per author gender (no legend).
(
    alt.Chart(books)
    .mark_tick(thickness=4, size=20)
    .encode(
        x=alt.X('date', title='', axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')),
        y=alt.Y('gender_description', title=' ', axis=alt.Axis(tickCount=0, tickColor='#ffffff')),
        color=alt.Color('gender_description', legend=None),
    )
    .configure_axis(grid=False)
    .properties(height=100, width=700)
)

In [None]:
# MOBILE: finish dates as ticks, one colored row per author gender, 320px wide.
(
    alt.Chart(books)
    .mark_tick(thickness=2, size=20)
    .encode(
        x=alt.X('date', title='', axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')),
        y=alt.Y('gender_description', title=' ', axis=alt.Axis(tickCount=0, tickColor='#ffffff')),
        color=alt.Color('gender_description', legend=None),
    )
    .configure_axis(grid=False)
    .properties(height=100, width=320)
)

### Books by type and month

In [None]:
# DESKTOP: finish dates as ticks, one colored row per book type, full month names on the axis.
full_month_axis = alt.Axis(format='%B', tickColor='#ffffff', tickCount=7)
type_axis = alt.Axis(tickCount=0, tickColor='#ffffff')

(
    alt.Chart(books)
    .mark_tick(thickness=3, size=20)
    .encode(
        x=alt.X('date', axis=full_month_axis, title=''),
        y=alt.Y('type', axis=type_axis, title=' '),
        color=alt.Color('type', legend=None),
    )
    .configure_axis(grid=False)
    .properties(height=200, width=700, title='Books, by type and date finished')
)

In [None]:
# MOBILE: finish dates as ticks by book type; fewer month ticks to fit 320px.
full_month_axis = alt.Axis(format='%B', tickColor='#ffffff', tickCount=4)
type_axis = alt.Axis(tickCount=0, tickColor='#ffffff')

(
    alt.Chart(books)
    .mark_tick(thickness=2, size=20)
    .encode(
        x=alt.X('date', axis=full_month_axis, title=''),
        y=alt.Y('type', axis=type_axis, title=' '),
        color=alt.Color('type', legend=None),
    )
    .configure_axis(grid=False)
    .properties(height=200, width=320, title='Books, by type and date finished')
)

---

### Is there a strong relationship between the number of pages and days to read? No

In [None]:
# Keep only books finished in under 30 days; the few that dragged on for months are dropped.
finished_within_month = books['days_to_read'] < 30
books_month = books.loc[finished_within_month]

In [None]:
# Only the two numeric columns being compared.
corr_columns = ['pages', 'days_to_read']
books_corr = books_month[corr_columns]

In [None]:
# Pearson correlation matrix between page count and days to read.
corr = books_corr.corr(method ='pearson')

In [None]:
# Show the correlation matrix as plain text.
print(corr)

In [None]:
# DESKTOP: pages vs. days to read for the under-30-day books, with book details on hover.
hover_fields = ['title:N', 'author:O', 'avg_goodreads_rating:Q', 'pages:Q', 'days_to_read:Q']

(
    alt.Chart(books_month)
    .mark_circle(size=60)
    .encode(
        x=alt.X('days_to_read:Q', title='Days to read', axis=alt.Axis(tickCount=6)),
        y=alt.Y('pages:Q', title='Number of pages', axis=alt.Axis(tickCount=5)),
        tooltip=hover_fields,
    )
    .properties(width=500, height=500)
)

In [None]:
# MOBILE: pages vs. days to read, 320px square.
hover_fields = ['title:N', 'author:O', 'avg_goodreads_rating:Q', 'pages:Q', 'days_to_read:Q']

(
    alt.Chart(books_month)
    .mark_circle(size=60)
    .encode(
        x=alt.X('days_to_read:Q', title='Days to read', axis=alt.Axis(tickCount=6)),
        y=alt.Y('pages:Q', title='Number of pages', axis=alt.Axis(tickCount=5)),
        tooltip=hover_fields,
    )
    .properties(width=320, height=320)
)

---

### Which types of books did Elise read most?

In [None]:
# Number of books per (type, year), largest group first.
book_type = (
    books.groupby(['type', 'year'])
    .agg('size')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Preview the counts per book type.
book_type.head()

In [None]:
# Each type's share of all books read, in percent. The denominator comes from the data
# instead of a hard-coded 52, so the figure stays correct if the book list changes.
book_type['share'] = (book_type['count'] / len(books)) * 100

In [None]:
# Preview again, now including the 'share' column.
book_type.head()

In [None]:
# Single normalized bar: each book type's share of the year's reading.
share_axis = alt.Axis(format='%', tickCount=6)
# Stack the bar segments alphabetically by type.
segment_order = alt.Order('type', sort='ascending')

book_type_chart = (
    alt.Chart(book_type)
    .mark_bar()
    .encode(
        x=alt.X('sum(count)', stack="normalize", title='Share of books read by genre', axis=share_axis),
        y=alt.Y('year', title=''),
        order=segment_order,
        color=alt.Color('type', legend=alt.Legend(orient="top", title='Book type')),
    )
)

# DESKTOP width
book_type_chart.properties(height=40, width=700)

In [None]:
# MOBILE: same book-type bar at 320px wide.
(book_type_chart).properties(height=40,width=320)

---

### Which genre of books did Elise read most?

In [None]:
# Number of books per subgenre, most-read first.
book_genre = (
    books.groupby('subgenre')
    .agg('size')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
book_genre.head()

### Books read by sub-genre

In [None]:
#DESKTOP
# Horizontal bars: number of books per subgenre.
# The sort field must be the bare column name: EncodingSortField does not parse
# 'count:Q' shorthand, so the old value pointed at a column that does not exist
# and the requested ordering was never applied.
book_genre_chart = alt.Chart(book_genre).mark_bar().encode(
    y=alt.Y("subgenre:N", title=' ',
           sort=alt.EncodingSortField(
            field="count",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="ascending"  # The order to sort in; use "descending" to put the most-read subgenre on top
        )),
    x=alt.X("count:Q", title=' ', axis=alt.Axis(tickCount=6, format=''))
)

book_genre_chart.properties(height=500, width=700, title='')

In [None]:
#MOBILE
# Same subgenre bar chart at 320px wide.
book_genre_chart.properties(height=500, width=320, title='')

---

### Memoir as share of all non-fiction books read

In [None]:
# Non-fiction books counted per subgenre, most-read first.
is_nonfiction = books['type'] == 'Non-Fiction'
nonfiction = (
    books[is_nonfiction]
    .groupby('subgenre')
    .agg('size')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
nonfiction.head(20)

In [None]:
# Each subgenre's share of all non-fiction books, in percent. The total is counted
# from the data instead of a hard-coded 28, so it tracks the book list.
nonfiction_total = (books['type'] == 'Non-Fiction').sum()
nonfiction['share'] = (nonfiction['count'] / nonfiction_total) * 100

In [None]:
# Preview the non-fiction subgenre counts and shares.
nonfiction.head()

### What share of the books Elise read were by female authors? 

In [None]:
# Number of books per (author gender, year), largest group first.
book_gender = (
    books.groupby(['gender_description', 'year'])
    .agg('size')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Preview the counts per author gender.
book_gender.head()

In [None]:
# Each gender's share of all books read, in percent. The denominator comes from the
# data instead of a hard-coded 52, so the figure stays correct if the book list changes.
book_gender['share'] = (book_gender['count'] / len(books)) * 100

### How does that gender breakdown look in a bar chart?

In [None]:
# Single normalized bar: each author gender's share of the year's books.
book_gender_chart = alt.Chart(book_gender).mark_bar().encode(
    x=alt.X('sum(count)', stack="normalize", title='Share of books read by author gender',
            axis=alt.Axis(format='%', tickCount=6)),
    y=alt.Y('year', title='',
            sort=alt.EncodingSortField(
                field="count",  # bare column name; EncodingSortField does not parse ':N' shorthand
                op="count",  # The operation to run on the field prior to sorting
                order="descending"  # The order to sort in
            )),
    color=alt.Color('gender_description', legend=None)
)

# Text layer reuses the bar's encodings and writes the gender label in white.
# The earlier alt.condition tested year == '2019' but returned 'white' on both
# branches, so a constant value gives the same result.
book_gender_chart_text = book_gender_chart.mark_text(
    align='right',
    baseline='middle',
    dx=-10
).encode(
    text=alt.Text('gender_description'),
    color=alt.value('white')
)

# DESKTOP width
(book_gender_chart + book_gender_chart_text).properties(height=30,width=700)

In [None]:
# MOBILE: same layered gender bar at 320px wide.
(book_gender_chart + book_gender_chart_text).properties(height=30,width=320)

---

### Page counts by subgenre

In [None]:
# Ten subgenres with the most total pages read.
(
    books.groupby(['subgenre'])
    .agg({'pages': 'sum'})
    .reset_index()
    .sort_values(by='pages', ascending=False)
    .head(10)
)

### Ratings by subgenre

In [None]:
# Ten subgenres with the highest mean Goodreads rating.
(
    books.groupby(['subgenre'])
    .agg({'avg_goodreads_rating': 'mean'})
    .reset_index()
    .sort_values(by='avg_goodreads_rating', ascending=False)
    .head(10)
)

### Favorite books by Goodreads rating

In [None]:
# Books flagged as favorites ('Y'), top five by Goodreads rating, trimmed to four columns.
fave_columns = ['author', 'title', 'type', 'avg_goodreads_rating']
books_slim = (
    books.loc[books['faves'] == 'Y', fave_columns]
    .sort_values(by='avg_goodreads_rating', ascending=False)
    .head()
)

In [None]:
# Display the top-rated favorites.
books_slim

In [None]:
# Mean Goodreads rating for each book type.
book_ratings = (
    books.groupby(['type'])
    .agg({'avg_goodreads_rating': 'mean'})
    .reset_index()
)

In [None]:
# Display the mean rating per book type.
book_ratings

---

### Export books list to CSV

In [None]:
# Write the cleaned book list to CSV. The row index is written as the first (unnamed) column.
# NOTE(review): assumes the 'output/' directory already exists — confirm.
books.to_csv('output/books.csv')