# Elise Hu: 2020 books

Quick analysis of the books she read last year.

### Load Python tools 

In [1]:
import pandas as pd
import geopandas as gpd
from urllib.request import urlopen 
import pyarrow
import jenkspy
import matplotlib.pyplot as plt
%matplotlib inline
import json
import numpy as np
from altair import datum
import altair as alt

In [2]:
alt.themes.enable('vox')

ThemeRegistry.enable('vox')

In [3]:
plt.rcParams['figure.figsize'] = (16,8)

In [4]:
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [5]:
pd.options.display.float_format = '{:,.2f}'.format

### Read book data from Elise

In [6]:
url = 'https://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=1Y2lvWJE1pRjvG6e-mJcpXDeANh9EK0nWHJEQIoJrOHM&exportFormat=csv'

In [7]:
# src = pd.read_csv(url, parse_dates=True, infer_datetime_format=True)

In [8]:
src = pd.read_excel('input/elise-2020-books.xls')

In [9]:
src.dtypes

Order                            int64
Title                           object
Author                          object
Pages                          float64
URL                             object
gender                          object
type                            object
Subgenre                        object
Avg GoodReads Rating           float64
Started                 datetime64[ns]
date                    datetime64[ns]
Faves                           object
dtype: object

### Clean up field headers, data types

In [10]:
src.columns = src.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace('-','_')

  """Entry point for launching an IPython kernel.


In [11]:
src['year'] = src['date'].dt.year.astype(str).str.replace('.0','',regex=False)

In [12]:
src['type'] = src['type'].str.replace(' ', '-')

In [13]:
src['days_to_read'] = src['date'] - src['started']

### Make gender more descriptive, calculate number of reading days

In [14]:
src['gender_description'] = src.gender.replace({'M': 'Male', 'F': 'Female'})

In [15]:
src['days_to_read'] = src['days_to_read'].dt.days

### Now start fresh with a dataframe called 'books'

In [16]:
books = pd.DataFrame(src)

### When did Elise finish her books?

In [17]:
# DESKTOP 
alt.Chart(books).mark_tick(thickness=2,size=20).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('year',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' ')
).configure_axis(
    grid=False).properties(height=40, width=700)

In [18]:
# MOBILE 
alt.Chart(books).mark_tick(thickness=2,size=20).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('year',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' ')
).configure_axis(
    grid=False).properties(height=40, width=320)

### Tinkering with more designs for books read by subgenre and month

In [19]:
#DESKTOP
alt.Chart(books).mark_tick(thickness=3,size=10).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('subgenre',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' ')
).configure_axis(grid=False).properties(height=400, width=700)

In [20]:
# mobile
alt.Chart(books).mark_tick(thickness=2,size=10).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('subgenre',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' ')
).configure_axis(grid=False).properties(height=400, width=320)

---

### Let's try books by author gender and month

In [21]:
#DESKTOP
alt.Chart(books).mark_tick(thickness=4,size=20).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('gender_description',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' '),
    color=alt.Color('gender_description', legend=None)
).configure_axis(
    grid=False).properties(height=100, width=700)

In [22]:
# MOBILE
alt.Chart(books).mark_tick(thickness=2,size=20).encode(
    x=alt.X('date',axis=alt.Axis(tickCount=12, format='%b', tickColor='#ffffff'), title=''),
    y=alt.Y('gender_description',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' '),
    color=alt.Color('gender_description', legend=None)
).configure_axis(
    grid=False).properties(height=100, width=320)

### Books by type and month

In [23]:
# DESKTOP 
alt.Chart(books).mark_tick(thickness=3,size=20).encode(
    x=alt.X('date',axis=alt.Axis(format='%B', tickColor='#ffffff', tickCount=7), title=''),
    y=alt.Y('type',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' '),
    color=alt.Color('type', legend=None)
).configure_axis(
    grid=False).properties(height=200, width=700, title='Books, by type and date finished')

In [24]:
# MOBILE
alt.Chart(books).mark_tick(thickness=2,size=20).encode(
    x=alt.X('date',axis=alt.Axis(format='%B', tickColor='#ffffff', tickCount=4), title=''),
    y=alt.Y('type',axis=alt.Axis(tickCount=0, tickColor='#ffffff'), title=' '),
    color=alt.Color('type', legend=None)
).configure_axis(
    grid=False).properties(height=200, width=320, title='Books, by type and date finished')

---

### Is there a strong relationship between the number of pages and days to read? No

In [25]:
# Remove handful of books that took months to finish
books_month = books[books['days_to_read'] < 30]

In [26]:
books_corr = books_month[['pages', 'days_to_read']]

In [27]:
corr = books_corr.corr(method ='pearson')

In [28]:
print(corr)

              pages  days_to_read
pages          1.00          0.28
days_to_read   0.28          1.00


In [29]:
#DESKTOP 
alt.Chart(books_month).mark_circle(size=60).encode(
    x=alt.X('days_to_read:Q', title='Days to read', axis=alt.Axis(tickCount=6)),
    y=alt.Y('pages:Q', title='Number of pages', axis=alt.Axis(tickCount=5)),
    tooltip=['title:N', 'author:O', 'avg_goodreads_rating:Q', 'pages:Q', 'days_to_read:Q']
).properties(width=500, height=500)

In [30]:
# MOBILE
alt.Chart(books_month).mark_circle(size=60).encode(
    x=alt.X('days_to_read:Q', title='Days to read', axis=alt.Axis(tickCount=6)),
    y=alt.Y('pages:Q', title='Number of pages', axis=alt.Axis(tickCount=5)),
    tooltip=['title:N', 'author:O', 'avg_goodreads_rating:Q', 'pages:Q', 'days_to_read:Q']
).properties(width=320, height=320)

---

### Which types of books did Elise read most?

In [31]:
book_type = books.groupby(['type', 'year']).agg('size').reset_index(name='count').sort_values(by='count', ascending=False)

In [32]:
book_type.head()

Unnamed: 0,type,year,count
0,Fiction,2020,20
1,Non-Fiction,2020,18
2,Poetry,2020,1


In [33]:
book_type['share'] = (book_type['count'] / 52)*100

In [34]:
book_type.head()

Unnamed: 0,type,year,count,share
0,Fiction,2020,20,38.46
1,Non-Fiction,2020,18,34.62
2,Poetry,2020,1,1.92


In [35]:
book_type_chart = alt.Chart(book_type).mark_bar().encode(
    x=alt.X('sum(count)', stack="normalize",  title='Share of books read by genre', \
            axis=alt.Axis(format='%',tickCount=6)),
    y=alt.Y('year',title=''),
    order=alt.Order(
      # Sort the segments of the bars by this field
      'type',
      sort='ascending'
    ),
    color=alt.Color('type', legend=alt.Legend(orient="top", title='Book type'))
)

(book_type_chart).properties(height=40,width=700)

In [36]:
(book_type_chart).properties(height=40,width=320)

---

### Which genre of books did Elise read most?

In [37]:
book_genre = books.groupby('subgenre').agg('size').reset_index(name='count')\
.sort_values(by='count', ascending=False)
book_genre.head()

Unnamed: 0,subgenre,count
2,Contemporary,10
4,Essays,4
9,Psychology,3
12,Short Stories,3
0,Business,2


### Books read by sub-genre

In [38]:
#DESKTOP
book_genre_chart = alt.Chart(book_genre).mark_bar().encode(
    y=alt.Y("subgenre:N", title=' ',
           sort=alt.EncodingSortField(
            field="count:Q",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="ascending"  # The order to sort in
        )),
    x=alt.X("count:Q", title=' ', axis=alt.Axis(tickCount=6, format=''))
)

book_genre_chart.properties(height=500, width=700, title='')

In [39]:
#MOBILE 
book_genre_chart.properties(height=500, width=320, title='')

---

### Memior as share of all non-fiction books read

In [40]:
nonfiction = books[books['type'] == 'Non-Fiction'].groupby('subgenre').agg('size').reset_index(name='count')\
.sort_values(by='count', ascending=False)
nonfiction.head(20)

Unnamed: 0,subgenre,count
2,Essays,4
6,Psychology,3
0,Business,2
1,Current Events,2
7,Self Help,2
3,History,1
4,Memoir,1
5,Poetry,1
8,Spirituality,1
9,Writing,1


In [41]:
nonfiction['share'] = (nonfiction['count'] / 28) * 100

In [42]:
nonfiction.head()

Unnamed: 0,subgenre,count,share
2,Essays,4,14.29
6,Psychology,3,10.71
0,Business,2,7.14
1,Current Events,2,7.14
7,Self Help,2,7.14


### What share of the books Elise read were by female authors? 

In [43]:
book_gender = books.groupby(['gender_description', 'year'])\
.agg('size').reset_index(name='count').sort_values(by='count', ascending=False)

In [44]:
book_gender.head()

Unnamed: 0,gender_description,year,count
0,Female,2020,24
1,Male,2020,15


In [45]:
book_gender['share'] = (book_gender['count'] / 52)*100

### How does that gender breakdown look in a bar chart?

In [46]:
book_gender_chart = alt.Chart(book_gender).mark_bar().encode(
    x=alt.X('sum(count)', stack="normalize",  title='Share of books read by author gender',\
            axis=alt.Axis(format='%',tickCount=6)),
    y=alt.Y('year',title='',
            sort=alt.EncodingSortField(
            field="count:N",  # The field to use for the sort
            op="count",  # The operation to run on the field prior to sorting
            order="descending"  # The order to sort in)
            )),
    color=alt.Color('gender_description', legend=None)
)

book_gender_chart_text = book_gender_chart.mark_text(
    align='right',
    baseline='middle',
    dx=-10
).encode(text=alt.Text('gender_description'), 
    color=alt.condition(
        alt.datum.year == '2019',
        alt.value('white'),
        alt.value('white')
    )
)

(book_gender_chart + book_gender_chart_text).properties(height=30,width=700)

In [47]:
(book_gender_chart + book_gender_chart_text).properties(height=30,width=320)

---

### Page counts by subgenre

In [48]:
books.groupby(['subgenre']).agg({ 'pages': 'sum' }).reset_index()\
.sort_values(by='pages',ascending=False).head(10)

Unnamed: 0,subgenre,pages
2,Contemporary,2512.0
4,Essays,928.0
16,historical,711.0
15,Young Adult,690.0
12,Short Stories,578.0
9,Psychology,575.0
0,Business,553.0
11,Self Help,488.0
3,Current Events,456.0
10,Romance,421.0


### Ratings by subgenre

In [49]:
books.groupby(['subgenre']).agg({ 'avg_goodreads_rating': 'mean' }).reset_index()\
.sort_values(by='avg_goodreads_rating',ascending=False).head(10)

Unnamed: 0,subgenre,avg_goodreads_rating
6,LGBT,4.52
7,Memoir,4.46
3,Current Events,4.38
8,Poetry,4.26
14,Writing,4.23
9,Psychology,4.23
0,Business,4.17
12,Short Stories,4.17
4,Essays,4.16
13,Spirituality,4.15


### Favorite books by Goodreads rating

In [50]:
books_slim = books[['author', 'title', 'type', 'avg_goodreads_rating']][books['faves'] == 'Y']\
.sort_values(by='avg_goodreads_rating', ascending=False).head()

In [51]:
books_slim

Unnamed: 0,author,title,type,avg_goodreads_rating
19,Tanehisi Coates,Between the World and Me,Non-Fiction,4.52
29,Danez Smith,Don't Call us Dead: Poems,Poetry,4.52
23,Audre Lorde,The Masters Tools Will Never DIsmantle The Mas...,Non-Fiction,4.46
35,James Hollis,The Middle Passage,Non-Fiction,4.44
11,Anne Lamott,Bird by Bird: Some Instructions on Writing and...,Non-Fiction,4.23


In [52]:
book_ratings = books.groupby(['type']).agg({'avg_goodreads_rating': 'mean'}).reset_index()

In [53]:
book_ratings

Unnamed: 0,type,avg_goodreads_rating
0,Fiction,3.77
1,Non-Fiction,4.21
2,Poetry,4.52


---

### Export books list to CSV

In [54]:
books.to_csv('output/books.csv')