# Elise Hu: 2019 books

### Load Python tools 

In [1]:
import pandas as pd
import geopandas as gpd
from urllib.request import urlopen 
import pyarrow
import jenkspy
import matplotlib.pyplot as plt
%matplotlib inline
import json
import numpy as np
from altair import datum
import altair as alt
import altair_latimes as lat

In [2]:
# Select Altair's 'notebook' renderer for chart output.
alt.renderers.enable('notebook')

# Register the theme object from altair_latimes under one name, then activate it.
# enable() stays last so that its return value is what the cell displays.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [3]:
# Default matplotlib figure size, as (width, height).
plt.rcParams.update({'figure.figsize': (16, 8)})

# pandas display limits and number formatting (thousands separator, 2 decimals).
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', '{:,.2f}'.format)

# Turn off Altair's row limit on chart data; kept as the last expression so
# its return value remains the cell's output.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
# Load the reading log spreadsheet (path is relative to the notebook's
# working directory).
books_xlsx = 'input/elise-2019-books.xlsx'
src = pd.read_excel(books_xlsx)

In [None]:
# Inspect the column names and dtypes exactly as loaded from the spreadsheet,
# before any renaming or derived columns.
src.dtypes

Order                            int64
Title                           object
Author                          object
Pages                            int64
URL                             object
gender                          object
type                            object
Subgenre                        object
Avg GoodReads Rating           float64
Started                 datetime64[ns]
date                    datetime64[ns]
Faves                           object
dtype: object

In [None]:
# Normalise column headers to snake_case: trim, lowercase, spaces and hyphens
# become underscores, parentheses are dropped.
# regex=False makes every replacement a literal substitution. '(' and ')' are
# regex metacharacters and the default of `regex` differs between pandas
# versions, so relying on the default is fragile (and warns on pandas 1.x).
src.columns = (
    src.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
)

In [None]:
# Year of the 'date' column as text. The literal '.0' is stripped afterwards —
# presumably for the case where missing dates turn the year into a float
# (e.g. '2019.0'); verify against the data.
year_text = src['date'].dt.year.astype(str)
src['year'] = year_text.str.replace('.0', '', regex=False)

In [None]:
# Replace every space in the type label with a hyphen (literal substitution).
type_labels = src['type']
src['type'] = type_labels.str.replace(' ', '-', regex=False)

In [None]:
# Elapsed time per book: 'date' minus 'started' (both datetime columns), so the
# result is a timedelta at this point.
# NOTE(review): 'date' is presumably the finish date — confirm with the sheet.
ended_on = src['date']
began_on = src['started']
src['days_to_read'] = ended_on.sub(began_on)

In [None]:
# Spell out the one-letter gender codes; values not listed in the mapping are
# left unchanged by replace().
gender_labels = {'M': 'Male', 'F': 'Female'}
src['gender_description'] = src['gender'].replace(gender_labels)

In [None]:
# Re-check dtypes after the renaming and derived columns above.
src.dtypes

In [None]:
# Whole days between 'started' and 'date'.
# Recomputed from the two date columns rather than converting 'days_to_read'
# in place: `.dt` only exists on datetime-like data, so the in-place version
# raised AttributeError whenever this cell was run a second time (the column
# had already become numeric). This form gives the same result on every run.
src['days_to_read'] = (src['date'] - src['started']).dt.days

In [None]:
# One tick per book along 'date', in a separate row and colour for each
# author gender. Tick marks on both axes are drawn in white.
month_axis = alt.Axis(tickCount=12, format='%b', tickColor='#ffffff')
gender_axis = alt.Axis(tickCount=0, tickColor='#ffffff')

gender_ticks = alt.Chart(src).mark_tick().encode(
    x=alt.X('date', axis=month_axis, title=''),
    y=alt.Y('gender_description', axis=gender_axis, title=' '),
    color=alt.Color('gender_description', legend=None),
)

# Grid lines off; short, wide strip.
gender_ticks.configure_axis(grid=False).properties(height=100, width=320)

In [None]:
# Working copy for the analysis below.
# pd.DataFrame(src) does not copy a DataFrame input by default, so the two
# frames would share the same underlying data; .copy() makes `books`
# independent of `src`.
books = src.copy()

---

### No relationship between Goodreads score and time it took to read

In [None]:
# Keep books read in under 30 days whose average Goodreads rating is above 2.
# Rows with a missing value in either column fail the comparison and drop out.
under_30_days = books['days_to_read'].lt(30)
rated_above_2 = books['avg_goodreads_rating'].gt(2)
books_quick = books[under_30_days & rated_above_2]

In [None]:
# Only the two numeric columns being compared.
corr_columns = ['avg_goodreads_rating', 'days_to_read']
books_corr = books_quick.loc[:, corr_columns]

In [None]:
# Pairwise Pearson correlation matrix of the columns in books_corr.
corr = books_corr.corr(method ='pearson')

In [None]:
# Bare last expression so the notebook renders the correlation matrix with its
# rich table display instead of print()'s plain-text dump.
corr

In [None]:
# Scatter of average Goodreads rating (x) against days taken to read (y) for
# the filtered books_quick subset.
# NOTE(review): the x scale is pinned to 3-5 while books_quick keeps ratings
# above 2, so a book rated between 2 and 3 would sit outside the drawn axis —
# confirm whether the domain or the filter should change.
alt.Chart(books_quick).mark_circle(size=60).encode(
    x=alt.X('avg_goodreads_rating:Q', title='Avg goodreads rating', axis=alt.Axis(tickCount=6), scale=alt.Scale(domain=(3, 5))),
    y=alt.Y('days_to_read:Q', title='Days to read', axis=alt.Axis(tickCount=5)),
    # Author names are unordered labels, so they are nominal (:N) like the
    # title rather than ordinal (:O).
    tooltip=['title:N', 'author:N', 'avg_goodreads_rating:Q', 'days_to_read:Q']
).properties(width=500, height=500)

---

In [None]:
# Preview the first rows of the cleaned frame.
books.head()

In [None]:
# Number of books in each (type, year) pair, largest groups first.
type_year_sizes = books.groupby(['type', 'year']).size()
book_type = (
    type_year_sizes
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
book_type.head()

In [None]:
# Stacked bar per year, normalised so each segment is a type's share of that
# year's books.
type_share_x = alt.X(
    'sum(count)',
    stack="normalize",
    title='Share of books read by genre',
    axis=alt.Axis(format='%', tickCount=6),
)
# Segments inside each bar are ordered by type, ascending.
type_segment_order = alt.Order('type', sort='ascending')
type_colour = alt.Color('type', legend=alt.Legend(orient="top", title='Book type'))

book_type_chart = alt.Chart(book_type).mark_bar().encode(
    x=type_share_x,
    y=alt.Y('year', title=''),
    order=type_segment_order,
    color=type_colour,
)

book_type_chart.properties(height=30, width=700)

In [None]:
# Number of books per subgenre, most-read subgenres first.
subgenre_sizes = books.groupby('subgenre').size()
book_genre = (
    subgenre_sizes
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
book_genre.head()

In [None]:
# Horizontal bar chart of the number of books read in each subgenre, with the
# bars ordered by that count.
alt.Chart(book_genre).mark_bar().encode(
    y=alt.Y("subgenre:N", title=' ',
           sort=alt.EncodingSortField(
            # Plain column name: the ':Q' type suffix is only understood in
            # channel shorthand strings, so "count:Q" here named a field that
            # does not exist and the requested sort had no effect.
            field="count",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="ascending"  # The order to sort in
        )),
    x=alt.X("count:Q", title=' ', axis=alt.Axis(tickCount=6, format=''))
).properties(height=500, width=700, title='Books read, by subgenre')

---

In [None]:
# Number of books in each (author gender, year) pair, largest groups first.
gender_year_sizes = books.groupby(['gender_description', 'year']).size()
book_gender = (
    gender_year_sizes
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
book_gender.head()

In [None]:
# Stacked bar per year, normalised so each segment is an author gender's share
# of that year's books. The legend is hidden; the text layer below labels the
# segments instead.
book_gender_chart = alt.Chart(book_gender).mark_bar().encode(
    x=alt.X('sum(count)', stack="normalize",  title='Share of books read by author gender',
            axis=alt.Axis(format='%',tickCount=6)),
    y=alt.Y('year',title='',
            sort=alt.EncodingSortField(
            # Plain column name: a ':N' type suffix is not parsed inside
            # EncodingSortField, so "count:N" named a field that does not exist.
            # NOTE(review): op="count" counts rows per year rather than summing
            # the book totals — confirm that is the intended ordering.
            field="count",  # The field to use for the sort
            op="count",  # The operation to run on the field prior to sorting
            order="descending"  # The order to sort in
            )),
    color=alt.Color('gender_description', legend=None)
)

# Label layer: reuses the bar chart's encodings, swaps the mark for text and
# writes the gender name just inside the right edge of each segment.
# The labels are always white; the earlier alt.condition returned 'white' from
# both branches, so a constant value is equivalent.
book_gender_chart_text = book_gender_chart.mark_text(
    align='right',
    baseline='middle',
    dx=-10
).encode(
    text=alt.Text('gender_description'),
    color=alt.value('white')
)


(book_gender_chart + book_gender_chart_text).properties(height=30,width=700)

---

### Ratings by subgenre

In [None]:
# Mean of the average Goodreads rating for every (subgenre, author gender)
# pair, highest-rated pairs first.
rating_by_group = (
    books
    .groupby(['subgenre', 'gender_description'])
    .agg({'avg_goodreads_rating': 'mean'})
    .reset_index()
)
rating_by_group.sort_values(by='avg_goodreads_rating', ascending=False)