In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta
import numpy as np

In [None]:
## Load the cleaned checkout data; the first CSV column is used as the index.
## NOTE(review): dtype={5: str} forces one column to text, presumably to avoid
## mixed-type inference -- confirm which column this targets.
juvenile_df = pd.read_csv(
    '../Data/juvenile_clean.csv',
    index_col=0,
    dtype={5: str},
)

In [None]:
## Parse CheckoutDate into datetimes so the .dt accessor and date arithmetic work later.
juvenile_df['CheckoutDate'] = juvenile_df['CheckoutDate'].pipe(pd.to_datetime)

## General Data Description

In [None]:
## Summary of the loaded frame: columns, non-null counts and dtypes.
juvenile_df.info()

In [None]:
## Row count per UsageClass; dropna=False also reports missing values.
juvenile_df['UsageClass'].value_counts(dropna=False)

In [None]:
## Row count per MaterialCategory (missing values are not listed).
juvenile_df['MaterialCategory'].value_counts()

In [None]:
## Summary statistics of CheckoutDate for the full data set.
juvenile_df['CheckoutDate'].describe()

## Identifying Popular Books

In [None]:
## Keep only the book-like material categories; reset_index() moves the old
## index into a column and renumbers the rows from 0.
book_categories = ['Book', 'E-book', 'Audiobook']
is_book_row = juvenile_df['MaterialCategory'].isin(book_categories)
books_df = juvenile_df[is_book_row].reset_index()

In [None]:
## Number of distinct (Title, CreatorName) pairs among the book rows.
len(books_df.drop_duplicates(subset=['Title', 'CreatorName']))

In [None]:
## Summary statistics of CheckoutDate for the book rows only.
books_df['CheckoutDate'].describe()

### Overall top books

In [None]:
## Total checkouts and earliest CreatedDate per (author, title); keep the ten
## titles with the highest totals.
top10_overall = (
    books_df
    .groupby(['CreatorName', 'Title'], as_index=False)
    .agg(TotalCheckouts=('Checkouts', 'sum'), Created=('CreatedDate', 'min'))
    .sort_values('TotalCheckouts', ascending=False)
    .head(10)
)

In [None]:
## Checkouts per (author, title, usage class) across all book rows.
usage_totals = (
    books_df
    .groupby(['CreatorName', 'Title', 'UsageClass'], as_index=False)['Checkouts']
    .sum()
)

## Attach the per-usage-class totals to the top 10 and spread them into one
## column per UsageClass value, ordered by total checkouts.
top10_overall = (
    top10_overall
    .merge(usage_totals, how='inner', on=['CreatorName', 'Title'])
    .pivot(
        index=['CreatorName', 'Title', 'Created', 'TotalCheckouts'],
        columns='UsageClass',
        values='Checkouts',
    )
    .reset_index()
    .sort_values('TotalCheckouts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
## Show the top-10 table with one column per usage class.
top10_overall

In [None]:
## Get data to graph
## Merge from books_df (the frame the ranking was computed on) so the plotted
## monthly series only contain the rows that were counted in TotalCheckouts.
top10_OT = pd.merge(books_df, top10_overall, how = 'inner', on = ['CreatorName', 'Title'])

## One colour per author; the palette wraps around so every author gets a
## colour even when the top 10 has more authors than palette entries.
palette = ['MediumVioletRed', 'DarkRed', 'CornflowerBlue', 'Navy', 'SeaGreen', 'Orange']
colors = {author: palette[k % len(palette)]
          for k, author in enumerate(top10_overall.CreatorName.unique())}

## Label layout, in data coordinates
LABEL_TOP_Y = 550                        ## y position of the #1 label
LABEL_STEP_Y = 60                        ## vertical gap between consecutive labels
LABEL_X_OFFSET = timedelta(days = 50)    ## gap between the end of a line and its label

## Build the graph
fig, ax = plt.subplots(figsize = (10,4), layout = 'constrained')

## Rank comes from the row order of top10_overall (already sorted by
## TotalCheckouts), not from its index labels.
ranked_books = zip(top10_overall['CreatorName'], top10_overall['Title'])
for rank, (author, title) in enumerate(ranked_books, start = 1):
    ## match on author as well as title so a same-titled work by someone else
    ## is never mixed into this line
    is_this_book = (top10_OT['Title'] == title) & (top10_OT['CreatorName'] == author)
    filtered = top10_OT.loc[is_this_book].groupby('CheckoutDate', as_index = False)['Checkouts'].sum()

    ax.plot(filtered['CheckoutDate'], filtered['Checkouts'],
            color = colors.get(author), label = title, alpha = .7)

    ax.text(x = filtered['CheckoutDate'].max() + LABEL_X_OFFSET,
            y = LABEL_TOP_Y - ((rank - 1) * LABEL_STEP_Y),
            s = f'#{rank}: {title} - {author}',
            color = colors.get(author))


## Graph title
fig.suptitle('Top 10 Most Popular Books', size = 'x-large', color = 'black')

## axis customization:
ax.tick_params(axis = 'both', colors = 'darkslategray', labelrotation = 0)
ax.spines[['top', 'right']].set_visible(False)
ax.spines[['left', 'bottom']].set_color('darkslategray')
## trailing ';' keeps the Text repr out of the cell output
ax.set_ylabel('Monthly Checkouts', color = 'darkslategray');

## To export the figure, uncomment and supply an output path:
##plt.savefig('', transparent = True)

### Top avg checkouts

### Top monthly checkout numbers

In [None]:
## Checkouts per (author, title, month); keep the ten largest single-month totals.
top_monthly_checkouts = (
    books_df
    .groupby(['CreatorName', 'Title', 'CheckoutDate'], as_index=False)['Checkouts']
    .sum()
    .rename(columns={'Checkouts': 'MonthlyCheckouts'})
    .sort_values('MonthlyCheckouts', ascending=False)
    .head(10)
)

In [None]:
## Checkouts per (author, title, month, usage class) across all book rows.
usage_by_title_month = (
    books_df
    .groupby(['CreatorName', 'Title', 'CheckoutDate', 'UsageClass'], as_index=False)['Checkouts']
    .sum()
)

## Attach the usage-class breakdown to the top months and spread it into one
## column per UsageClass value, ordered by the monthly total.
top_monthly_checkouts = (
    top_monthly_checkouts
    .merge(usage_by_title_month, how='inner', on=['CreatorName', 'Title', 'CheckoutDate'])
    .pivot(
        index=['CreatorName', 'Title', 'CheckoutDate', 'MonthlyCheckouts'],
        columns='UsageClass',
        values='Checkouts',
    )
    .reset_index()
    .sort_values('MonthlyCheckouts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
## Show the ten biggest title-months with their usage-class breakdown.
top_monthly_checkouts

### Authors with most monthly checkouts

In [None]:
## Per (author, month): total checkouts and number of distinct titles; keep the
## ten largest author-months.
top_author_checkouts = (
    books_df
    .groupby(['CreatorName', 'CheckoutDate'], as_index=False)
    .agg(MonthlyCheckouts=('Checkouts', 'sum'), Titles=('Title', 'nunique'))
    .sort_values('MonthlyCheckouts', ascending=False)
    .head(10)
)

In [None]:
## Checkouts per (author, month, usage class) across all book rows.
usage_by_author_month = (
    books_df
    .groupby(['CreatorName', 'CheckoutDate', 'UsageClass'], as_index=False)['Checkouts']
    .sum()
)

## Attach the usage-class breakdown to the top author-months and spread it into
## one column per UsageClass value, ordered by the monthly total.
top_author_checkouts = (
    top_author_checkouts
    .merge(usage_by_author_month, how='inner', on=['CreatorName', 'CheckoutDate'])
    .pivot(
        index=['CreatorName', 'CheckoutDate', 'Titles', 'MonthlyCheckouts'],
        columns='UsageClass',
        values='Checkouts',
    )
    .reset_index()
    .sort_values('MonthlyCheckouts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
## Show the ten biggest author-months with their usage-class breakdown.
top_author_checkouts

### Top books each year

In [None]:
## Calendar year of each checkout row, used for the per-year rankings.
books_df['CheckoutYear'] = books_df['CheckoutDate'].dt.year

In [None]:
## Rank the most checked-out titles within every complete calendar year.
TOP_N = 5
yearly_top_titles = []

## sort=False visits the years in order of first appearance in books_df
for _, year_df in books_df.groupby('CheckoutYear', sort = False):
    ## only compare complete years: the year must have data in all 12 calendar
    ## months (counted as distinct months, not distinct dates)
    if year_df['CheckoutDate'].dt.month.nunique() != 12:
        continue

    ranked = (
        year_df
        .groupby(['CheckoutYear', 'CreatorName', 'Title'], as_index = False)['Checkouts']
        .sum()
        .sort_values('Checkouts', ascending = False)
        .head(TOP_N)
    )
    ## number however many rows exist, so a year with fewer than TOP_N titles
    ## does not fail with a length-mismatch ValueError
    yearly_top_titles.append(ranked.assign(Rank = np.arange(1, len(ranked) + 1)))

## Concatenate once; fall back to an empty frame that still has the expected
## columns so the sort/display cells work when no year is complete.
if yearly_top_titles:
    top5_per_year = pd.concat(yearly_top_titles)
else:
    top5_per_year = pd.DataFrame(columns = ['CheckoutYear', 'CreatorName', 'Title', 'Checkouts', 'Rank'])

In [None]:
## Order chronologically, and within each year from most to fewest checkouts.
top5_per_year = top5_per_year.sort_values(
    by=['CheckoutYear', 'Checkouts'],
    ascending=[True, False],
)

In [None]:
## Lift the row-display limit so the per-year tables are shown in full; the
## limit is restored by the reset_option cell further down.
pd.options.display.max_rows = None
top5_per_year

### Top authors per year

In [None]:
## Rank the most checked-out authors within every complete calendar year.
TOP_N = 5
yearly_top_authors = []

## sort=False visits the years in order of first appearance in books_df
for _, year_df in books_df.groupby('CheckoutYear', sort = False):
    ## only compare complete years: the year must have data in all 12 calendar
    ## months (counted as distinct months, not distinct dates)
    if year_df['CheckoutDate'].dt.month.nunique() != 12:
        continue

    ranked = (
        year_df
        .groupby(['CheckoutYear', 'CreatorName'], as_index = False)
        .agg(Checkouts = ('Checkouts', 'sum'), Titles = ('Title', 'nunique'))
        .sort_values('Checkouts', ascending = False)
        .head(TOP_N)
    )
    ## number however many rows exist, so a year with fewer than TOP_N authors
    ## does not fail with a length-mismatch ValueError
    yearly_top_authors.append(ranked.assign(Rank = np.arange(1, len(ranked) + 1)))

## Concatenate once; fall back to an empty frame that still has the expected
## columns so the sort/display cells work when no year is complete.
if yearly_top_authors:
    top5_authors_py = pd.concat(yearly_top_authors)
else:
    top5_authors_py = pd.DataFrame(columns = ['CheckoutYear', 'CreatorName', 'Checkouts', 'Titles', 'Rank'])

In [None]:
## Order chronologically, and within each year from most to fewest checkouts.
top5_authors_py = top5_authors_py.sort_values(
    by=['CheckoutYear', 'Checkouts'],
    ascending=[True, False],
)

In [None]:
## Show the yearly top-author table.
top5_authors_py

In [None]:
## Restore the default row-display limit that was lifted for the per-year tables.
pd.reset_option('display.max_rows')

### Consistency

#### Which titles had the biggest spikes in popularity?

In [None]:
def q25(x):
    """Return the 25th percentile (first quartile) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method, e.g. a
    Series. The function name is significant: when passed to ``agg`` it
    becomes the ``q25`` column that later cells read.
    """
    lower_quartile = x.quantile(0.25)
    return lower_quartile

In [None]:
def q75(x):
    """Return the 75th percentile (third quartile) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method, e.g. a
    Series. The function name is significant: when passed to ``agg`` it
    becomes the ``q75`` column that later cells read.
    """
    upper_quartile = x.quantile(0.75)
    return upper_quartile

In [None]:
## The spread statistics describe a title's *monthly* checkouts, so first
## collapse all rows of a title within one month (e.g. different usage classes)
## into a single monthly total. Without this step 'count' counts raw rows
## rather than months, and min/median/max describe individual rows.
## Months in which a title has no rows are absent here, not counted as 0.
monthly_title_totals = books_df.groupby(['Title', 'CreatorName', 'CheckoutDate'], as_index = False)['Checkouts'].sum()

## One row per title: number of months with data plus the distribution of the
## monthly totals (q25 / q75 are the quartile helpers defined above).
books_spread = monthly_title_totals.groupby(['Title', 'CreatorName'], as_index = False)['Checkouts'].agg(['count', 'mean', 'min', q25, 'median', q75, 'max'])

In [None]:
## Distance between a title's largest and smallest checkout value.
books_spread['range'] = books_spread['max'].sub(books_spread['min'])

In [None]:
## Interquartile range: width of the middle half of a title's checkout values.
books_spread['IQR'] = books_spread['q75'].sub(books_spread['q25'])

In [None]:
## Expose the observation count under the name the later filter uses.
books_spread = books_spread.rename({'count': 'months'}, axis='columns')

In [None]:
## Ten titles with the widest gap between their best and worst values.
(
    books_spread
    .sort_values(by='range', ascending=False)
    .head(10)
)

#### Which titles had the most consistent popularity?

What does it mean to have consistent popularity?  
* Small IQR (the book is usually checked out the same number of times each month)

But that only finds *consistency*; we also need to find *popularity*:
* High checkout numbers

If we divide the max by the IQR, is a bigger number better?

In [None]:
## Ratio of the full range to the IQR; the +1 keeps the denominator non-zero
## for titles whose IQR is 0.
## NOTE(review): the notes above this cell talk about max / IQR, whereas this
## uses range -- confirm which one is intended.
books_spread['meas'] = books_spread['range'].div(books_spread['IQR'].add(1))

In [None]:
## Titles with at least 12 observations and a positive measure, lowest measure first.
has_enough_months = books_spread['months'].ge(12)
has_positive_meas = books_spread['meas'].gt(0)
books_spread[has_enough_months & has_positive_meas].sort_values(by='meas')

In [None]:
## Spot-check the spread statistics for one well-known author.
books_spread[books_spread['CreatorName'].eq('J. K. Rowling')]

In [None]:
## Upper fence: third quartile plus 1.5 times the IQR.
books_spread['upper_bound'] = books_spread['q75'].add(books_spread['IQR'].mul(1.5))

In [None]:
## Lower fence: first quartile minus 1.5 times the IQR.
books_spread['lower_bound'] = books_spread['q25'].sub(books_spread['IQR'].mul(1.5))

Other questions to explore:
* How does releasing a new book impact an author's popularity in the next 12 months? 60 months?
* 