Spring 2025 <br>
Lecture 04

# Comparing Categories (Part 1)
Using vertical and horizontal bar plots

Bar graph == bar chart == bar plot

In [77]:
# Installs

# ! conda install plotly==6.0.0 -y

In [78]:
# Imports
import pandas as pd
import plotly.express as px
from great_tables import (
    GT, md, google_font, style, loc # use fonts.google.com to select fonts
)

# Disable scientific notation: show every float with exactly two decimal places.
# 'display.float_format' is the registered option name; pandas resolves option keys
# by regex, so near-miss spellings only work by accident.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load data: read the Top 500 novels metadata CSV (path is relative to the working directory)
df = pd.read_csv(
    filepath_or_buffer='data/top-500-novels-metadata_2025-01-11.csv',
)

## Examples

1. Data manipulations:

    - Which author(s) have the most total ratings across their novels? Return the top 10.
    - Which author(s) with at least 5 books have the most ratings per book (total ratings ÷ number of books) across their novels?
    - Which author(s) have the highest average rating across their novels? Return the top 20 authors & round the ratings to 1 decimal point.

2. Create a table and heatmap for 1 of the above examples.

3. Create a vertical bar graph for 1 of these examples.

4. Create a horizontal bar graph for 1 of these examples.



### Example 1

In [79]:
# List the column names of the loaded dataset to see which variables can be grouped/aggregated
df.columns

Index(['top_500_rank', 'title', 'author', 'pub_year', 'orig_lang', 'genre',
       'author_birth', 'author_death', 'author_gender', 'author_primary_lang',
       'author_nationality', 'author_field_of_activity', 'author_occupation',
       'oclc_holdings', 'oclc_eholdings', 'oclc_total_editions',
       'oclc_holdings_rank', 'oclc_editions_rank', 'gr_avg_rating',
       'gr_num_ratings', 'gr_num_reviews', 'gr_avg_rating_rank',
       'gr_num_ratings_rank', 'oclc_owi', 'author_viaf', 'gr_url', 'wiki_url',
       'pg_eng_url', 'pg_orig_url'],
      dtype='object')

#### New Data Manipulation: Aggregation by Groups

List of aggregation function names
- 'sum' - Sum
- 'median' - Median
- 'mean' - Mean
- 'nunique' - Number of unique values (how many distinct values appear)

In [80]:
# Bullet 1
# Which author(s) have the most total ratings across their novels? Return the top 10.

# grouping variable      ---> 'author'
# aggregation variable   ---> 'gr_num_ratings'
# aggregation (function) ---> 'sum'

(
    df
    # as_index=False keeps the grouping variable as a regular column,
    # so there is no index to reset after aggregating
    .groupby('author', as_index=False)
    # Named aggregation:
    # output_column=('aggregation_variable', 'aggregation_function')
    .agg(gr_num_ratings=('gr_num_ratings', 'sum'))
    # largest totals first; ignore_index renumbers the rows 0, 1, 2, ... in the sorted order
    .sort_values(by='gr_num_ratings', ascending=False, ignore_index=True)
    # keep only the top 10 authors
    .head(10)
    # presentation-friendly column names
    .rename(columns={'author': 'Author Name', 'gr_num_ratings': 'Total Rating'})
)

Unnamed: 0,Author Name,Total Rating
0,J.K. Rowling,31983168
1,Suzanne Collins,15352583
2,J.R.R. Tolkien,8884254
3,George Orwell,8411378
4,Stephenie Meyer,8233531
5,John Green,8046410
6,Dan Brown,7340391
7,Jane Austen,6867180
8,Harper Lee,6353584
9,C.S. Lewis,5583158


In [81]:
# Bullet 2

# Which author(s) with at least 5 books, have the most total ratings per book across their novels?

# "With at least 5 books" -- Filter
# "Authors with the most total ratings per book" -- Groupby + Aggregation
# "Total ratings per book" ---- number of ratings / number of books

# How many books and how many total ratings does each author have?

# Grouping Variable --- 'author'
# Aggregation Variable #1 --- gr_num_ratings  & Aggregation Function #1 --- 'sum'
# Aggregation Variable #2 --- title & Aggregation Function #2 --- 'nunique'

# Step 1: one row per author (total ratings + number of distinct titles),
# keeping only authors with at least 5 books.
df_ex1b2 = (
    df
    .groupby(['author'])
    .agg(
        {
            'gr_num_ratings': 'sum',
            'title': 'nunique'
        }
    )
    .reset_index()
    # Filter ---> Title >= 5
    # "lambda x: x" stands in for the DataFrame produced by the previous step of the chain
    [lambda x: x['title'] >= 5]
)

# Step 2: build the display table from df_ex1b2 instead of repeating the groupby above.
# The new column could also be added in place:
#     df_ex1b2['ratings_per_book'] = df_ex1b2['gr_num_ratings'] / df_ex1b2['title']
# but .assign() returns a new DataFrame and leaves df_ex1b2 untouched,
# so this cell gives the same result every time it is re-run.
(
    df_ex1b2
    # Create a variable ----> total number of ratings per book ---> gr_num_ratings / title
    .assign(
        ratings_per_book = lambda x: x['gr_num_ratings']/x['title']
    )
    # Sort by our new column!
    .sort_values(by = 'ratings_per_book', ascending = False)
    # Only select the most relevant columns
    [
        [
            'author',
            'title',
            'ratings_per_book'
        ]
    ]
    # Rename the columns to be more descriptive
    .rename(
        columns = {
            'author': 'Author Name',
            'ratings_per_book': 'Ratings Count Per Book',
            'title': 'Number of Books'
        }
    )
    # reorder the index to follow the sorted order
    .reset_index(drop = True)
)


Unnamed: 0,Author Name,Number of Books,Ratings Count Per Book
0,J.K. Rowling,7,4569024.0
1,J.R.R. Tolkien,5,1776850.8
2,Dan Brown,5,1468078.2
3,Jane Austen,5,1373436.0
4,Stephen King,7,790065.29
5,C.S. Lewis,8,697894.75
6,Nicholas Sparks,7,565527.57
7,John Steinbeck,8,562282.88
8,Ernest Hemingway,5,445340.4
9,Charles Dickens,15,239534.0


In [82]:
# Bullet 3
#     Which author(s) have the highest average rating across their novels?
#     Return the top 20 authors & round the ratings to 1 decimal point.

# Grouping Variable    ---> 'author'
# Aggregation Variable ---> 'gr_avg_rating'
# Aggregation Function ---> 'mean'

df_average_author_ratings = (
    df
    # as_index=False keeps 'author' as a regular column (no reset_index needed)
    .groupby('author', as_index=False)
    ['gr_avg_rating']
    .mean()
    # best-rated authors first, with the rows renumbered 0, 1, 2, ... (clean index for GT usage)
    .sort_values(by='gr_avg_rating', ascending=False, ignore_index=True)
    # top 20 authors (selected on the unrounded means)
    .head(20)
    # round to 1 decimal point
    .round(1)
    # presentation-friendly column names
    .rename(columns={'author': 'Author Name', 'gr_avg_rating': 'Average Rating'})
)

display(df_average_author_ratings)

Unnamed: 0,Author Name,Average Rating
0,J.K. Rowling,4.5
1,Kathryn Stockett,4.5
2,Alex Haley,4.4
3,George R.R. Martin,4.4
4,Markus Zusak,4.4
5,Khaled Hosseini,4.4
6,J.R.R. Tolkien,4.3
7,Ken Follett,4.3
8,Thomas Keneally,4.3
9,Michael Shaara,4.3


### Example 2

Table & Heatmap

In [83]:
# Table + heatmap of the average rating per author (built from df_average_author_ratings)
(
    GT(df_average_author_ratings)
    # Add title and subtitle (md() lets us use Markdown / <br> inside the text)
    .tab_header(
        title = md(
            '**Best authors have similar average ratings**'
        ),
        subtitle = md(
            'The Top 20 best authors all have mean Goodreads ratings within<br>0.2 of each other across their entire corpuses.'
        )
    )
    # Left align
    .opt_align_table_header('left')
    # Title font
    # 'Times New Roman' is a system font, not a Google Fonts family, so it is passed
    # as a plain name; google_font() is only for families listed on fonts.google.com
    .tab_style(
        style=style.text(font='Times New Roman'),
        locations=loc.title()
    )
    # Table font (also a system font, so a plain name rather than google_font())
    .opt_table_font(font='Helvetica')
    # Additional Tweaks
    .tab_options(
        # Set font sizes
        heading_title_font_size='24px',
        heading_subtitle_font_size='14px',
        column_labels_font_size='14px',
        table_font_size='16px',
        # Row padding
        # NOTE(review): the other options here are CSS lengths with units (e.g. '14px');
        # this value has no unit -- confirm whether '0.75px' (or similar) was intended
        data_row_padding=0.75
    )
    # Heatmap: shade the 'Average Rating' cells on a grey scale spanning ratings 4 to 5
    .data_color(
        domain=[4,5],
        columns = 'Average Rating',
        palette='Greys'
    )
)

Best authors have similar average ratings,Best authors have similar average ratings
The Top 20 best have authors all have mean Goodreads ratings within 0.2 of each other across their entire corpuses.,The Top 20 best have authors all have mean Goodreads ratings within 0.2 of each other across their entire corpuses.
Author Name,Average Rating
J.K. Rowling,4.5
Kathryn Stockett,4.5
Alex Haley,4.4
George R.R. Martin,4.4
Markus Zusak,4.4
Khaled Hosseini,4.4
J.R.R. Tolkien,4.3
Ken Follett,4.3
Thomas Keneally,4.3
Michael Shaara,4.3


### Example 3

Vertical Bar Plot -----> Bars are vertical 

In [113]:
# px.bar needs at least 3 arguments - dataset, x, y

# plotly text (titles, labels) is styled with HTML tags only --> <>

# <b> ---- Bold    ---- <b>This text is bold!</b>
# <i> ---- Italics ---- <i>This text is in italics!</i>

px.bar(
    df_average_author_ratings,
    # Vertical bars: categories on the x-axis, values on the y-axis
    x = 'Author Name',
    y = 'Average Rating',
    title = '<b>Best authors have similar ratings</b>',
    subtitle = 'The Top 20 best authors all have mean Goodreads ratings within 0.2 of each other across their entire corpuses.',
    height = 650,
    width = 1000,
    # Bar color
    color_discrete_sequence=['cornflowerblue'],
    # Template
    template='simple_white',
    # Range: start the y-axis at 0 and end at 5, the top of the range shown
    range_y = [0, 5]
)

### Example 4

Horizontal Bar Plot -----> Bars are horizontal 

In [114]:
px.bar(
    # Re-sort ascending for the horizontal bar plot
    df_average_author_ratings.sort_values(by = 'Average Rating', ascending=True),
    # Horizontal bars: values on the x-axis, categories on the y-axis
    x = 'Average Rating',
    y = 'Author Name',
    title = '<b>Best authors have similar ratings</b>',
    subtitle = 'The Top 20 best authors all have mean Goodreads ratings within 0.2 of each other across their entire corpuses.',
    # We want each author name i.e. each bar label to be visible
    height = 650,
    # We don't need all of that horizontal space
    width = 500,
    # Bar color
    color_discrete_sequence=['lightgray'],
    # Template
    template='simple_white',
    # Range: start the x-axis at 0 and end at 5, the top of the range shown
    range_x = [0, 5]
)