# Histograms

In [None]:
from datascience import *
import numpy as np
import warnings
# Suppress every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the plotting
# libraries — confirm a blanket "ignore" is really wanted.
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to every plot below.
plots.style.use('fivethirtyeight')
# Force an edge to be drawn around patches (e.g. bars), so that adjacent
# bars are outlined individually.
plots.rcParams["patch.force_edgecolor"] = True

## Categorical Distribution ##

In [None]:
# Data source (Pew Research Center):
# https://www.pewresearch.org/internet/2020/07/28/parenting-children-in-the-age-of-screens/
# Survey prompt:
# "I am ____ confident in my ability to know how much screen time is appropriate for my child."

# One row per answer category, with the percent of respondents who chose it.
parent_responses = (
    Table()
    .with_column('How confident', make_array('Not too / not at all', 'Somewhat', 'Very'))
    .with_column('Percent', make_array(16, 45, 39))
)
parent_responses

In [None]:
# Sanity check: the percents of a distribution should add up to 100.
np.sum(parent_responses.column('Percent'))

In [None]:
# Horizontal bar chart with one bar per 'How confident' category.
parent_responses.barh('How confident')

## Categorical Distribution: Finding the Counts ##

In [None]:
# Load the movies data from a CSV file; the path is relative to the
# notebook's working directory.
top_movies = Table.read_table('data/top_movies_2017.csv')
top_movies

In [None]:
# Keep only the categorical 'Studio' column and preview the first 4 rows.
studios = top_movies.select('Studio')
studios.show(4)

In [None]:
# Group by studio; the resulting table has a 'count' column (used below)
# holding the number of rows for each distinct 'Studio' value.
studio_distribution = studios.group('Studio')
studio_distribution

In [None]:
# Sanity check: the group counts should add up to the number of movies.
sum(studio_distribution.column('count'))

In [None]:
# Bar chart of the studio counts, in the table's current row order.
studio_distribution.barh('Studio')

In [None]:
# Same chart, but with the bars ordered from largest count to smallest.
studios_by_count = studio_distribution.sort('count', descending=True)
studios_by_count.barh('Studio')

## Distribution of a Quantitative Variable ##

In [None]:
# Age of each movie, in years, measured from a fixed reference year.
# The year is pinned (not "today") so the results below are reproducible.
REFERENCE_YEAR = 2021
ages = REFERENCE_YEAR - top_movies.column('Year')

# Attach the ages to the table as a new quantitative column.
top_movies = top_movies.with_column('Age', ages)

In [None]:
# Display the table, which now includes the 'Age' column.
top_movies

## Finding the Counts: Binning ##

In [None]:
# Smallest and largest age: the bins chosen below need to cover this range.
ages.min(), ages.max()

In [None]:
# Bin edges 0, 20, 40, 60, 80, 100: five equally wide bins of 20 years each.
twenty_year_edges = np.arange(0, 101, 20)

# Count how many movies have an 'Age' falling in each bin.
equal_bins = top_movies.bin('Age', bins=twenty_year_edges)
equal_bins

In [None]:
# Sanity check: total number of movies that landed in some bin.
np.sum(equal_bins.column('Age count'))

In [None]:
# Every bin except the last is half-open: [a, b) includes a but excludes b.
# Verify by counting the movies with 40 <= Age < 60 directly.
top_movies.where('Age', are.between(40, 60)).num_rows

In [None]:
# Rows whose age is exactly 100, i.e. exactly on the right edge of the last bin.
top_movies.where('Age', are.equal_to(100))

In [None]:
# The last bin is NOT [80, 100): counting 80 <= Age < 100 leaves out
# any movie whose age is exactly 100.
top_movies.where('Age', are.between(80, 100)).num_rows

In [None]:
# Instead, it is [80, 100]: the last bin includes its right endpoint,
# so count 80 <= Age <= 100.
top_movies.where('Age', are.between_or_equal_to(80, 100)).num_rows

In [None]:
# Bin edges with unequal widths: 10, 10, 20, 20 and 40 years.
my_bins = make_array(0, 10, 20, 40, 60, 100)

In [None]:
# Count the movies in each of the unequal-width bins defined by my_bins.
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

In [None]:
# Sanity check: total number of movies that landed in some bin.
sum(binned_data.column('Age count'))

In [None]:
# The last bin is [60, 100]: both endpoints are included,
# so count 60 <= Age <= 100.
top_movies.where('Age', are.between_or_equal_to(60, 100)).num_rows

## Histogram: Percent = Area ##

In [None]:
# Not specifying any bins: the bin edges are left to the library's default.
# unit='Year' names the unit in which 'Age' is measured.
top_movies.hist('Age', unit='Year')

In [None]:
# Equally spaced bins of width 20
# (edges 0, 20, ..., 100 — the same edges used to build equal_bins).
top_movies.hist('Age', bins = np.arange(0, 101, 20), unit = 'Year')

In [None]:
# Percent in [20, 40) bin = height * width
# NOTE(review): 1.5 is presumably the bar height (percent per year) read off
# the histogram above — confirm against the plot.

1.5 * (40 - 20)

In [None]:
# Show the binned counts again, to compare with the area computed above.
equal_bins

In [None]:
# Cross-check the area computation against the binned counts.
total_count = sum(equal_bins.column('Age count'))
# NOTE(review): 60 is presumably the 'Age count' of the [20, 40) row of
# equal_bins — confirm against the table above.
# The result is a proportion (0 to 1); multiply by 100 to compare it with
# a percent.
60/total_count

## The Vertical Axis ##

In [None]:
# Recall the unequal-width bin edges defined earlier.
my_bins

In [None]:
# Recall the counts per bin for those unequal-width bins.
binned_data

In [None]:
# Histogram of 'Age' drawn with the unequal-width bins in my_bins.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

In [None]:
# Plotting the counts on the vertical axis is a bad idea.
# my_bins has unequal widths, so when bar height is a raw count the bar's
# area no longer represents the share of movies in the bin, and wide bins
# look disproportionately large.
# normed=False turns off the default normalization, so the vertical axis
# shows counts instead of percent per year.
top_movies.hist('Age', bins = my_bins, unit = 'Year', normed=False)

In [None]:
# For comparison: the same bins with the default vertical axis.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

In [None]:
# Add a column containing what percent of movies are in each bin.
# The denominator is computed from the binned counts rather than hard-coded,
# so the percents stay correct if the data set or the bins change.
age_counts = binned_data.column('Age count')
binned_data = binned_data.with_column(
    'Percent', 100 * age_counts / sum(age_counts))

In [None]:
# Display the binned table, which now includes the 'Percent' column.
binned_data

## Finding Height ##

### Height of the [40, 60) bar

In [None]:
# Height of [40, 60) bar = percent in the bin / width of the bin

bin_left, bin_right = 40, 60
# NOTE(review): 23 is presumably the 'Percent' entry of the [40, 60) row of
# binned_data — confirm against the table above.
percent_in_bin = 23
width_of_bin = bin_right - bin_left
height_of_bar = percent_in_bin / width_of_bin
height_of_bar

### Heights of all the bars

In [None]:
# Get the bin left ends: keep every row of binned_data except the final one.
# NOTE(review): the final row presumably holds only the right edge of the
# last bin — confirm against the binned_data display.
last_row = binned_data.num_rows - 1
bin_lefts = binned_data.take(np.arange(last_row))
bin_lefts

In [None]:
# Get the bin widths: each width is the gap between consecutive bin edges.
bin_edges = binned_data.column('bin')
bin_widths = bin_edges[1:] - bin_edges[:-1]

# Pair each bin's left end with its width.
bin_lefts_and_widths = bin_lefts.with_column('Width', bin_widths)
bin_lefts_and_widths

In [None]:
# Get the bin heights: height = percent in the bin / width of the bin,
# computed for all bins at once with array arithmetic.
percents = bin_lefts.column('Percent')
bin_heights = percents / bin_widths

all_histogram_data = bin_lefts_and_widths.with_column('Height', bin_heights)
all_histogram_data

In [None]:
# Draw the histogram once more, to compare its bar heights with the
# 'Height' column computed above.
top_movies.hist('Age', bins = my_bins, unit = 'Year')