In [None]:
# functools is part of the Python standard library, so there is nothing to
# install (the previous shell command also ended in an unbalanced quote).

In [None]:
!pip install pandas_datareader

In [None]:
pip install altair vega notebook vega_datasets

In [None]:
from functools import reduce
import re 
import pandas as pd
import numpy as np 
import seaborn as sns
import requests 
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import datetime
from pandas_datareader import data
import matplotlib.pyplot as plt 
import json
from IPython.display import display, HTML  # public location; IPython.core.display is deprecated for these names
import altair as alt

In [None]:
# Load the two datasets used below.
# NOTE(review): `data` rebinds the name imported earlier via
# `from pandas_datareader import data`.
movies = pd.read_csv('data/movies.csv')
data = pd.read_csv('data/gapminder.tsv', delimiter='\t')

In [None]:
#30. Introduction to Altair 

In [None]:
# Toy precipitation table: one row per (city, month) pair.
df = pd.DataFrame(
    [
        ('Seattle', 'Apr', 2.68),
        ('Seattle', 'Aug', 0.87),
        ('Seattle', 'Dec', 5.31),
        ('New York', 'Apr', 3.94),
        ('New York', 'Aug', 4.13),
        ('New York', 'Dec', 3.58),
        ('Chicago', 'Apr', 3.62),
        ('Chicago', 'Aug', 3.98),
        ('Chicago', 'Dec', 2.56),
    ],
    columns=['city', 'month', 'precip'],
)

df

In [None]:
# the chart object

# A bare Chart only wraps the DataFrame; no mark or encoding is attached yet.
chart = alt.Chart(data=df)
chart

In [None]:
# marks and encoding

# Attach a rect mark to the bare chart; no encoding channels are set yet.
ch2 = chart.mark_rect()
ch2

In [None]:
# One point per row, positioned vertically by city.
alt.Chart(df).mark_point().encode(alt.Y('city'))

In [None]:
# City on the x axis, precipitation on the y axis.
alt.Chart(df).mark_point().encode(
    alt.X('city'),
    alt.Y('precip'),
)

In [None]:
# Same fields with the axes swapped: precipitation on x, city on y.
(
    alt.Chart(df)
    .mark_point()
    .encode(x='precip', y='city')
)

In [None]:
# data aggregation

# Let the chart compute the average precipitation for each city.
(
    alt.Chart(df)
    .mark_point()
    .encode(x='average(precip)', y='city')
)

In [None]:
# Average precipitation per city, computed in pandas instead of in the chart.
# Only the numeric 'precip' column is averaged: the frame also holds the text
# column 'month', and pandas 2.x raises a TypeError when mean() is applied to it.
df2 = df.groupby('city')['precip'].mean().reset_index()
alt.Chart(df2).mark_point().encode(
    alt.X('precip'),
    alt.Y('city'))

In [None]:
# changing the mark type

# Horizontal bars: average precipitation (x) for each city (y).
(
    alt.Chart(df)
    .mark_bar()
    .encode(x='average(precip)', y='city')
)

In [None]:
# Vertical bars: city on x, average precipitation on y.
(
    alt.Chart(df)
    .mark_bar()
    .encode(x='city', y='average(precip)')
)

In [None]:
# combining charts

# Line chart of average precipitation per month, colored by city.
chart1 = (
    alt.Chart(df)
    .mark_line()
    .encode(x='month', y='average(precip)', color='city')
)
chart1

In [None]:
# Same encoding as the line chart, drawn with point marks.
chart2 = (
    alt.Chart(df)
    .mark_point()
    .encode(x='month', y='average(precip)', color='city')
)
chart2

In [None]:
# Place the two charts side by side (equivalent to the `|` operator).
chart3 = alt.hconcat(chart1, chart2)
chart3

In [None]:
# Stack the two charts vertically (equivalent to the `&` operator).
chart4 = alt.vconcat(chart1, chart2)
chart4

In [None]:
# Overlay the points on the lines in one view (equivalent to the `+` operator).
chart5 = alt.layer(chart1, chart2)
chart5

In [None]:
# Layered chart on the left, vertically stacked pair on the right.
alt.hconcat(chart5, chart4)

In [None]:
# Display an interactive version of the layered chart.
chart5.interactive()

In [None]:
# Write the layered chart to 'chart.html' in the current working directory.
chart5.save('chart.html')

In [None]:
# 31. Data Types, Graphical Marks, and Visual Encoding Channels

In [None]:
# Reload the tab-separated gapminder table and preview its first five rows.
# NOTE(review): `data` rebinds the name imported earlier via
# `from pandas_datareader import data`.
data = pd.read_csv('data/gapminder.tsv', delimiter='\t')
data.head(n=5)

In [None]:
# data types 
# nominal -> consists of category names; can compare the equality of values; when visualizing we should readily be able to see if values are the same or different, e.g. by position
# ordinal -> consists of values that have a specific ordering; can compare the rank ordering of values; when visualizing we should be able to perceive a sense of rank order 
# quantitative -> able to measure numerical differences among values; 
#   for interval data, we can measure the distance between points 
#   for ratio data, the zero point is meaningful and so we can also measure proportions or scale values
#   can be visualized using position, size, or color values 
#   axes with zero baselines are essential for visualizing ratio data
# temporal -> measures time points or intervals

In [None]:
# Show the pandas dtype of each column in the gapminder frame.
data.dtypes

In [None]:
# Life expectancy on x, declared as quantitative (:Q).
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q')
)

In [None]:
# Same field, now declared as nominal (:N).
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:N')
)

In [None]:
# Same field, now declared as ordinal (:O).
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:O')
)

In [None]:
# encoding channels: 
# x -> horizontal position of the mark 
# y -> vertical position of the mark 
# size -> size of the mark; may correspond to area or length 
# color -> mark color; specified as a legal CSS color
# opacity -> mark opacity ranging from 0 (fully transparent) to 1 (fully opaque)
# shape -> plotting symbol shape for point marks 
# tooltip -> specifies that text should be displayed upon the mouse hovering over the mark 
# order -> mark ordering; determines line/area point order and drawing order 
# column -> facet the data into horizontally aligned subplots 
# row -> facet the data into vertically aligned subplots 


In [None]:
# x channel only: quantitative life expectancy.
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q')
)

In [None]:
# Add a y channel: continent, declared as ordinal.
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q', y='continent:O')
)

In [None]:
# Two quantitative axes: life expectancy against population.
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q', y='pop:Q')
)

In [None]:
# Same scatter, with zero=False set on both axis scales.
(
    alt.Chart(data)
    .mark_point()
    .encode(
        x=alt.X('lifeExp:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('pop:Q', scale=alt.Scale(zero=False)),
    )
)

In [None]:
# GDP per capita vs. life expectancy, with point size driven by population.
(
    alt.Chart(data)
    .mark_point()
    .encode(x='gdpPercap:Q', y='lifeExp:Q', size='pop:Q')
)

In [None]:
# Same chart with the size scale range set to [0, 1000].
(
    alt.Chart(data)
    .mark_point()
    .encode(
        x='gdpPercap:Q',
        y='lifeExp:Q',
        size=alt.Size('pop:Q', scale=alt.Scale(range=[0, 1000])),
    )
)

In [None]:
# Color the points by continent (nominal).
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q', y='pop:Q', color='continent:N')
)

In [None]:
# Same encoding, drawn with filled point marks.
(
    alt.Chart(data)
    .mark_point(filled=True)
    .encode(x='lifeExp:Q', y='pop:Q', color='continent:N')
)

In [None]:
# Filled points at 50% opacity.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=0.5)
    .encode(x='lifeExp:Q', y='pop:Q', color='continent:N')
)

In [None]:
# Distinguish continents by point shape instead of color.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=0.5)
    .encode(x='lifeExp:Q', y='pop:Q', shape='continent:N')
)

In [None]:
# Color by continent and add the country field as a tooltip.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=0.5)
    .encode(
        x='lifeExp:Q',
        y='pop:Q',
        color='continent:N',
        tooltip='country',
    )
)

In [None]:
# Bubble chart with an order channel: population, sorted descending.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=1)
    .encode(
        x='gdpPercap:Q',
        y='lifeExp:Q',
        size=alt.Size('pop:Q', scale=alt.Scale(range=[0, 1000])),
        color='continent:N',
        tooltip='country:N',
        order=alt.Order('pop:Q', sort='descending'),
    )
)

In [None]:
# Same bubble chart with a two-field tooltip (country and year).
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=1)
    .encode(
        x='gdpPercap:Q',
        y='lifeExp:Q',
        size=alt.Size('pop:Q', scale=alt.Scale(range=[0, 1000])),
        color='continent:N',
        order=alt.Order('pop:Q', sort='descending'),
        tooltip=alt.Tooltip(['country:N', 'year:O']),
    )
)

In [None]:
# Facet the bubble chart into columns by year.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=1)
    .encode(
        x='gdpPercap:Q',
        y='lifeExp:Q',
        size=alt.Size('pop:Q', scale=alt.Scale(range=[0, 1000])),
        color='continent:N',
        order=alt.Order('pop:Q', sort='descending'),
        column='year',
    )
)

In [None]:
# Same faceted chart with width and height properties of 135 each.
(
    alt.Chart(data)
    .mark_point(filled=True, opacity=1)
    .encode(
        x='gdpPercap:Q',
        y='lifeExp:Q',
        size=alt.Size('pop:Q', scale=alt.Scale(range=[0, 1000])),
        color='continent:N',
        order=alt.Order('pop:Q', sort='descending'),
        column='year',
    )
    .properties(width=135, height=135)
)

In [None]:
# graphical marks 
# mark_area() -> filled areas defined by a top line and a baseline
# mark_bar() -> rectangular bars 
# mark_circle() -> scatter plot points as filled circles
# mark_line() -> connected line segments 
# mark_point() -> scatter plot points with configurable shapes 
# mark_rect() -> filled rectangles; useful for heatmaps 
# mark_rule() -> vertical or horizontal lines spanning the axis 
# mark_square() -> scatter plot points as filled squares 
# mark_text() -> scatter plot points represented by text 
# mark_tick() -> vertical or horizontal tick marks 

In [None]:
# Point marks: life expectancy per continent, with shape also keyed to continent.
(
    alt.Chart(data)
    .mark_point()
    .encode(x='lifeExp:Q', y='continent:N', shape='continent:N')
)

In [None]:
# Same chart with filled points of size 100.
(
    alt.Chart(data)
    .mark_point(filled=True, size=100)
    .encode(x='lifeExp:Q', y='continent:N', shape='continent:N')
)

In [None]:
# Same encoding drawn with circle marks.
(
    alt.Chart(data)
    .mark_circle()
    .encode(x='lifeExp:Q', y='continent:N', shape='continent:N')
)

In [None]:
# Same encoding drawn with square marks.
(
    alt.Chart(data)
    .mark_square()
    .encode(x='lifeExp:Q', y='continent:N', shape='continent:N')
)

In [None]:
# Same encoding drawn with tick marks.
(
    alt.Chart(data)
    .mark_tick()
    .encode(x='lifeExp:Q', y='continent:N', shape='continent:N')
)

In [None]:
# Bar marks of un-aggregated life expectancy for each continent.
(
    alt.Chart(data)
    .mark_bar()
    .encode(x='lifeExp:Q', y='continent:N')
)

In [None]:
# Bars of the average life expectancy for each continent.
(
    alt.Chart(data)
    .mark_bar()
    .encode(x='average(lifeExp)', y='continent:N')
)

In [None]:
# Life-expectancy bars per continent, colored by country.
(
    alt.Chart(data)
    .mark_bar()
    .encode(x='lifeExp:Q', y='continent:N', color='country')
)

In [None]:
# Life-expectancy bars per year (ordinal), colored by continent.
(
    alt.Chart(data)
    .mark_bar()
    .encode(x='year:O', y='lifeExp:Q', color='continent:N')
)

In [None]:
# Range bars: each bar spans min to max life expectancy for its continent.
(
    alt.Chart(data)
    .mark_bar()
    .encode(
        x='min(lifeExp):Q',
        x2='max(lifeExp):Q',
        y='continent:N',
    )
)

In [None]:
# Average life expectancy per year, one line per continent, legend disabled.
(
    alt.Chart(data)
    .mark_line()
    .encode(
        x='year:O',
        y='average(lifeExp):Q',
        color=alt.Color('continent:N', legend=None),
    )
    .properties(width=400)
)

In [None]:
# One semi-transparent, monotone-interpolated line per country, legend disabled.
(
    alt.Chart(data)
    .mark_line(strokeWidth=3, opacity=0.5, interpolate='monotone')
    .encode(
        x='year:O',
        y='lifeExp:Q',
        color=alt.Color('country:N', legend=None),
    )
    .properties(width=400)
)

In [None]:
# Keep only the United States rows, then draw GDP per capita by year as an area.
dataUS = data.loc[data['country'].eq('United States')]

(
    alt.Chart(dataUS)
    .mark_area()
    .encode(x='year:O', y='gdpPercap:Q')
)

In [None]:
# Keep the rows for the United States, Canada and Mexico, then draw
# GDP per capita by year as an area chart colored by country.
dataNA = data.loc[data['country'].isin(['United States', 'Canada', 'Mexico'])]

(
    alt.Chart(dataNA)
    .mark_area()
    .encode(x='year:O', y='gdpPercap:Q', color='country:N')
)

In [None]:
# Same area chart with stack='center' on the y channel.
(
    alt.Chart(dataNA)
    .mark_area()
    .encode(
        x='year:O',
        y=alt.Y('gdpPercap:Q', stack='center'),
        color='country:N',
    )
)

In [None]:
# Stacking disabled (stack=None); areas drawn at 50% opacity.
(
    alt.Chart(dataNA)
    .mark_area(opacity=0.5)
    .encode(
        x='year:O',
        y=alt.Y('gdpPercap:Q', stack=None),
        color='country:N',
    )
)

In [None]:
# Band between the minimum and maximum GDP per capita for each year.
(
    alt.Chart(dataNA)
    .mark_area()
    .encode(
        x='year:O',
        y='min(gdpPercap):Q',
        y2='max(gdpPercap):Q',
    )
)

In [None]:
# 32. Data Transformation 

In [None]:
# Reload the movies table and report its (rows, columns) shape.
movies = pd.read_csv(filepath_or_buffer='data/movies.csv')
movies.shape

In [None]:
# Preview the first five rows of the movies table.
movies.head(n=5)

In [None]:
# Scatter of Rotten Tomatoes rating (x) against IMDB rating (y).
(
    alt.Chart(movies)
    .mark_circle()
    .encode(x='Rotten_Tomatoes_Rating:Q', y='IMDB_Rating:Q')
)

In [None]:
# Cut the Rotten Tomatoes ratings into 20 intervals and label each row with
# the midpoint of its interval. This adds a 'binned_rating' column to `movies`.
rating_intervals = pd.cut(movies['Rotten_Tomatoes_Rating'], bins=20)
movies['binned_rating'] = rating_intervals.apply(lambda interval: interval.mid)
movies.dropna().head()

In [None]:
# Count the non-null titles in each rating bin.
# observed=False is spelled out because 'binned_rating' comes from pd.cut and
# is expected to be categorical: this keeps empty bins in the result and
# avoids the pandas FutureWarning about the changing default.
# NOTE(review): this rebinds `df`, which earlier held the precipitation table.
df = (
    movies
    .groupby('binned_rating', observed=False)[['Title']]
    .count()
    .reset_index()
)
df.head()

In [None]:
# Bar chart of the pandas-computed counts: bin midpoint on x, title count on y.
(
    alt.Chart(df)
    .mark_bar()
    .encode(x='binned_rating:Q', y='Title:Q')
)

In [None]:
# Let the chart bin the Rotten Tomatoes rating (bin=True) on the x axis.
(
    alt.Chart(movies)
    .mark_circle()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=True),
        y='IMDB_Rating:Q',
    )
)

In [None]:
# Histogram: binned Rotten Tomatoes rating on x, record count on y.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=True),
        y='count()',
    )
)

In [None]:
# Same histogram with the bin parameters set to maxbins=20.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=alt.BinParams(maxbins=20)),
        y='count()',
    )
)

In [None]:
# Histogram of IMDB ratings with maxbins=20.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x=alt.X('IMDB_Rating:Q', bin=alt.BinParams(maxbins=20)),
        y='count()',
    )
)

In [None]:
# Bin both rating fields (maxbins=20 each) and draw circle marks.
(
    alt.Chart(movies)
    .mark_circle()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=alt.BinParams(maxbins=20)),
        y=alt.Y('IMDB_Rating:Q', bin=alt.BinParams(maxbins=20)),
    )
)

In [None]:
# Binned scatter with circle size driven by the record count.
(
    alt.Chart(movies)
    .mark_circle()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=alt.BinParams(maxbins=20)),
        y=alt.Y('IMDB_Rating:Q', bin=alt.BinParams(maxbins=20)),
        size='count()',
    )
)

In [None]:
# Both ratings binned, drawn with bar marks colored by the record count.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x=alt.X('Rotten_Tomatoes_Rating:Q', bin=alt.BinParams(maxbins=20)),
        y=alt.Y('IMDB_Rating:Q', bin=alt.BinParams(maxbins=20)),
        color='count()',
    )
)

In [None]:
# Average Rotten Tomatoes rating for each major genre.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x='average(Rotten_Tomatoes_Rating):Q',
        y='Major_Genre:N',
    )
)

In [None]:
# Same bars, with genres sorted by their average rating in descending order.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x='average(Rotten_Tomatoes_Rating):Q',
        y=alt.Y(
            'Major_Genre:N',
            sort=alt.EncodingSortField(
                field='Rotten_Tomatoes_Rating',
                op='average',
                order='descending',
            ),
        ),
    )
)

In [None]:
# Median rating per genre, with genres sorted by that median in descending order.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x='median(Rotten_Tomatoes_Rating):Q',
        y=alt.Y(
            'Major_Genre:N',
            sort=alt.EncodingSortField(
                field='Rotten_Tomatoes_Rating',
                op='median',
                order='descending',
            ),
        ),
    )
)

In [None]:
# Range bars from the q1 to the q3 aggregate of the rating per genre,
# with genres sorted by median rating in descending order.
(
    alt.Chart(movies)
    .mark_bar()
    .encode(
        x='q1(Rotten_Tomatoes_Rating):Q',
        x2='q3(Rotten_Tomatoes_Rating):Q',
        y=alt.Y(
            'Major_Genre:N',
            sort=alt.EncodingSortField(
                field='Rotten_Tomatoes_Rating',
                op='median',
                order='descending',
            ),
        ),
    )
)

In [None]:
# Median US gross by release month, drawn as an area.
(
    alt.Chart(movies)
    .mark_area()
    .encode(
        x='month(Release_Date):T',
        y='median(US_Gross):Q',
    )
)

In [None]:
# Median worldwide gross by release month, drawn as an area.
(
    alt.Chart(movies)
    .mark_area()
    .encode(
        x='month(Release_Date):T',
        y='median(Worldwide_Gross):Q',
    )
)

In [None]:
# 36. Tokenization