In [1]:
import pandas as pd
import altair as alt

In [2]:
from vega_datasets import data

In [3]:
# Load the example cars dataset as a DataFrame and preview its first five rows.
cars = data.cars()
cars.head(n=5)

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA


### Fundamental Object in Altair: Chart
- Takes a pandas DataFrame as its single required argument

In [4]:
# A bare Chart only binds the data: no mark and no encodings are specified yet,
# so this is not enough information to draw anything (rendering it as-is is
# expected to fail).
chart = alt.Chart(data=cars)

In [5]:
# Choose a point mark for the plot. No encodings are given yet.
(
    alt.Chart(cars)
    .mark_point()
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


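All the circles overlap because no encoding channels have been specified yet, so Altair has nothing to distinguish one row from another and draws every point at the same position.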

In [6]:
# Map Miles_per_Gallon to the vertical position: each row becomes a circle
# placed at its mpg value along the y axis.
(
    alt.Chart(cars)
    .mark_point()
    .encode(alt.Y("Miles_per_Gallon"))
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [7]:
# Encoding both axes yields a scatter plot: Horsepower on x, Miles_per_Gallon
# on y, with one circle per row.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        alt.X("Horsepower"),
        alt.Y("Miles_per_Gallon"),
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [8]:
# Same chart with Cylinders on the x axis. Cylinders is conceptually ordered,
# categorical data, but no type is stated in the shorthand here, so Altair
# chooses the encoding type on its own.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        alt.X("Cylinders"),
        alt.Y("Miles_per_Gallon"),
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Different types of attributes:
- `':N'`: indicates a nominal type (unordered, categorical)
- `':O'`: indicates an ordinal type (ordered, categorical)
- `':Q'`: indicates a quantitative type (numerical)
- `':T'`: indicates a temporal type (dates and times)

In [9]:
# State the types explicitly: mpg is quantitative, Cylinders is nominal.
# With a nominal x axis the plot differs from the untyped version: the axis
# shows only the distinct cylinder values as categories.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x="Cylinders:N",
        y="Miles_per_Gallon:Q",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Data Transformation: Aggregation and Filter
You can plot summary statistics without calculating them beforehand, using aggregates such as:
- sum
- median
- average
- q1, q3
- min, max

You can also use a filter to exclude part of the data, or use calculate to add a new derived attribute to the plot.

In [10]:
# Let the chart compute the aggregate: one point per cylinder category,
# positioned at the average Miles_per_Gallon of that category.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x="Cylinders:N",
        y="average(Miles_per_Gallon):Q",
    )
)


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [11]:
# count() needs no field: it counts how many cars fall into each
# cylinder category.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x="Cylinders:N",
        y="count():Q",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [12]:
# Remove data points where the year is < 1975, i.e. keep cars from 1975 onward.
# transform_filter keeps the rows for which the expression is true, so the
# predicate is written as ">= 1975". The comparison must sit outside year(...):
# the previous form, year(datum.Year < 1975), took the year of a boolean,
# which is always a non-zero number, so no rows were ever filtered out.
alt.Chart(cars).mark_point().transform_filter("year(datum.Year) >= 1975").encode(
    y = alt.Y("average(Miles_per_Gallon):Q"),
    x = alt.X("Cylinders:N")
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [13]:
# The filter can equally be chained after encode(); the result is the same.
# Keep only cars from 1975 onward: transform_filter keeps rows where the
# expression is true, and the comparison must sit outside year(...).
# The previous form, year(datum.Year < 1975), took the year of a boolean,
# which is always a non-zero number, so no rows were ever filtered out.
alt.Chart(cars).mark_point().encode(
    y = alt.Y("average(Miles_per_Gallon):Q"),
    x = alt.X("Cylinders:N")
).transform_filter("year(datum.Year) >= 1975")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [14]:
# Derive a new field inside the chart spec: Horsepower10 is the Horsepower
# column multiplied by ten. Average mpg is then plotted against it.
(
    alt.Chart(cars)
    .mark_point()
    .transform_calculate(Horsepower10="datum.Horsepower * 10")
    .encode(
        x="Horsepower10:Q",
        y="average(Miles_per_Gallon):Q",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [15]:
# Derive Horsepowerlog, the log of Horsepower, as a calculated field and
# plot average mpg against the transformed values.
(
    alt.Chart(cars)
    .mark_point()
    .transform_calculate(Horsepowerlog="log(datum.Horsepower)")
    .encode(
        x="Horsepowerlog:Q",
        y="average(Miles_per_Gallon):Q",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [16]:
# Alternative to transforming the data: leave Horsepower untouched and put
# the x axis itself on a logarithmic scale.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x=alt.X("Horsepower:Q", scale=alt.Scale(type="log")),
        y="average(Miles_per_Gallon):Q",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [17]:
# Scatter plot with explicit axis titles, a chart title, and a fixed size.
# Axis titles are set per channel; the chart title and the width/height
# (in pixels) are set through properties().
alt.Chart(cars).mark_point().encode(
    x = alt.X("Horsepower:Q", title="Horsepower"),
    y = alt.Y("Miles_per_Gallon:Q", title="Miles Per Gallon")
).properties(
    title="Horsepower vs. Miles per Gallon",  # fixed typo: was "Horspower"
    width=150, height=150
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [18]:
# Load the iris dataset, preview a few rows, then show summary statistics.
# Only a cell's last expression is rendered automatically, so the preview is
# passed to display() explicitly; a bare `iris.head()` in the middle of the
# cell is evaluated and silently discarded.
iris = data.iris()
display(iris.head())
iris.describe()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [19]:
# Scatter plot of petalLength vs. petalWidth after standardizing both columns
# as (value - mean) / standard deviation. Normalizing helps when comparing
# features that have different scales.
# The mean and standard deviation are computed from the data instead of being
# copied by hand from the describe() output, so they cannot go stale or lose
# precision to rounding. pandas' std() uses the sample standard deviation,
# the same statistic describe() reports.
length_mean, length_std = iris["petalLength"].mean(), iris["petalLength"].std()
width_mean, width_std = iris["petalWidth"].mean(), iris["petalWidth"].std()

alt.Chart(iris).mark_point().transform_calculate(
    norm_petalLength = f"(datum.petalLength - {length_mean}) / {length_std}",
    norm_petalWidth = f"(datum.petalWidth - {width_mean}) / {width_std}"
).encode(
    x=alt.X("norm_petalLength:Q", title="Petal Length"),
    y=alt.Y("norm_petalWidth:Q", title="Petal Width")
).properties(title="Petal Length vs Petal Width (Normalized)")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [20]:
# Use additional channels (size, color, opacity, shape, order, tooltip) to
# represent further attributes on top of the x/y position, then split the
# chart into one panel per cylinder count.
(
    alt.Chart(cars)
    .mark_point()
    .encode(
        x=alt.X("Horsepower:Q", title="Horsepower"),
        y=alt.Y("Miles_per_Gallon:Q", title="Miles per Gallon"),
        size="Acceleration:Q",
        # Ordinal gives an ordered color ramp; Cylinders:Q would produce a
        # continuous color bar and Cylinders:N unrelated categorical colors.
        color="Cylinders:O",
        opacity=alt.OpacityValue(0.5),
        shape="Origin:N",
        order=alt.Order("Acceleration", sort="descending"),
        # Tooltips appear on hover; technically not an interactive function.
        tooltip=["Miles_per_Gallon", "Origin", "Acceleration"],
    )
    .properties(
        width=120,
        height=120,
        title="Horsepower vs. Miles per Gallon",
    )
    .facet(
        columns=3,
        facet="Cylinders:N",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [21]:
# mark_circle() draws each row as a solid (filled) point; here all in red.
(
    alt.Chart(cars)
    .mark_circle(color="red")
    .encode(
        x=alt.X("Horsepower:Q", title="Horsepower"),
        y=alt.Y("Miles_per_Gallon:Q", title="Miles per Gallon"),
    )
    .properties(
        width=300,
        height=300,
        title="Horsepower vs. Miles per Gallon",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [22]:
# Same scatter plot drawn with red square marks, with the mark's angle
# property set to 45.
(
    alt.Chart(cars)
    .mark_square(angle=45, color="red")
    .encode(
        x=alt.X("Horsepower:Q", title="Horsepower"),
        y=alt.Y("Miles_per_Gallon:Q", title="Miles per Gallon"),
    )
    .properties(
        width=300,
        height=300,
        title="Horsepower vs. Miles per Gallon",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [23]:
# Same scatter plot drawn with tick marks (short line segments) instead of
# points.
(
    alt.Chart(cars)
    .mark_tick()
    .encode(
        x=alt.X("Horsepower:Q", title="Horsepower"),
        y=alt.Y("Miles_per_Gallon:Q", title="Miles per Gallon"),
    )
    .properties(
        width=300,
        height=300,
        title="Horsepower vs. Miles per Gallon",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [24]:
# Line chart of average mpg per cylinder category. The line is red, 3 px
# wide, and drawn with "monotone" interpolation between the points.
(
    alt.Chart(cars)
    .mark_line(
        interpolate="monotone",
        strokeWidth=3,
        color="red",
    )
    .encode(
        x="Cylinders:N",
        y="average(Miles_per_Gallon)",
    )
    .properties(height=200, width=200)
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# Area chart of average mpg per cylinder category, with a color encoding
# on Origin. Mark-level settings: red color, 3 px stroke, "monotone"
# interpolation, and 50% opacity.
(
    alt.Chart(cars)
    .mark_area(
        opacity=0.5,
        interpolate="monotone",
        strokeWidth=3,
        color="red",
    )
    .encode(
        x="Cylinders:N",
        y="average(Miles_per_Gallon)",
        color="Origin:N",
    )
    .properties(height=200, width=200)
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [26]:
# Boxplot: the distribution of Miles_per_Gallon for each cylinder category.
(
    alt.Chart(cars)
    .mark_boxplot()
    .encode(
        x="Cylinders:N",
        y="Miles_per_Gallon",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [27]:
# Histogram: bin Miles_per_Gallon into at most 10 bins on the x axis and
# show the number of rows per bin as bar height.
(
    alt.Chart(cars)
    .mark_bar()
    .encode(
        x=alt.X("Miles_per_Gallon", bin=alt.BinParams(maxbins=10)),
        y="count()",
    )
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Combining multiple plots and saving the result

In [28]:
# Two views of the same data: average mpg per Year, once as circles and once
# as a line. Both share identical encodings, so they are built from a common
# mark-less base chart and differ only in the mark type.
yearly_mpg = alt.Chart(cars).encode(
    x="Year",
    y="average(Miles_per_Gallon)",
)

point = yearly_mpg.mark_circle()
line = yearly_mpg.mark_line()

In [29]:
# Layer the two charts on top of each other in a single set of axes
# (function form of `line + point`).
alt.layer(line, point)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [30]:
# Place the two charts next to each other, left to right
# (function form of `line | point`).
alt.hconcat(line, point)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [31]:
# Build the layered chart and write it to an HTML file in the working directory.
chart = alt.layer(line, point)
chart.save("chart.html")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [32]:
# Render the layered line + point chart inline as the cell's output.
chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
