In [None]:
import numpy as np # matrix
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from shapely.geometry import Point, Polygon, MultiPolygon
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
import os

import warnings
# Silence every warning so cell output stays uncluttered. Note that this also
# hides deprecation notices from the plotting/geo libraries.
warnings.filterwarnings('ignore')

# Keep wide DataFrames on a single row of output instead of wrapping their
# columns onto continuation "pages".
pd.options.display.expand_frame_repr = False
# Show floating-point values with 2 digits after the decimal point.
pd.options.display.precision = 2

### Basic Plots
- line plot
- histogram
- bar plot
- correlation heat map
- scatter plot

## Line Plot

In [None]:
# Two "temperature" series over ten time steps: a ramp with Gaussian noise
# (blue) and a clean ramp shifted up by 2 (red). Only y-values are passed to
# plot(), so the x-axis is the sample index.
xs = np.arange(0, 10)
noise = np.random.normal(0, 0.2, size=xs.shape)

fig, ax = plt.subplots()
ax.plot(xs + noise, color='blue', label='Blue temp')
ax.plot(xs + 2, color='red', label='red temp')
ax.set_xlabel('Time')
ax.set_ylabel('Temperature')
ax.set_title('Temperature over time')
ax.legend(loc='upper left')
plt.show()

## Histograms

Remember to always tweak the `bins` parameter.

In [None]:
# Histogram of 1000 draws from a standard normal, with red vertical lines
# marking one standard deviation either side of the sample mean.
xs = np.random.normal(0, 1, size=1000)
avg = np.mean(xs)
std = np.std(xs)
plt.hist(xs, bins=40)
# Place the markers from the measured statistics rather than hard-coding +/-1,
# so they stay correct if the distribution parameters above are changed.
plt.axvline(x=avg + std, color='red')
plt.axvline(x=avg - std, color='red')
plt.show()

### Percentiles
Sometimes you just want some quick summary statistics to get a handle on your data.

In [None]:
# Print selected percentiles of `xs` (the samples drawn in the histogram cell).
ps = [10, 25, 50, 75, 90]
percentile_values = np.percentile(xs, ps)  # one value per entry in `ps`
for p, v in zip(ps, percentile_values):
    print(f'{p}th percentile: {v:.2f}')

## Bar Charts
- label tick marks

In [None]:
# One random bar per weekday. Bars are drawn at integer positions and the
# same positions are then re-labelled with the day abbreviations.
labels = ['M', 'T', 'W', 'Th', 'F', 'Sa', 'Su']
xs = np.random.random(size=7)
positions = np.arange(len(xs))
plt.bar(positions, xs)
plt.xticks(positions, labels)
plt.show()

## Correlation Heat Map

Given a set of features, you might be interested in taking a quick visual glance at which features are highly correlated. 
- construct the correlation matrix
- call `plt.imshow` on this correlation matrix
- use a `cmap` for the color gradient

In [None]:
# Render a 5x5 matrix of standard-normal draws as an image, shading each cell
# by its value with the "Reds" colour map. `reds` is kept as a notebook-level
# name because later cells reuse it.
reds = plt.get_cmap("Reds")
mat = np.random.normal(size=(5, 5))
fig, ax = plt.subplots()
ax.imshow(mat, cmap=reds)
plt.show()

In [None]:
# Correlation heat map for 5 random features (columns) over 100 samples (rows).
xs = np.random.random(size=(100, 5))
# rowvar=False tells corrcoef that each COLUMN is a variable, so a single call
# returns the full 5x5 correlation matrix instead of 25 pairwise calls.
corr_mat = np.corrcoef(xs, rowvar=False)
plt.imshow(corr_mat, cmap=plt.get_cmap("Reds"))
# Explicit show() so the cell displays the figure rather than the AxesImage repr.
plt.show()

## Scatter
- plot two things against each other
- useful for eyeballing trends (i.e., is there a linear trend between the two sets of data?)
- `alpha` controls opacity (1 = fully opaque, 0 = fully transparent; values in between are translucent)

In [None]:
# Draw the same 10,000-point cloud twice, once nearly transparent and once
# fully opaque, to show how a low alpha reveals density in an over-plotted
# scatter.
X = np.random.normal(scale=1, size=10000)
Y = np.random.normal(0, scale=0.5, size=X.shape)
for a in [0.04, 1]:
    # X is deliberately not re-sampled inside the loop: both figures must show
    # identical data so that alpha is the only thing that differs between them.
    plt.scatter(X, Y, alpha=a)
    plt.title(f"alpha = {a}")
    plt.show()

In [None]:
# Scatter of 1000 standard-normal points, each shaded by its squared distance
# from the origin. `reds` is the colour map created in the earlier heat-map cell.
X = np.random.normal(size=1000)
Y = np.random.normal(size=1000)
colors = X ** 2 + Y ** 2
fig, ax = plt.subplots()
ax.scatter(X, Y, c=colors, cmap=reds)
plt.show()

### Plotting Maps
- load the SJ tracts file
- convert it to the lat/long CRS (EPSG:4326)

In [None]:
# Read the census-tract shapefile and reproject it to EPSG:4326 (lat/long)
# in one chained expression, then preview the first row.
sj_tracts = (
    gpd.read_file('/kaggle/input/census-shapes/Census_Tract.shp')
    .to_crs('EPSG:4326')
)
sj_tracts.head(1)

Plot San Jose tracts color coded by population
- normalize population by the max tract population
- the `color` parameter must be a string or a list of the RGBA values
- we can get the RGBA values by using the color map that we got from `plt.get_cmap("Reds")`
    - the `reds` cmap is basically a function that takes in a scalar and returns the appropriate RGBA tuple (tuple of intensity of red/green/blue/alpha) that encodes a color

In [None]:
# Shade each tract by its population relative to the most populous tract.
reds = plt.get_cmap("Reds")
population = sj_tracts['POPULATION']
max_pop = population.max()
# Dividing by the maximum gives each tract a value that the colour map then
# turns into an RGBA tuple, one tuple per tract, in row order.
colors = [reds(share) for share in population / max_pop]
sj_tracts.plot(color=colors)
plt.show()