In [None]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import HTML

%matplotlib inline

## Visualizations

Matplotlib:
http://matplotlib.org/gallery.html

Seaborn:
https://seaborn.pydata.org/examples/index.html

Plot.ly:
https://plot.ly/python/

Tufte:

http://www.edwardtufte.com/tufte/


Flat Colors:

http://flatuicolors.com/#

# Matplotlib

In [None]:
# Smallest possible example: a single line segment from (0, 0) to (1, 1),
# drawn on the implicit current axes, with an enlarged title.
plt.plot((0, 1), (0, 1))
plt.title(label="Here's a really basic plot", fontsize=24)

### What about two plots

In [None]:
# Two stacked strip plots comparing the shape of two random samples:
# top = uniformly distributed integers, bottom = normally distributed floats.
n_points = 1000

# Every point gets the same y-value so each sample collapses onto one
# horizontal strip; with a low alpha, denser regions appear darker.
strip_y = np.ones(n_points) * 2

fig = plt.figure(figsize=(10,6))

ax = fig.add_subplot(2,1,1)
# Random integers from 1 to 99: randint's upper bound (100) is exclusive.
# A single vectorized call replaces 1000 separate Python-level calls.
ax.scatter(np.random.randint(1, 100, size=n_points), strip_y, alpha=0.1, s=15)
ax.set_title("Uniform random integers")

ax = fig.add_subplot(2,1,2)
# Random sample from a normal distribution (default mean 0, std 1)
ax.scatter(np.random.normal(size=n_points), strip_y, alpha=0.1, s=15)
ax.set_title("Normally distributed values")

# Keep the two titled panels from overlapping, then render the figure without
# echoing the repr of the last matplotlib call as cell output.
fig.tight_layout()
plt.show()


## NFL Combine datasets

In [None]:
# Load the combine results, discard every row that has any missing value, and
# add the total height in inches computed from the feet and inches columns.
combine_data = (
    pd.read_csv("../data/combine.csv")
    .dropna()  # dropna() is crude - be careful with it
    .assign(
        HeightInchesTotal=lambda frame: frame['HeightFeet'] * 12 + frame['HeightInches']
    )
)
# Preview the first rows of the cleaned frame
combine_data.head()

# Histogram
* Examine the distribution of a single variable
* bins data into intervals

# KDE - Kernel Density estimate
* Seaborn
* probability density function (normalization)

In [None]:
# Side-by-side views of the same variable: a histogram (counts per bin) on the
# left and a kernel density estimate (smoothed density) on the right.

# Only strictly positive times are plotted; computed once and shared by both panels.
# NOTE(review): presumably non-positive values mark a missing time - confirm against the data.
forty_times = combine_data.FortyYD[(combine_data.FortyYD > 0)].values

fig = plt.figure(figsize=(20,10))

# Left panel: histogram
ax = fig.add_subplot(1,2,1)

ax.hist(forty_times)

ax.set_title("40 times - combine participants", size=32)
ax.set_xlabel("Forty Yard Dash Time (seconds)", size=24)
ax.set_ylabel("Count", size=24)
ax.tick_params(axis='both', which='major', labelsize=16)


# Right panel: KDE
ax = fig.add_subplot(1,2,2)

# Pass the target axes explicitly instead of relying on the implicit "current axes".
# NOTE(review): newer seaborn releases may expect `fill=True` in place of `shade=True` -
# confirm against the installed version.
sns.kdeplot(forty_times, shade=True, color="#3498db", ax=ax)

ax.set_title("40 times - combine participants", size=32)
ax.set_xlabel("Forty Yard Dash Time (seconds)", size=24)
ax.set_ylabel("Density", size=24)
ax.tick_params(axis='both', which='major', labelsize=16)


# Render the figure and suppress the repr of the last call. A bare `print` is
# only a name reference in Python 3, so it would print nothing.
plt.show()

# Class exercise - plot 40 times by position using a KDE plot

- positions to plot: WR, CB, DE, OT

## Boxplots

- used to look at multiple variable distributions
- indicates dispersion and outliers
- bottom of the box is the 25th percentile (Q1), top of the box is the 75th percentile (Q3); the whisker definition differs between boxplot implementations, but is usually 1.5 * IQR beyond the box
- IQR (interquartile range) = Q3 - Q1

In [None]:
# Build one array of values per position, in the order the boxes are drawn
positions = ['WR','CB','RB','DE','DT','OT']
weight_vector, speed_vector = [], []
for position in positions:
    in_position = combine_data.Position == position
    weight_vector.append(combine_data.Weight[in_position].values)
    # Apply the same `FortyYD > 0` filter as the histogram/KDE cell so both
    # views describe the same set of times.
    # NOTE(review): presumably non-positive values mark a missing time - confirm.
    speed_vector.append(combine_data.FortyYD[in_position & (combine_data.FortyYD > 0)].values)

fig = plt.figure(figsize=(18,15))
fig.subplots_adjust(hspace=0.5)
sns.set_style('whitegrid')

# add_subplot arguments are (number of rows, number of columns, plot index)
ax = fig.add_subplot(2,1,1)
ax.boxplot(weight_vector)
ax.set_title("Weight Distribution", size=32)
ax.set_ylabel("Weight", size=24)
ax.set_xticklabels(positions, size=20)
ax.tick_params(axis='y', which='major', labelsize=16)

ax = fig.add_subplot(2,1,2)
ax.boxplot(speed_vector)
ax.set_title("Speed Distribution", size=32)
ax.set_ylabel("Forty Yard Dash Time (seconds)", size=24)
ax.set_xticklabels(positions, size=20)
ax.tick_params(axis='y', which='major', labelsize=16)

# Render the figure and suppress the repr of the last call. A bare `print` is
# only a name reference in Python 3, so it would print nothing.
plt.show()

## Scatterplot

- explore the relation between two variables

In [None]:
# Height vs. weight scatterplot, one point series per position.

# Neither list is filled or read in this cell; the assignment is kept so these
# notebook-level names are still (re)initialised here exactly as before.
weight_vector, height_vector = [], []
position_vector = ['CB','WR','RB','DE','DT','OT']
# Optional: fixed flat-UI colours per position (uncomment together with the
# coloured scatter call inside the loop).
#color_vector = ['#e74c3c','#e67e22','#f1c40f','#2ecc71','#3498db','#9b59b6']
#position_color = {x:y for x, y in zip(position_vector, color_vector)}

fig = plt.figure(figsize=(20,12))
sns.set_style('whitegrid')
ax = fig.add_subplot(1,1,1)

for position in position_vector:
    # Boolean mask selecting the rows for this position
    position_subset = combine_data['Position'] == position
    x_vector = combine_data.HeightInchesTotal[position_subset].values
    # Optional: add horizontal jitter so points at the same whole-inch height spread out
    #x_vector = [x + random.random() for x in x_vector]

    y_vector = combine_data.Weight[position_subset].values
    #ax.scatter(x_vector, y_vector, color=position_color[position], label=position)

    # Label each series so the legend is tied to the series itself rather than
    # to the order in which the artists happened to be drawn.
    ax.scatter(x_vector, y_vector, label=position)


ax.set_title("Height Weight of NFL Combine Participants by Position", size=28)
ax.set_ylabel("Weight", size=24)
ax.set_xlabel("Height (Inches)", size=24)
# loc=2 places the legend in the upper-left corner
ax.legend(fontsize=20, loc=2)

ax.tick_params(axis='both', which='major', labelsize=16)


# Render the figure and suppress the repr of the last call. A bare `print` is
# only a name reference in Python 3, so it would print nothing.
plt.show()