# Clustering: part deux

In [1]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
import warnings
from matplotlib.animation import FuncAnimation
from IPython.display import YouTubeVideo
# Notebook-wide matplotlib defaults: 16-point text and the 'none' HTML representation for animations
plt.rcParams.update({"font.size": 16, "animation.html": 'none'})
# Suppress every warning for the rest of the session
warnings.filterwarnings(action='ignore')

In [2]:
# Embed the YouTube video with id 'vAjgSTv9MsY' as this cell's output
YouTubeVideo('vAjgSTv9MsY')

![bhot](https://upload.wikimedia.org/wikipedia/en/a/a3/BriefHistoryTime.jpg)

## Student clustering assignments

## 1) Football

![football-players](images/football-players.jpg)

In [3]:
# Read the roster and tidy names, positions and salaries in one pass
df = (
    pd.read_csv('data/football-players.csv')
    # Restrict to the columns used in this notebook
    .loc[:, ['No.','Player','Pos','Age','Ht','Wt','Cap Hit']]
    # Names: keep the text before the first backslash, then strip the '*+' and '*' markers
    .assign(Player=lambda roster: roster['Player'].map(
        lambda raw: raw.split('\\')[0].replace('*+','').replace('*','')))
    # Index the table by player name
    .set_index('Player')
    # Discard rows lacking a position or a height
    .dropna(subset=['Pos','Ht'])
    # Upper-case the positions; drop the salary's first character and parse the rest as an integer
    .assign(**{
        'Pos': lambda roster: roster['Pos'].map(str.upper),
        'Cap Hit': lambda roster: roster['Cap Hit'].map(lambda cap: int(cap[1:])),
    })
)
def height_to_inches(height):
    """Convert a 'feet-inches' string such as '6-2' into total inches (74)."""
    feet, inches = (int(part) for part in height.split('-'))
    return 12*feet + inches
# Offensive positions retained for the analysis
all_positions = ['QB','RB','WR','TE','C','LG','RG','LT','RT']
df = (
    # Heights go from 'feet-inches' strings to total inches
    df.assign(Ht=df['Ht'].map(height_to_inches))
    # Keep only rows whose height is strictly positive
    .loc[lambda roster: roster['Ht'] > 0]
    # Jersey numbers become integers
    .astype({'No.': int})
    # Friendlier column names
    .rename(columns={'Ht':'Height','Wt':'Weight','Cap Hit':'Salary','No.':'Number'})
    # Offensive players only
    .loc[lambda roster: roster['Pos'].isin(all_positions)]
)

In [4]:
# Preview the first few rows of the cleaned table of offensive players
df.head()

Unnamed: 0_level_0,Number,Pos,Age,Height,Weight,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Evan Boehm,70,RG,24,74,310,540000
Alex Boone,75,LG,30,79,330,1400000
Jaron Brown,13,WR,27,74,205,775000
Larry Fitzgerald,11,WR,34,75,225,11000000
Blaine Gabbert,7,QB,28,77,235,900000


In [5]:
def plot_football(df,positions,color=None,labels=None,ax=None):
    """Scatter Height against Weight for the players at the given positions.

    Parameters
    ----------
    df : DataFrame
        Must provide 'Pos', 'Weight' and 'Height' columns.
    positions : list-like
        Position codes to keep; rows whose 'Pos' is not among them are dropped.
    color : optional
        One color applied to every point. When given, `labels` is ignored.
    labels : optional
        Per-point values forwarded as `c` and drawn with the 'rainbow'
        colormap (no colorbar). A 1-D sequence with one entry per row of
        `df` (e.g. `kmeans.labels_`) is narrowed to the selected rows, so it
        keeps working when `positions` selects only part of `df`.
    ax : optional
        Axes to draw on, forwarded to pandas so successive calls can overlay.

    Returns
    -------
    Whatever `DataFrame.plot.scatter` returns for the filtered frame.
    """
    selected = df['Pos'].isin(positions)
    df_pos = df[selected]
    if color is not None:
        return df_pos.plot.scatter(x='Weight',y='Height',c=color,s=25,ax=ax)
    if labels is not None and not isinstance(labels,str) and len(df_pos) != len(df):
        label_values = np.asarray(labels)
        # Labels computed on the whole frame must go through the same row filter as
        # the points; otherwise their length no longer matches the plotted rows.
        if label_values.ndim == 1 and len(label_values) == len(df):
            labels = label_values[np.asarray(selected)]
    return df_pos.plot.scatter(x='Weight',y='Height',c=labels,s=25,cmap='rainbow',colorbar=None,ax=ax)

In [6]:
# Build a 4-cluster k-means model (fixed seed) and fit it on height and weight in one step
kmeans = KMeans(n_clusters=4,random_state=0).fit(df[['Height','Weight']])

In [7]:
# Scatter every offensive player, colored by the k-means cluster label (cast to float before plotting)
plot_football(df,all_positions,labels=kmeans.labels_.astype('float'));

<IPython.core.display.Javascript object>

In [8]:
# Overlay one scatter per hand-picked position group, each in its own fixed color
ax = None
for positions,color in [(('QB','WR'),'red'),(('RB',),'cyan'),(('TE',),'blueviolet'),(('C','LG','RG','LT','RT'),'khaki')]:
    # The first call passes ax=None; every later call reuses the axes returned by the previous one
    ax = plot_football(df,positions,color=color,ax=ax)

<IPython.core.display.Javascript object>

In [9]:
# Refit the existing k-means model with jersey number added as a third feature
kmeans.fit(df[['Height','Weight','Number']]);

In [10]:
# Same Weight/Height scatter, now colored by the labels of the most recent k-means fit
plot_football(df,all_positions,labels=kmeans.labels_.astype('float'));

<IPython.core.display.Javascript object>

## Student clustering assignments
## 2) The labor force

![hours-worked](images/hours-worked.jpg)

In [11]:
# Source: https://www.bls.gov/tus/special.requests/atussum_2016.zip
df = (
    pd.read_csv('data/atussum_2016.dat')
    # Friendlier names for the two columns of interest
    .rename(columns={'TEAGE':'Age','TEHRUSLT':'Hours Worked'})
    # Keep only rows with a strictly positive number of hours worked
    .loc[lambda survey: survey['Hours Worked'] > 0]
)
# Report how many people remain
print("This has data on %d people" % len(df))
# Preview the two columns of interest for the first few rows
df[['Age','Hours Worked']].head()

This has data on 5964 people


Unnamed: 0,Age,Hours Worked
3,31,32
4,59,12
9,39,46
10,35,55
11,54,38


In [12]:
# Scatter of hours worked against age, using very small markers (s=1)
df.plot.scatter(x='Age',y='Hours Worked',s=1);

<IPython.core.display.Javascript object>

In [13]:
# Seed the model, as the other KMeans models in this notebook do, so that the
# cluster ids (and therefore the colors in the next plot) are the same on every run
kmeans = KMeans(n_clusters=3,random_state=0)
kmeans.fit(df[['Age','Hours Worked']]);

In [14]:
# Same scatter, with each point colored by its k-means cluster label ('rainbow' colormap, no colorbar)
df.plot.scatter(x='Age',y='Hours Worked',s=1,c=kmeans.labels_,cmap='rainbow',colorbar=None);

<IPython.core.display.Javascript object>

## Which variables should you cluster on?

![evolving-clusters1](images/evolving_clusters1.jpg)

![evolving-clusters2](images/evolving_clusters2.jpg)

## Consider not using a pivotal variable like time as a clustering feature.  

## Instead, you can try showing how the clustering changes over time using a movie.

In [15]:
# Embed the YouTube video with id 'jbkSRLYSojo' as this cell's output
YouTubeVideo('jbkSRLYSojo')

In [16]:
# Switch the animation HTML representation to 'jshtml' from here on
plt.rcParams.update({"animation.html": 'jshtml'})
# Two-cluster model (fixed seed) that the animation callback refits on every frame
kmeans = KMeans(n_clusters=2,random_state=0)
fig, ax = plt.subplots()
# Fix both axes to [-10, 10] and label them before creating the (initially empty) scatter
ax.set(xlim=(-10,10), ylim=(-10,10), xlabel='Some variable X', ylabel='Some variable Y')
sc = ax.scatter([],[])

def animate(i):
    """Draw frame `i`: sample 50 fresh points, refit `kmeans`, and recolor the scatter.

    The x values and the first 25 y values are standard-normal draws; the last
    25 y values are standard-normal draws shifted by `i`. The module-level
    `kmeans` is refit on these points and the module-level scatter `sc` is
    updated in place (positions, then face colors by cluster label).
    """
    palette = ('red','blue')
    xs = np.random.randn(50)
    ys_fixed = np.random.randn(25)
    ys_shifted = i + np.random.randn(25)
    points = np.column_stack((xs, np.concatenate((ys_fixed, ys_shifted))))
    kmeans.fit(points)
    sc.set_offsets(points)
    sc.set_facecolor([palette[label] for label in kmeans.labels_])

# Adjust the figure padding before building the animation
plt.tight_layout()
# One frame per value of np.arange(0,7,0.1); FuncAnimation hands each value to animate as `i`
ani = FuncAnimation(fig, animate, frames=np.arange(0,7,0.1))
# Left disabled: presumably, evaluating `ani` as the cell's last expression would render it inline
#ani

<IPython.core.display.Javascript object>