In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the NFL 2020 combine CSV and display the raw frame.
df = pd.read_csv(filepath_or_buffer='../input/nfl-2020-combine/NFL_2020_Combine.csv')
df

Oops! Two index columns. Annoying.

In [None]:
# Drop the redundant 'Unnamed: 0' index column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError (as `del df[...]` would).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Analysing missing data

In [None]:
# Boolean mask shaped like df: True wherever a value is missing.
missing = df.isnull()
missing

In [None]:
# Share of missing values per column, expressed in percent.
missing.mean().mul(100)

In [None]:
# Plot the share of values that are present in each column, in percent.
import matplotlib.pyplot as plt
mi_me = 100 - missing.mean() * 100
fig, ax = plt.subplots()
mi_me.plot.bar(ax=ax)
# Label the figure so it can be read without the surrounding text.
ax.set_xlabel('Column')
ax.set_ylabel('Values present, %')
ax.set_title('Data completeness per column')
# Render the figure explicitly instead of echoing the Axes repr.
plt.show()

We need to deal with the missing values. Let's create a subset with only non-null values, and another set where NaNs are replaced with zeros.

In [None]:
# Purify: keep only the rows that have a value in every column.
# .copy() makes df_clean an independent frame, so the column assignments made
# to it later do not trigger pandas' SettingWithCopyWarning.
df_clean = df.dropna().copy()
df_clean

In [None]:
# Nullify: keep every row and substitute 0 for each missing value.
df_zero = df.fillna(value=0)
df_zero

For visualisation, and because every player has height and weight data, we will build a two-dimensional (height/weight) subset.

In [None]:
# Two-dimensional subset: the first four columns of df.
# NOTE(review): assumed to be Player, Pos, Ht, Wt — confirm against the CSV.
# .copy() detaches the slice from df, so the 'Clusters' column assigned to
# df_HW later is written to an independent frame (no SettingWithCopyWarning).
df_HW = df.iloc[:, :4].copy()
df_HW

In [None]:
# Scatter plot of every player: Ht on the x axis, Wt on the y axis.
plt.figure()
plt.scatter(df_HW.Ht, df_HW.Wt)
# NOTE(review): the label states cm for height — confirm the unit of the Ht column.
plt.xlabel('Height, cm')
plt.ylabel('Weight, lb')
# plt.show has to be called; the bare name only references the function object.
plt.show()

Now we'll do some cluster analysis.

In [None]:
from scipy.cluster.hierarchy import linkage,dendrogram
# Hierarchical clustering of the players on (Ht, Wt) with Ward linkage.
Z_HW = linkage(df_HW[['Ht', 'Wt']], 'ward')
# Full dendrogram: one leaf per player.
plt.figure(figsize=(15, 10))
dendrogram(Z_HW)
# plt.show has to be called; the bare name only references the function object.
plt.show()

Because the dendrogram shows all the points, it is a bit messy. Let's clean it up.

In [None]:
# Same tree drawn with truncate_mode='lastp', which condenses the plot to the
# last merged clusters instead of drawing every leaf.
plt.figure(figsize=(15, 10))
dendrogram(Z_HW, truncate_mode='lastp')
# plt.show has to be called; the bare name only references the function object.
plt.show()

In [None]:
from scipy.cluster.hierarchy import fcluster
# Cut the height/weight tree into at most 12 flat clusters and store the
# resulting label of each player in a 'Clusters' column.
df_HW['Clusters'] = fcluster(Z_HW, t=12, criterion='maxclust')
df_HW

In [None]:
import seaborn as sns
# Height/weight scatter, coloured by flat-cluster label.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Ht', y='Wt', hue='Clusters', palette='dark', data=df_HW)

Let us see how many positions we have here.

In [None]:
# Distinct position labels present in the Pos column.
df_HW['Pos'].unique()

In [None]:
# Number of distinct position labels.
df_HW['Pos'].nunique()

To compare our clusters to positions, we'll plot positions.

In [None]:
# Height/weight scatter, coloured by playing position.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Ht', y='Wt', hue='Pos', palette='dark', data=df_HW)

I suppose we can reduce the number of positions by merging some similar positions (in terms of the players' overall fitness). We'll blend OL and DL, and K and P.

In [None]:
# Colour assigned to each position label; positions that share a hex value
# are drawn as one visual group.
# NOTE(review): WR and QB also share '#FF00FF' — confirm this is intended.
pos_colors = {
    'OL': '#FFFF00',
    'DL': '#FFFF00',
    'RB': '#00FFFF',
    'WR': '#FF00FF',
    'CB': '#00FF00',
    'TE': '#0000ff',
    'LB': '#FF7F00',
    'K': '#FF0000',
    'S': '#4B0082',
    'QB': '#FF00FF',
    'P': '#FF0000',
    'LS': '#000000',
}

In [None]:
# Position scatter again, this time using the hand-picked colour mapping.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Ht', y='Wt', hue='Pos', palette=pos_colors, data=df_HW)

In [None]:
# Re-cut the height/weight tree with a lower cap of 10 clusters
# (this overwrites the previous 'Clusters' labels) and plot the result.
df_HW['Clusters'] = fcluster(Z_HW, t=10, criterion='maxclust')
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Ht', y='Wt', hue='Clusters', palette='Paired', data=df_HW)

In [None]:
# Ward linkage for both full data sets, computed on every column that
# remains once 'Player' and 'Pos' are dropped.
Z_clean = linkage(df_clean.drop(['Player', 'Pos'], axis=1), method='ward')
Z_zero = linkage(df_zero.drop(['Player', 'Pos'], axis=1), method='ward')

In [None]:
# Label every row of the 'clean' set with one of at most 10 flat clusters.
df_clean['Clusters'] = fcluster(Z_clean, t=10, criterion='maxclust')
df_clean

pandas may emit a SettingWithCopyWarning here. It is safe to ignore, because we only want to modify df_clean and not the original df.

In [None]:
# Renumber the rows of df_clean from 0 and discard the old index labels.
# The result is rebound to the name instead of using inplace=True, which keeps
# the cell a plain "new value from old value" step.
df_clean = df_clean.reset_index(drop=True)
df_clean

In [None]:
# Label every row of the 'zero' set with one of at most 10 flat clusters.
df_zero['Clusters'] = fcluster(Z_zero, t=10, criterion='maxclust')
df_zero

At this point I came back and changed the linkage from 'complete' to 'ward', as it creates denser clusters and fewer clusters made up of outliers; in our case it seems a better approach.

In [None]:
# Contingency table: positions (rows) against height/weight clusters (columns).
pd.crosstab(index=df_HW['Pos'], columns=df_HW['Clusters'])

In [None]:
# Contingency table: positions (rows) against clusters (columns), 'clean' set.
pd.crosstab(index=df_clean['Pos'], columns=df_clean['Clusters'])

In [None]:
# Contingency table: positions (rows) against clusters (columns), 'zero' set.
pd.crosstab(index=df_zero['Pos'], columns=df_zero['Clusters'])

It seems that the 'zero' approach gives the best results. But let's check it.

In [None]:
# define Cramer's V
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length. They are cross-tabulated
        with ``pd.crosstab`` to build the contingency table.

    Returns
    -------
    float
        Strength of association between ``x`` and ``y``. ``nan`` is returned
        when the statistic is undefined: fewer than two observations, a
        single category in ``x`` or ``y``, or a zero normalising term.
    """
    # Imported here so the function does not rely on an import that is only
    # executed in a later cell.
    from scipy.stats import chi2_contingency

    confusion_matrix = pd.crosstab(x, y)
    # correction=False: by default SciPy applies Yates' continuity correction
    # to 2x2 tables, which shrinks chi2 and keeps V below 1 even for a
    # perfect association. Larger tables are unaffected by this flag.
    chi2 = chi2_contingency(confusion_matrix, correction=False)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # Degenerate tables would divide by zero below; report "undefined".
    if n <= 1 or min(r, k) < 2:
        return np.nan

    phi2 = chi2 / n
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min((kcorr - 1), (rcorr - 1))
    if denom <= 0:
        # Happens when every observation is its own category (n == r or n == k).
        return np.nan
    return np.sqrt(phi2corr / denom)

In [None]:
import scipy.stats as ss
# Cramér's V between position and cluster label, printed in the order:
# height/weight subset, 'clean' set, 'zero' set.
for frame in (df_HW, df_clean, df_zero):
    print(cramers_v(frame.Pos, frame.Clusters))

Interestingly, the strongest correlation is where we used the 'clean' approach. Maybe it is due to there being fewer positions.

In [None]:
# How many distinct positions remain in the 'clean' set.
df_clean['Pos'].nunique()

In [None]:
# Re-cut the 'clean' tree with a cap of 8 clusters (overwrites 'Clusters').
df_clean['Clusters'] = fcluster(Z_clean, t=8, criterion='maxclust')
df_clean

In [None]:
# Position/cluster association for the current 'Clusters' labels of df_clean.
print(cramers_v(df_clean['Pos'], df_clean['Clusters']))

Hmm... That way we even reduced the correlation. What if we increase the number of clusters?

In [None]:
# Re-cut the 'clean' tree with a cap of 16 clusters (overwrites 'Clusters').
df_clean['Clusters'] = fcluster(Z_clean, t=16, criterion='maxclust')

In [None]:
# Position/cluster association for the current 'Clusters' labels of df_clean.
print(cramers_v(df_clean['Pos'], df_clean['Clusters']))

In [None]:
# Merge the OL and DL labels into a single 'InnL' position.
# Only the Pos column is rewritten, so an identical value in any other column
# is left untouched, and no frame-wide in-place mutation is needed.
df_clean['Pos'] = df_clean['Pos'].replace({'OL': 'InnL', 'DL': 'InnL'})
df_clean

In [None]:
# Re-cut the 'clean' tree with a cap of 7 clusters (overwrites 'Clusters').
df_clean['Clusters'] = fcluster(Z_clean, t=7, criterion='maxclust')

In [None]:
# Position/cluster association for the current 'Clusters' labels of df_clean.
print(cramers_v(df_clean['Pos'], df_clean['Clusters']))

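I am not sure here; I think this number says that we have about the same chance of correctly guessing the position from scouting combine stats. (Strictly speaking, Cramér's V measures the strength of association on a 0–1 scale rather than a probability.)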

In [None]:
# Contingency table: positions (rows) against current clusters (columns).
pd.crosstab(index=df_clean['Pos'], columns=df_clean['Clusters'])

In [None]:
# We can safely merge clusters 1, 2 and 6: labels 2 and 6 are folded into 1.
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on `df_clean.Clusters` acts on an intermediate
# Series and is not guaranteed to update df_clean (it does not under pandas
# Copy-on-Write).
df_clean['Clusters'] = df_clean['Clusters'].replace({2: 1, 6: 1})
df_clean.Clusters


In [None]:
# Position/cluster association for the current 'Clusters' labels of df_clean.
print(cramers_v(df_clean['Pos'], df_clean['Clusters']))

In [None]:
# Contingency table: positions (rows) against current clusters (columns).
pd.crosstab(index=df_clean['Pos'], columns=df_clean['Clusters'])

In [None]:
# As we want a square matrix here, merge WR, S and RB into one 'WRSRB' label.
# Only the Pos column is rewritten, so an identical value in any other column
# is left untouched, and no frame-wide in-place mutation is needed.
df_clean['Pos'] = df_clean['Pos'].replace({'WR': 'WRSRB', 'S': 'WRSRB', 'RB': 'WRSRB'})


In [None]:
# Contingency table: positions (rows) against current clusters (columns).
pd.crosstab(index=df_clean['Pos'], columns=df_clean['Clusters'])

In [None]:
# Position/cluster association for the current labels of df_clean.
print(cramers_v(df_clean['Pos'], df_clean['Clusters']))