In [3]:
import pandas as pd
import numpy as np
import time

from itertools import combinations
import networkx as nx

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import node2vec
from gensim.models import Word2Vec

In [4]:
# Load the games / players / plays tables and the eight weekly CSVs
# (week1.csv .. week8.csv) from ../2023/data.
games = pd.read_csv("../2023/data/games.csv")
players = pd.read_csv("../2023/data/players.csv")
plays = pd.read_csv("../2023/data/plays.csv")

# Read every weekly file first and concatenate once. Calling pd.concat inside
# the loop re-copies the accumulated frame on each pass (quadratic copying).
weekly_frames = [pd.read_csv(f"../2023/data/week{week}.csv") for week in range(1, 9)]
# Same message as before: the shape reported is that of week 1 alone.
print(f"dataframe shape = {weekly_frames[0].shape}")
ngs_df = pd.concat(weekly_frames)
# Release the per-week frames so only the combined frame stays in memory.
del weekly_frames

dataframe shape = (1118122, 16)


In [5]:
# Load the three saved player-embedding tables; the first CSV column becomes
# the index of each frame.
embed_3_dim, embed_9_dim, embed_32_dim = (
    pd.read_csv(embed_csv, index_col=0)
    for embed_csv in (
        "2022_3_dim_full_player_embed_df.csv",
        "2022_9_dim_full_player_embed_df.csv",
        "2022_full_player_embed_df.csv",
    )
)

In [6]:
def get_similarity_df(player_df):
    """Return the pairwise cosine-similarity matrix of the rows of ``player_df``.

    The ``nflId`` and ``position`` columns are dropped first; every remaining
    column is treated as an embedding dimension.

    Parameters
    ----------
    player_df : pandas.DataFrame
        One row per player. Must contain ``nflId`` and ``position`` columns
        (``drop`` raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both ``player_df.index``.
    """
    features = player_df.drop(columns=['nflId', 'position'])
    labels = features.index
    return pd.DataFrame(cosine_similarity(features), index=labels, columns=labels)

In [61]:
# Keep only the first row for every repeated index label in the 32-dim table.
print(f"original shape {embed_32_dim.shape}")
is_repeat = embed_32_dim.index.duplicated(keep='first')
filtered_embed = embed_32_dim[~is_repeat]
print(f"duplicates dropped shape {filtered_embed.shape}")

original shape (1684, 34)
duplicates dropped shape (1673, 34)


In [62]:
# Pairwise cosine similarity between all de-duplicated player embeddings.
all_players_sim_df = get_similarity_df(player_df=filtered_embed)

In [63]:
from scipy.stats import percentileofscore

In [65]:
# Convert each row of cosine similarities into percentile ranks (0-100)
# computed within that row. Results are written to a separate copy, so
# all_players_sim_df itself is left unchanged.
percentile_sim_df = all_players_sim_df.copy()

for player_name in all_players_sim_df.index:
    # Read-only row lookup: there is no need to copy the whole frame on
    # every iteration just to read a single row.
    sim_row = all_players_sim_df.loc[player_name, :]
    # Sort once per row instead of once per element inside the lambda.
    sorted_sims = sim_row.sort_values().values
    percentiles = sim_row.apply(lambda sim: percentileofscore(sorted_sims, sim))
    percentile_sim_df.loc[player_name] = percentiles

In [67]:
#percentile_sim_df.to_csv("../statistics/output_data/percentile_similarity_full_df.csv")

In [68]:
# Preview the first rows of the per-row percentile table.
percentile_sim_df.head()

displayName,David Long,Gabe Davis,Nick Scott,Troy Hill,Isaiah McKenzie,Football,Jalen Ramsey,Taylor Rapp,Stefon Diggs,Dion Dawkins,...,Nick Mullens,Joshua Onujiogu,Chandler Brewer,Ke'Shawn Vaughn,Monty Rice,Tyus Bowser,DeSean Jackson,Binjimen Victor,Zack Baun,Jordan Howard
displayName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Long,100.0,98.087268,76.2104,87.26838,84.997011,66.049014,97.250448,78.242678,90.854752,28.332337,...,32.57621,15.361626,7.352062,47.041243,0.358637,39.509863,79.438135,59.593545,63.060371,0.179319
Gabe Davis,98.565451,100.0,74.357442,89.479976,89.181112,66.766288,99.222953,78.601315,93.903168,28.451883,...,24.865511,15.421399,7.411835,45.248057,0.358637,38.792588,71.249253,50.747161,64.554692,0.179319
Nick Scott,61.267185,67.364017,100.0,92.707711,89.838613,64.913329,75.732218,99.641363,87.26838,28.870293,...,48.356246,13.269576,7.292289,57.142857,0.239091,38.135087,31.261207,18.828452,57.322176,0.298864
Troy Hill,76.389719,80.753138,85.953377,100.0,94.739988,65.152421,90.137478,89.479976,99.342499,28.571429,...,38.553497,14.106396,7.292289,52.420801,0.298864,38.314405,49.731022,30.783025,60.789002,0.239091
Isaiah McKenzie,69.456067,82.785415,87.089062,96.114764,100.0,68.141064,86.491333,94.441124,94.979079,28.571429,...,27.316198,14.704124,7.292289,48.296473,0.298864,38.254632,44.231919,23.7896,61.80514,0.179319


In [70]:
from sklearn.manifold import TSNE

# Project the 32-dim embeddings (without the nflId / position columns) to 2-D.
embed_32_dropped = filtered_embed.drop(columns=['nflId', 'position'])
# t-SNE is stochastic: without a fixed random_state the 2-D coordinates
# change on every run, so nothing derived from them would be reproducible.
tsne = TSNE(n_components=2, random_state=42)
node_embeddings_2d = tsne.fit_transform(embed_32_dropped)
new_embeddings = pd.DataFrame(
    node_embeddings_2d, index=filtered_embed.index, columns=['0', '1']
)
# Re-attach the identifier columns; assignment aligns on the shared index.
new_embeddings['nflId'] = filtered_embed['nflId']
new_embeddings['position'] = filtered_embed['position']


In [76]:
# Similarities for the low-dimensional (2-D t-SNE) embedding. This previously
# passed filtered_embed, i.e. the 32-dim table, which just reproduced
# all_players_sim_df; new_embeddings carries the same nflId / position
# columns that get_similarity_df drops, so it can be passed directly.
low_dim_all_players_sim_df = get_similarity_df(new_embeddings)
percentile_low_dim_sim_df = low_dim_all_players_sim_df.copy()

for player_name in low_dim_all_players_sim_df.index:
    sim_row = low_dim_all_players_sim_df.loc[player_name, :]
    # Sort once per row instead of once per element inside the lambda.
    sorted_sims = sim_row.sort_values().values
    percentiles = sim_row.apply(lambda sim: percentileofscore(sorted_sims, sim))
    percentile_low_dim_sim_df.loc[player_name] = percentiles

In [None]:
# Write the low-dim percentile table to CSV (path is relative to the
# notebook's working directory).
percentile_low_dim_sim_df.to_csv("../statistics/output_data/percentile_similarity_low_dim_full_df.csv")

In [22]:
# Single-player version of the percentile computation.
player_name = 'Stefon Diggs'

# Copy only this player's row (not the whole frame) so sim_row keeps the raw
# similarities even after the frame is modified below.
sim_row = all_players_sim_df.loc[player_name, :].copy()
# Sort once instead of once per element inside the lambda.
sorted_sims = sim_row.sort_values().values
percentiles = sim_row.apply(lambda sim: percentileofscore(sorted_sims, sim))
# NOTE(review): this overwrites the player's row of cosine similarities with
# percentiles (0-100) in place, leaving all_players_sim_df with mixed scales.
# Consider writing to a separate frame instead.
all_players_sim_df.loc[player_name] = percentiles

In [24]:
# Show sim_row's values in ascending order as a NumPy array.
sim_row.sort_values().values

array([0.33864566, 0.35492147, 0.37838793, ..., 0.99942032, 0.99964363,
       1.        ])

In [25]:
# Percentile rank (0-100) of every value in sim_row relative to the whole row.
# Sort once up front instead of re-sorting the row for every element.
sorted_sims = sim_row.sort_values().values
percentiles = sim_row.apply(lambda sim: percentileofscore(sorted_sims, sim))

In [35]:
# Side-by-side table of raw similarity and its percentile, one row per player.
diggs_df = pd.concat([sim_row, percentiles], axis=1, keys=['Sim', 'Percentile'])

In [41]:
# Display diggs_df ordered from the highest "Sim" value to the lowest.
diggs_df.sort_values("Sim", ascending=False)

Unnamed: 0_level_0,Sim,Percentile
displayName,Unnamed: 1_level_1,Unnamed: 2_level_1
Stefon Diggs,1.000000,100.000000
A.J. Brown,0.999644,99.940618
Tyler Lockett,0.999420,99.881235
Bradley Roby,0.998990,99.821853
Michael Thomas,0.998926,99.762470
...,...,...
Monty Rice,0.395817,0.296912
Jordan Howard,0.380931,0.237530
Joe Haeg,0.378388,0.178147
Jason Pinnock,0.354921,0.118765


In [44]:
# Display the percentile Series computed for the selected player.
percentiles

displayName
David Long         80.700713
Gabe Davis         85.748219
Nick Scott         81.353919
Troy Hill          98.931116
Isaiah McKenzie    91.033254
                     ...    
Tyus Bowser        38.420428
DeSean Jackson     53.325416
Binjimen Victor    36.460808
Zack Baun          59.560570
Jordan Howard       0.237530
Name: Stefon Diggs, Length: 1684, dtype: float64

In [47]:
# NOTE(review): overwrites the 'Stefon Diggs' row of the cosine-similarity
# frame with percentile values in place, so that row is on a 0-100 scale while
# untouched rows remain cosine similarities. Confirm this is intended.
all_players_sim_df.loc['Stefon Diggs'] = percentiles

In [48]:
# Display rows at positions 5 through 9 of the similarity frame.
all_players_sim_df.iloc[5:10] 

displayName,David Long,Gabe Davis,Nick Scott,Troy Hill,Isaiah McKenzie,Football,Jalen Ramsey,Taylor Rapp,Stefon Diggs,Dion Dawkins,...,Nick Mullens,Joshua Onujiogu,Chandler Brewer,Ke'Shawn Vaughn,Monty Rice,Tyus Bowser,DeSean Jackson,Binjimen Victor,Zack Baun,Jordan Howard
displayName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Football,0.888628,0.92028,0.940006,0.939229,0.961477,1.0,0.92212,0.95605,64.014252,0.855111,...,0.716023,0.695041,0.634022,0.873906,0.437874,0.911483,0.777738,0.658454,0.974758,0.418446
Jalen Ramsey,0.993726,0.9959,0.963713,0.991945,0.984848,0.92212,1.0,0.970283,95.665083,0.708677,...,0.734305,0.551311,0.508384,0.82801,0.356715,0.76028,0.914293,0.817386,0.90686,0.333522
Taylor Rapp,0.939577,0.95628,0.997938,0.991426,0.991991,0.95605,0.970283,1.0,83.551069,0.777474,...,0.831185,0.625223,0.590266,0.906807,0.420163,0.81883,0.796242,0.677312,0.935018,0.42631
Stefon Diggs,80.700713,85.748219,81.353919,98.931116,91.033254,64.014252,95.665083,83.551069,100.0,28.562945,...,39.489311,14.133017,7.304038,51.95962,0.296912,38.420428,53.325416,36.460808,59.56057,0.23753
Dion Dawkins,0.660768,0.717528,0.763968,0.739428,0.778398,0.855111,0.708677,0.777474,28.562945,1.0,...,0.57727,0.952155,0.929175,0.804347,0.75102,0.982731,0.556584,0.453748,0.839405,0.751679


In [None]:
#

In [20]:
# Five largest values in sim_row, in descending order.
sim_row.nlargest(5)

displayName
A.J. Brown        0.999644
Tyler Lockett     0.999420
Bradley Roby      0.998990
Michael Thomas    0.998926
Jahan Dotson      0.998781
Name: Stefon Diggs, dtype: float64

In [None]:
# Number of entries to keep at each end (k most similar + k most dissimilar).
k = 5
# np.zeros takes the shape as a single tuple; passing k*2 as a second
# positional argument makes NumPy treat it as the dtype and raises TypeError.
# Result: one row per player in all_players_sim_df, 2*k columns, all zeros.
top_sim_and_dissim_df = np.zeros((len(all_players_sim_df), k * 2))
