This notebook contains the functions needed to perform CDkM.



## packages

In [None]:
import numpy as np
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import FactorAnalysis
from sklearn.cluster import KMeans
## plotting packages
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import matplotlib.cm as cm
get_ipython().run_line_magic('matplotlib', 'inline')

plt.rcParams["figure.figsize"] = (20,10)

## other helpful packages
from itertools import combinations, permutations, combinations_with_replacement
from collections import Counter
import networkx as nx
!pip install python-louvain
import community.community_louvain as cl

## import these dictionaries (find on Github (Translation/Helper), store in directory)
# !pip install pid2pos_bref2nba_nba2bref_pid2name_name2pid

import decimal

def round_down(value, decimals):
    """
    Truncate `value` toward zero to `decimals` decimal places.

    INPUT
    value: number (or numeric string) accepted by decimal.Decimal
    decimals: number of decimal places to keep

    OUTPUT
    decimal.Decimal truncated with ROUND_DOWN to `decimals` places
    """
    if isinstance(value, float):
        # Decimal(float) captures the exact binary expansion, so 0.6 would
        # become 0.59999999999999997... and truncate to 0.59. Going through
        # the shortest round-tripping repr keeps the value the caller sees.
        value = repr(float(value))
    with decimal.localcontext() as ctx:
        ctx.rounding = decimal.ROUND_DOWN
        return round(decimal.Decimal(value), decimals)

import pickle
import math
from scipy import stats

from google.colab import drive


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
'''

import sys

# Mount my google drive
drive.mount('/GoogleDrive', force_remount=True)

# Add helper lib to path
sys.path.append('/GoogleDrive/MyDrive/CSE6242/helpers')
If using colab, run above
'''
from pid2pos_bref2nba_nba2bref_pid2name_name2pid import *

Mounted at /GoogleDrive


### read in dataframes

Users will need to create their own directory containing the dataframes (stored as CSV files on GitHub).

In [None]:
# Names of the master dataframes; each one maps to '<folder>/<name>.csv'.
allDFs = [
    'Augmented_MasterClutch',
    'MasterDefense',
    'MasterRebound',
    'MasterPassing',
    'MasterScoring',
    'MasterMisc',
]

# Directory holding the master CSV files.
folder = '../data/MasterData'

# Bind every CSV to a module-level variable named after the dataframe.
for dfStr in allDFs:
    globals()[dfStr] = pd.read_csv(f'{folder}/{dfStr}.csv')


### begin CDkM

In [None]:
import pdb

### helper ###
# drop unneeded/redundant columns
# Bookkeeping columns that are removed from each dataframe before scaling.
colDrop = ['Unnamed: 0', 'GP', 'Season', 'pidSzn']

def col2str(item):
    """Return the string form of `item`."""
    as_text = str(item)
    return as_text

### pre-processing ###
def scale_and_count_DFs(DFs, seasons):
    """
    Load each master dataframe, keep the requested seasons and standardise it.

    INPUT
    DFs: list of dataframe names; each is read from f'{folder}/{name}.csv'
         (uses the module-level `folder`)
    seasons: list of `Season` values to keep; rows are concatenated in the
             order the seasons are listed

    OUTPUT
    Counter: key = pair of pidSzn values (ordered by sorted pidSzn), value =
             number of times the pair co-occurs across the dataframes; used as
             the denominator when building network arc weights.
    idx_DFs: list of dataframes with the ['GP', 'Season', 'pidSzn'] columns,
             kept for tracking and reporting
    scaled_DFs: list of StandardScaler.fit_transform outputs (columns in
                `colDrop` removed), for PCA and other analysis
    """
    # standardize data (fit_transform refits the scaler for every dataframe)
    scaler = StandardScaler()

    idx_DFs = []
    scaled_DFs = []
    pairCounts = Counter()
    for dfStr in DFs:
        print(dfStr)
        # Plain locals are used here: writing through vars()/locals() inside a
        # function is unsupported, and on Python 3.13+ (PEP 667) the value
        # written that way cannot be read back.
        raw = pd.read_csv(f'{folder}/{dfStr}.csv')
        perSeason = [raw[raw['Season'] == s] for s in seasons]
        temp = pd.concat(perSeason, ignore_index=True).fillna(0)

        # create idx tracker
        idx_DFs.append(temp.loc[:, ['GP', 'Season', 'pidSzn']])

        # count how many times players appear together to scale counts later;
        # sorting first makes every pair key appear in one canonical order
        sortedIds = temp.sort_values(by='pidSzn')['pidSzn']
        pairCounts.update(combinations(sortedIds, 2))

        # subset into kept columns, then scale
        features = temp.drop(columns=colDrop)
        scaled_DFs.append(scaler.fit_transform(features))
    return pairCounts, idx_DFs, scaled_DFs

def pca_tf(DFs, scaled_DFs, var2keep):
    """
    Reduce each scaled dataframe with PCA, keeping the fewest leading
    components whose cumulative explained-variance ratio exceeds `var2keep`.

    INPUT
    DFs: list of dataframe names, parallel to scaled_DFs (used for messages)
    scaled_DFs: list of scaled data arrays, one per name in DFs
    var2keep: scalar; fraction of variance to keep for PCA

    OUTPUT
    allPCA: list of PCA-transformed arrays, one per input dataframe
    """
    ## pca first: find how many components each dataframe needs
    comp2keep = {}
    for idx, dfStr in enumerate(DFs):
        fitted = PCA().fit(scaled_DFs[idx])
        cumulative = np.cumsum(fitted.explained_variance_ratio_)
        exceeds = np.where(cumulative > var2keep)[0]
        if exceeds.size:
            # exceeds[0] is a 0-based position; the component *count* that
            # reaches the threshold is one more than that.
            n_keep = int(exceeds[0]) + 1
        else:
            # threshold never exceeded (e.g. var2keep >= 1): keep everything
            n_keep = len(cumulative)
        comp2keep[dfStr + '_scaled'] = n_keep

    for k, v in comp2keep.items():
        print('For {}, {} components were kept.'.format(k, v))

    # then transform scaled dfs into pca dfs
    allPCA = []
    for idx, dfStr in enumerate(DFs):
        reducer = PCA(n_components=comp2keep[dfStr + '_scaled'])
        allPCA.append(reducer.fit_transform(scaled_DFs[idx]))
    return allPCA

### CDkM algorithm ###
## now begin clustering for each df, keep track of connections in a dictionary

# create results dataframe after clustering using kMeans
def clusterResults(idx_DFs, scaled_DFs, n_list, pid2pos, clusterMethod='kmeans'):
    """
    Cluster every dataframe with KMeans and tabulate the labels per player.

    INPUT
    idx_DFs: list of tracker dataframes (three columns, stored as GP / szn / pid)
    scaled_DFs: list of feature arrays to cluster, parallel to idx_DFs
    n_list: number of clusters per dataframe; ex: [k]*numdf
    pid2pos: mapping from str(pid) to position; unknown ids get 'nan'
    clusterMethod: currently ignored -- KMeans is always used

    OUTPUT
    allResults: list of resulting cluster DFs (one for each master DF);
                cols = ['GP', 'szn', 'pid', 'cluster', 'pos', 'df']

    NOTE: the 'df' column is filled from the module-level `allDFs` list.
    """
    allResults = []
    for i in range(len(idx_DFs)):
        kmeans = KMeans(n_clusters=n_list[i], random_state=13).fit(scaled_DFs[i])
        results = pd.DataFrame(columns=['GP', 'szn', 'pid', 'cluster', 'pos', 'df'])
        results[['GP', 'szn', 'pid']] = idx_DFs[i]
        results['cluster'] = kmeans.labels_
        results['df'] = allDFs[i]
        # Build the whole column in one pass instead of one .loc write per row.
        results['pos'] = [
            pid2pos[key] if key in pid2pos else 'nan'
            for key in (str(pid) for pid in results['pid'])
        ]
        allResults.append(results)
    return allResults

# count how often each pair occurs in same cluster
def createCountDict(allResults, pid2name):
    """
    Tally, over all cluster result frames, how often two players share a cluster.

    INPUT
    allResults: list of cluster result DFs (needs 'pid' and 'cluster' columns)
    pid2name: mapping from integer player id (the part of pid before '_') to name

    OUTPUT
    countDict: key: player pair; value: raw # times pair appear in same cluster
    countDictMatch: key: player pair; value: list of master DF names (taken
                    from the module-level `allDFs`) in which the pair matched

    Pairs are skipped when either id is missing from pid2name or when both
    ids map to the same name.
    """
    countDict = {}
    countDictMatch = {}
    for dfPos, resultDF in enumerate(allResults):
        ordered = resultDF.sort_values(by='pid')
        for label in ordered.cluster.unique():
            members = ordered.loc[ordered.cluster == label, 'pid']
            for pair in combinations(map(str, members), 2):
                firstId = int(pair[0].split('_')[0])
                if firstId not in pid2name:
                    continue
                secondId = int(pair[1].split('_')[0])
                if secondId not in pid2name:
                    continue
                if pid2name[firstId] == pid2name[secondId]:
                    continue
                if pair not in countDict:
                    countDict[pair] = 1
                    countDictMatch[pair] = [allDFs[dfPos]]
                else:
                    countDict[pair] += 1
                    countDictMatch[pair].append(allDFs[dfPos])
    return countDict, countDictMatch

# different ways for determining arc weights (for testing, user can try out different schemes)
def calcVals(countDict, numAppear, destStr):
    """
    Turn raw same-cluster counts into arc weights and persist them as CSV.

    INPUT
    countDict: key: player pair; value: # times the pair shared a cluster
    numAppear: key: player pair; value: denominator for the pair (pairs whose
               denominator is 0 are skipped)
    destStr: path of the CSV file to write

    OUTPUT
    weightDF: the CSV just written, read back as a DataFrame with columns
              player1, player2, val_3 (0-4 bucket, cut-offs 0.2/0.4/0.6/0.8),
              val_6 (0-3 bucket, cut-offs 0.25/0.5/0.75), raw (ratio truncated
              to two decimals) and total (raw count)

    NOTE (from the original author): right now numAppear is incorrect.
    """
    def bucket(ratio, upperBounds):
        # position of the first bound that `ratio` does not exceed;
        # len(upperBounds) when it exceeds all of them
        for level, bound in enumerate(upperBounds):
            if ratio <= bound:
                return level
        return len(upperBounds)

    with open(destStr, 'w') as f:
        f.write('player1,player2,val_3,val_6,raw,total\n')
        for key, hits in countDict.items():
            appearances = numAppear[key]
            if appearances == 0:
                continue

            # share of co-appearances spent in the same cluster, two decimals
            ratio = float(round_down(hits/appearances, 2))

            f.write('{},{},{},{},{},{}\n'.format(
                key[0],
                key[1],
                bucket(ratio, (0.2, 0.4, 0.6, 0.8)),
                bucket(ratio, (0.25, 0.5, 0.75)),
                ratio,
                hits,
            ))
    return pd.read_csv(destStr)

# using weights found, perform community detection
def getModularity(nodeDF, valDF):
    """
    Build a weighted graph from the arc weights and run Louvain on it.

    INPUT
    nodeDF: from weightDF, players (2 columns)
    valDF: from weightDF, value chosen (1 column)

    OUTPUT
    partition: node -> community mapping returned by the Louvain algorithm
    G: the networkx graph that was partitioned
    mod: scalar; modularity of the Louvain partition on G
    numGrps: scalar; number of distinct communities in the partition
    """
    # one row per arc: (node1, node2, value)
    graphDF = pd.concat([nodeDF, valDF], axis=1)
    graphDF.columns = ['node1', 'node2', 'value']

    # build graph
    G = nx.Graph()
    for node1, node2, value in graphDF.values:
        G.add_edge(node1, node2, weight=value)

    # partition into communities (fixed seed, no randomised node order)
    partition = cl.best_partition(G, randomize=False, random_state=13)
    numGrps = len(set(partition.values()))

    # get modularity
    mod = cl.modularity(partition, G)
    return partition, G, mod, numGrps




In [None]:

### execution ###

## run experiments
def experiment(allDFs, szn, n_list, pid2pos_bref, pid2name, tempStr, destStr, valcol, var2keep, returnPartition=False):
    """
    Run the full pipeline once: scale -> PCA -> cluster -> count pairs ->
    arc weights -> Louvain community detection.

    INPUT
    allDFs: list of master dataframe names
    szn: list of seasons to keep
    n_list: number of clusters to use for each dataframe
    pid2pos_bref, pid2name: lookup dictionaries passed on to the helpers
    tempStr: not used by this function
    destStr: path where the arc-weight CSV is written
    valcol: column of the arc-weight CSV used as edge weight
    var2keep: variance to keep for PCA
    returnPartition: when True also return the partition and the graph

    OUTPUT
    (partition, G, mod, numGrp) if returnPartition else (mod, numGrp)
    """
    pairTotals, idxFrames, scaledFrames = scale_and_count_DFs(allDFs, szn)
    reducedFrames = pca_tf(allDFs, scaledFrames, var2keep)
    clustered = clusterResults(idxFrames, reducedFrames, n_list, pid2pos_bref)
    countDict, countDictMatch = createCountDict(clustered, pid2name)
    weightDF = calcVals(countDict, pairTotals, destStr)
    players = weightDF[['player1', 'player2']]
    partition, G, mod, numGrp = getModularity(players, weightDF[valcol])

    if not returnPartition:
        return mod, numGrp
    return partition, G, mod, numGrp

## Run experiments and save experDF to csv (can change to function to create experDF)
import os

# Season labels used in output file names, and the matching Season filters.
Yr = ['14_15', '15_16', '16_17', '17_18', '18_19', '19_20'] ## change as wanted
seasonz = [['2014-15'], ['2015-16'], ['2016-17'], ['2017-18'], ['2018-19'], ['2019-20']] ## change as wanted
assert len(Yr) == len(seasonz), 'Yr and seasonz must list the same seasons'
valCols = ['raw'] ## change as wanted
numdf = len(allDFs)
# one candidate k per experiment, repeated for every dataframe
n_lists = [[i]*numdf for i in range(35, 50)] ## change as wanted
## user define:
results_folder = '../output'
name_of_exper = 'modularity'
where2save = results_folder + '/res_by_k'

# Make sure the output folders exist so a fresh checkout can run this cell.
os.makedirs(where2save, exist_ok=True)

# The `with` block closes the summary file; no explicit close is needed.
with open(f'{results_folder}/{name_of_exper}.csv', 'w') as f:
    f.write('szn,k,col,mod,numPartitions\n')
    for ix, szn in enumerate(seasonz):
        for n_list in n_lists:
            k = n_list[0]
            for col in valCols:
                tempStr = '{}/{}_{}_{}.csv'.format(where2save,k,col,Yr[ix])
                dictStr = '{}/dict{}_{}_{}.txt'.format(where2save,k,col,Yr[ix])
                # tempStr (the .csv path) is passed as experiment's destStr;
                # partition and G of the last run stay available afterwards
                partition, G, mod, numGrp = experiment(allDFs, szn, n_list, pid2pos_bref, pid2name, dictStr, tempStr, col, var2keep=0.99, returnPartition=True)
                f.write('{},{},{},{},{}\n'.format(Yr[ix],k,col,mod,numGrp))


Augmented_MasterClutch
MasterDefense
MasterRebound
MasterPassing
MasterScoring
MasterMisc
For Augmented_MasterClutch_scaled, 24 components were kept.
For MasterDefense_scaled, 7 components were kept.
For MasterRebound_scaled, 11 components were kept.
For MasterPassing_scaled, 29 components were kept.
For MasterScoring_scaled, 23 components were kept.
For MasterMisc_scaled, 6 components were kept.




None
> [0;32m<ipython-input-12-b8a047e9bc4d>[0m(40)[0;36m<cell line: 29>[0;34m()[0m
[0;32m     38 [0;31m                [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     39 [0;31m                [0;31m# mod, numGrp = experiment(allDFs, szn, n_list, pid2pos_bref, pid2name, dictStr, tempStr, col, var2keep=0.99, returnPartition=False)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 40 [0;31m                [0mf[0m[0;34m.[0m[0mwrite[0m[0;34m([0m[0;34m'{},{},{},{},{}\n'[0m[0;34m.[0m[0mformat[0m[0;34m([0m[0mYr[0m[0;34m[[0m[0mix[0m[0;34m][0m[0;34m,[0m[0mk[0m[0;34m,[0m[0mcol[0m[0;34m,[0m[0mmod[0m[0;34m,[0m[0mnumGrp[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m[0;34m[0m[0m
[0m[0;32m     42 [0;31m[0mf[0m[0;34m.[0m[0mclose[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
{'101106_2014-15': 0, '101133_2014-15': 1, '200757_2014-1