In [None]:
import logging

# Configure root logging first (timestamped records at INFO level and above),
# then create the logger named after this module/notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print the loss of each individual epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the loss of one epoch is the difference between the
    totals observed at the end of two consecutive epochs. The previous total
    is kept on the instance; a local variable would be reset on every call and
    the printed "change" would simply be the running total again.
    '''

    def __init__(self):
        self.epoch = 0
        # Running total seen at the end of the previous epoch (0 before training).
        self.previous_total = 0

    def on_epoch_end(self, model):
        '''Print the loss accumulated during the epoch that just finished.'''
        total = model.get_latest_training_loss()
        change_loss = total - self.previous_total
        self.previous_total = total
        print('Loss after epoch {}: {}'.format(self.epoch, change_loss))
        self.epoch += 1

Define the corpus, cleaning for punctuation, tags and multiple whitespaces
Shuffle the corpus
Yield each sentence of the shuffled corpus

In [None]:
from gensim.test.utils import datapath
import gensim.parsing.preprocessing as gsp
import random

class EventCorpus:
    """An iterable that yields sentences (lists of str) for the pseudo event corpus.

    Each pass over the object re-reads the corpus file, cleans every line and
    yields every sentence exactly once. By default the sentences are shuffled
    anew on each pass.

    Args:
        corpus_path: Path of the corpus text file, one sentence per line.
        shuffle: If True (default), load all sentences and yield them in a
            random order. If False, stream them lazily in file order.
    """

    DEFAULT_CORPUS_PATH = '/home/vlnsha004/CSC2005Z/lpe_soccerevents/data/eventcorpus8.txt'
    # Applied to every line before it is split into tokens.
    CUSTOM_FILTERS = [gsp.strip_tags, gsp.strip_multiple_whitespaces]

    def __init__(self, corpus_path=DEFAULT_CORPUS_PATH, shuffle=True):
        self.corpus_path = datapath(corpus_path)
        self.shuffle = shuffle

    def _clean(self, line):
        """Return the list of tokens for one raw corpus line."""
        line = line.replace("'", "")  # erroneous single quotation marks
        # NOTE(review): rewrites every character 6 as 5; the reason is not
        # visible in this notebook -- confirm it is still wanted.
        line = line.replace('6', '5')
        line = line.replace('"', '')
        return gsp.preprocess_string(line, self.CUSTOM_FILTERS)

    def __iter__(self):
        if self.shuffle:
            # Shuffling needs the whole corpus in memory; the file is closed
            # before any sentence is handed out.
            with open(self.corpus_path) as corpus_file:
                sentences = [self._clean(line) for line in corpus_file]
            random.shuffle(sentences)
            yield from sentences
        else:
            with open(self.corpus_path) as corpus_file:
                for line in corpus_file:
                    yield self._clean(line)

Model training
Current hyperparameters:
window = 3
epochs = 35
vs = 500
mc = 1 # min_count
negative = 20 # number of noise words drawn per positive example in negative sampling

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the event corpus and persist it.
# Supplying `sentences` to the constructor trains the model right away, so no
# separate model.train(...) call is needed.

sequences = EventCorpus()

# Hyperparameters; they are also encoded in the saved model's file name.
window = 10
epochs = 60
vs = 800  # vector_size
mc = 1  # min_count
ns = 20  # negative sampling
subsample = 1e-5  # passed as `sample`

# Parameters obtained from the football2vec text
model = Word2Vec(
    sentences=sequences,
    epochs=epochs,
    sample=subsample,
    vector_size=vs,
    window=window,
    min_count=mc,
    workers=10,
    compute_loss=True,  # required for the per-epoch loss callback
    negative=ns,
    callbacks=[callback()],
)
# First model trained on full corpus
model.save(f"/home/vlnsha004/CSC2005Z/lpe_soccerevents/models/anon/Seq2vec_{epochs}_{vs}_{window}_{mc}_{ns}_shuffled_sample.model")

In [None]:
# Small grid search: train and save one model per (window, negative) pair,
# holding every other hyperparameter fixed.
sequences = EventCorpus()
epochs = 30
vs = 500
mc = 1  # min_count
for window in range(6, 18, 6):  # window sizes 6 and 12
    for ns in range(5, 25, 15):  # negative-sampling sizes 5 and 20
        model = Word2Vec(
            sentences=sequences,
            epochs=epochs,
            vector_size=vs,
            window=window,
            min_count=mc,
            workers=8,
            compute_loss=True,
            negative=ns,
            callbacks=[callback()],
        )
        # NOTE(review): this writes under .../player2vec/... while the other
        # cells use .../lpe_soccerevents/...; also confirm that the
        # "unshuffled" label matches what EventCorpus yields.
        model.save(f"/home/vlnsha004/CSC2005Z/player2vec/models/anon/Seq2vec_{epochs}_{vs}_{window}_{mc}_{ns}_unshuffled.model")

In [None]:
# Save whichever model is currently bound to `model`. The file name is built
# from the current notebook-level values of epochs, vs, window, mc and ns, all
# of which are set by earlier cells.
model.save(f"/home/vlnsha004/CSC2005Z/lpe_soccerevents/models/anon/Seq2vec_{epochs}_{vs}_{window}_{mc}_{ns}_unshuffled.model") # First model trained on full corpus

In [None]:
from gensim.models import Word2Vec

# Load a previously saved model from disk; `model` is rebound to it, so the
# cells below evaluate this model rather than the one trained above.
model_path = "/home/vlnsha004/CSC2005Z/lpe_soccerevents/models/anon/Seq2vec_30_500_3_1_20_shuffled.model"
model = Word2Vec.load(model_path)

In [None]:
import pandas as pd

# Read the FIFA player table that serves as the reference for the evaluation below.
fifa =  pd.read_csv('/home/vlnsha004/CSC2005Z/lpe_soccerevents/data/male_players.csv', delimiter=',', low_memory=False)
# Remove pandas' display limits so every column and every row is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
import dask.dataframe as dd
import multiprocessing as mp

# Keep rows with fifa_version above 23.0, order them by overall rating and
# position (both descending), and join name parts with underscores so that
# `long_name` has the same form as the player tokens looked up in the model.
fifa = (
    fifa
    .query('fifa_version > 23.0')
    .sort_values(['overall', 'player_positions'], ascending=False)
    .assign(long_name=lambda frame: frame['long_name'].str.replace(' ', '_'))
    .reset_index(drop=True)  # reset the indices
    .rename_axis("index", axis="columns")
)

# Only the columns needed for the evaluation.
trimmed_fifa = fifa[["long_name", "player_positions", "player_tags", "player_traits"]]

# Despite the name `top400`, this is a reproducible random sample of 400
# players (fixed random_state), not the 400 highest-rated ones.
take_sample_400 = trimmed_fifa.sample(n=400, random_state=1, replace=False)
top400 = dd.from_pandas(take_sample_400, npartitions=mp.cpu_count())

In [None]:
def compare(baller, sim_player, pd_fifa, trials, type) -> list:
    '''Check whether a retrieved player shares an attribute value with the query player.

    Args:
        baller: ``long_name`` of the query player (matched exactly).
        sim_player: Sequence whose first item is the retrieved token; it is
            matched as a case-insensitive substring of ``long_name``.
        pd_fifa: DataFrame with a ``long_name`` column and the ``type`` column.
        trials: Number of comparisons already counted for the query player.
        type: Name of the column to compare on: "player_positions",
            "player_tags" or "player_traits". Cell values are ", "-separated.

    Returns:
        A two-item list ``[match, trials]``. ``match`` is 1 when the two
        players have at least one value in common and 0 otherwise.
        ``trials`` is the input count plus one when a comparison was made,
        the unchanged input count when the retrieved token has no usable
        data, and 1 when the query player has no usable data.
    '''
    baller_values = pd_fifa.loc[pd_fifa['long_name'] == baller, type].unique().tolist()
    if len(baller_values) == 0 or str(baller_values[0]) == 'nan':
        print(f'{baller} has no player tags in FIFA\n')
        trials = 1
        print(f'Trials = {trials}\n')
        return [0, trials]
    baller_set = set(baller_values[0].split(', '))

    # All rows whose long_name contains the retrieved token (case-insensitive,
    # literal match); more than one player may qualify.
    token = sim_player[0]
    name_matches = pd_fifa['long_name'].str.contains(token, case=False, regex=False)
    retrieved_values = pd_fifa[name_matches][type].unique().tolist()
    if len(retrieved_values) == 0 or 'nan' in str(retrieved_values):
        # Nothing matched, or at least one matched row has a missing value.
        print(f'{token} not found or has no traits in FIFA\n')
        print(f'Trials = {trials}\n')
        return [0, trials]

    # With several distinct values the second one is used, otherwise the only one.
    several_values = len(retrieved_values) > 1
    print(f'{token}: {retrieved_values}')
    retrieval_set = set(retrieved_values[1 if several_values else 0].split(', '))
    print(f'{baller}: {baller_values}')
    print(f'similar player set: {retrieval_set}')
    print(f'baller set: {baller_set}')
    shared = retrieval_set & baller_set
    print(f'intersection: {shared}')

    match = 1 if shared else 0
    if not several_values:
        # The verdict line is only printed for unambiguous name matches.
        print('SIMILAR PLAYER FOUND\n' if match else 'No match\n')
    return [match, trials + 1]
	

In [None]:
import random
import numpy as np

def get_pakvector(top400, base=False, type="player_positions", model=model, k=10, topn=60, n_random=200):
    '''Average precision-at-k, in percent, of the model's player retrievals.

    For every name in ``top400['long_name']`` that is in the model vocabulary,
    a list of candidate tokens is collected. Candidates are then compared to
    the player with ``compare`` against the notebook-level ``trimmed_fifa``
    table until ``k`` comparisons have been counted. Players for whom fewer
    than ``k`` comparisons are possible are left out of the average.

    Args:
        top400: Table with a ``long_name`` column of query players.
        base: If True, use ``n_random`` randomly drawn vocabulary entries as
            candidates (baseline) instead of the model's nearest neighbours.
        type: Column of ``trimmed_fifa`` to compare on.
        model: Trained model to evaluate. The default is whatever ``model``
            was bound to when this function was defined.
        k: Number of counted comparisons per player (must be at least 1).
        topn: Number of nearest neighbours requested when ``base`` is False.
        n_random: Number of random candidates drawn when ``base`` is True;
            capped at the vocabulary size.

    Returns:
        Mean precision over all included players, times 100, rounded to two
        decimals; NaN when no player reached ``k`` comparisons.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    '''
    if k < 1:
        raise ValueError("k must be at least 1")

    vocabulary = model.wv.index_to_key

    # Map each query player to its candidates, e.g.
    # {"Eriksen": [("Davies", 0.7), ...], "Johnathan": [("Bakambu", 0.6), ...]}
    similar_players = {}
    for player in top400['long_name']:
        try:
            if base:
                # Cap the draw so a small vocabulary does not raise ValueError.
                random_players = random.sample(vocabulary, min(n_random, len(vocabulary)))
                similar_players[player] = [
                    (other_player, model.wv.similarity(player, other_player))
                    for other_player in random_players
                ]
            else:
                similar_players[player] = model.wv.most_similar(player, topn=topn)
        except KeyError:
            # The player is not in the model vocabulary; skip it.
            continue

    # Precision at k of every player that could be evaluated.
    pak_store = []
    for baller, similar in similar_players.items():
        counter = 0
        trials = 0
        print(f'{baller}: {similar}')
        baller_positions = trimmed_fifa.loc[trimmed_fifa['long_name'] == baller, type].unique().tolist()
        if len(baller_positions) == 0 or 'nan' in str(baller_positions):
            print(f'{baller} has no player tags in FIFA\n')
            continue

        # Walk the candidates until k comparisons have been counted.
        for sim_player in similar:
            match, trials = compare(baller, sim_player, trimmed_fifa, trials, type)
            counter += match
            if trials >= k:
                break

        # Candidates ran out before k comparisons could be made: exclude the player.
        if trials < k:
            print(f'Less than {k} data points for similar players to {baller} in FIFA\n')
            continue

        prop = counter / trials
        print(f'{baller}: Correct:{counter}, Trials:{trials}, Percent: {prop}\n')
        pak_store.append(prop)

    if not pak_store:
        # Nothing to average; avoid numpy's empty-mean warning.
        return float('nan')
    ave_precision_k = round(np.mean(pak_store) * 100, 2)
    return ave_precision_k

In [None]:
# Evaluate the current `model` on the "player_tags" column using the model's
# own nearest neighbours (base=False) and print the resulting percentage.
print(get_pakvector(top400, base=False, type="player_tags", model = model))

In [None]:
import os
from gensim.models import Word2Vec

directory = "/home/vlnsha004/CSC2005Z/lpe_soccerevents/models/anon"
types = ["player_positions", "player_tags", "player_traits"]
models = {}

# Load every saved model in the directory, keyed by its file name without ".model".
for filename in os.listdir(directory):
    if filename.endswith(".model"):
        model_path = os.path.join(directory, filename)
        model_name = filename[:-6]  # remove ".model" from the filename
        models[model_name] = Word2Vec.load(model_path)


latex_table = r"""
\begin{tabular}{|l||c|c||c|c|c||c|}
\hline
Model    &   Negative          & Anonymized/    & Shuffled/    & \multicolumn{3}{c||}{Accuracy [\%]} \\
         &   Sample  & Unanonymized   & Unshuffled   & \multicolumn{3}{c||}{Precision at K} \\
\hline
         &                             &       &    & Player Positions      &  Player Tags    &  Player Traits               \\
\hline
"""

# Add the random/base set of results.
# The loop variable is not called `type` so the builtin is not overwritten
# at notebook level.
latex_table += "Randomly drawn set of retrievals & N/A & N/A & N/A & "
for eval_type in types:
    pak = get_pakvector(top400, base=True, type=eval_type)
    latex_table += f"{pak} & "
latex_table = latex_table[:-2]  # remove the last "& "
# End the row. The trailing newline keeps "\hline" from fusing with the first
# word of the next row into an undefined control sequence.
latex_table += "\\\\ \\hline\n"

# Define the sorting key
def sorting_key(item):
    """Sort key for ``(model_name, model)`` pairs.

    Orders unshuffled models before shuffled ones and, within each group, by
    the model's ``negative`` attribute.
    """
    model_name, model = item
    # "unshuffled" contains "shuffled", so a bare substring test would mark
    # every model as shuffled.
    shuffled = "shuffled" in model_name and "unshuffled" not in model_name
    return (shuffled, model.negative)

# Sort the models
sorted_models = sorted(models.items(), key=sorting_key)

# Start the model rows on a fresh line so the first model name cannot fuse
# with a preceding "\hline".
if not latex_table.endswith("\n"):
    latex_table += "\n"

# NOTE: the loop rebinds the notebook-level `model`; after this cell it refers
# to the last model in `sorted_models`.
for model_name, model in sorted_models:
    # "unanon" / "unshuffled" contain "anon" / "shuffled", so both substrings
    # have to be checked to label a model correctly.
    # NOTE(review): the label is derived from the file name only -- confirm
    # the file names actually carry an "anon" marker.
    anonymized = "Anon" if ("anon" in model_name and "unanon" not in model_name) else "Unanon"
    shuffled = "Shuffled" if ("shuffled" in model_name and "unshuffled" not in model_name) else "Unshuffled"
    # Underscores are special characters in LaTeX text and must be escaped.
    latex_name = model_name.replace("_", r"\_")
    latex_table += f"{latex_name} & {model.negative} & {anonymized} & {shuffled} & "
    for eval_type in types:
        pak = get_pakvector(top400, type=eval_type, model=model)
        latex_table += f"{pak} & "
    latex_table = latex_table[:-2]  # remove the last "& "
    latex_table += "\\\\ \\hline\n"  # end the row and draw a rule

latex_table += r"\end{tabular}"

print(latex_table)

In [None]:
# Evaluation and retrievals: show the 45 vocabulary entries closest to `word`.

word = 'Anderson_Luís_de_Souza'
try:
    print(model.wv.most_similar(word, topn=45))
except KeyError:
    # Raised when `word` is not a key of the current model.
    print(f"The word {word} does not appear in current model")



In [None]:
# Similarity between `word` (set in the previous cell) and a few reference
# tokens: two player names and four event tokens.
pairs = [
    ('Diego_Armando_Maradona', word),
    ('long_pass', word),
    ('High_Pass', word),
    ('short_pass', word),
    ('Ronaldo_de_Assis_Moreira', word),
    ('Shot_Goal', word),
]
for w1, w2 in pairs:
    try:
        print(f'{w1}\t{w2}\t{model.wv.similarity(w1, w2)}')
    except KeyError:
        # Same handling as the retrieval cell: report the missing key and
        # carry on with the remaining pairs instead of aborting the cell.
        print(f"The pair ({w1}, {w2}) contains a word that does not appear in current model")

In [None]:
# Get the embeddings for four vocabulary tokens
word2_vector = model.wv['Kylian_Mbappé_Lottin']
word3_vector = model.wv['Low_Pass']
word1_vector = model.wv['Interception']
word4_vector = model.wv['(4,4)']  # looked up but not used in the sum below

# Perform vector operations: add three of the embeddings together
result_vector = word1_vector + word3_vector + word2_vector
# Get the words most similar to the result vector
most_similar_words = model.wv.most_similar(positive=[result_vector])

# Print the most similar words. The loop variable is deliberately not named
# `word`: that name holds the query token used by earlier cells and would be
# silently overwritten here.
for similar_word, similarity in most_similar_words:
    print(similar_word, similarity)

In [None]:
# Get vocab of model: mapping from each key to its index in the vectors.
w2v_vocabulary = model.wv.key_to_index
print(w2v_vocabulary)