In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lshashpy3 import LSHash

In [2]:
# Read the preprocessed FIFA 22 table: the first CSV row supplies the column
# names and the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    "./FIFA22_preprocessed.csv",
    header=0,
    index_col=0,
)
# Preview the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,ID,Name,Age,Nationality,Club,Value,Preferred Foot,International Reputation,Weak Foot,Skill Moves,...,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,StandingTackle,SlidingTackle,Best Position,DefensiveAwareness
0,212198,Bruno Fernandes,26,Portugal,Manchester United,107.5,Right,3.0,3.0,4.0,...,78.0,66.0,87.0,90.0,91.0,87.0,73.0,65.0,CAM,72.0
1,209658,L. Goretzka,26,Germany,FC Bayern München,93.0,Right,4.0,4.0,3.0,...,81.0,86.0,85.0,84.0,60.0,82.0,85.0,77.0,CM,74.0
2,176580,L. Suárez,34,Uruguay,Atlético de Madrid,44.5,Right,5.0,4.0,3.0,...,87.0,41.0,91.0,84.0,83.0,87.0,45.0,38.0,ST,42.0
3,192985,K. De Bruyne,30,Belgium,Manchester City,125.5,Right,4.0,5.0,4.0,...,76.0,66.0,88.0,94.0,83.0,89.0,65.0,53.0,CM,68.0
4,224334,M. Acuña,29,Argentina,Sevilla FC,37.0,Left,2.0,3.0,4.0,...,84.0,79.0,81.0,82.0,76.0,87.0,84.0,82.0,LB,80.0


In [3]:
# Columns that are dropped from every row before it is turned into the
# feature vector that gets hashed / queried.
exclude_columns = [
    'ID',
    'Name',
    'Nationality',
    'Club',
    'Value',
    'Preferred Foot',
    'International Reputation',
    'Work Rate',
    'Body Type',
    'Best Position',
]

In [4]:
# LSH hyper-parameters.
k = 6  # passed to LSHash as hash_size
L = 5  # passed to LSHash as num_hashtables

# Dimensionality of the hashed vectors = number of columns that survive the
# exclusion. Dropping by name (instead of subtracting the two list lengths)
# raises a KeyError when an excluded column is missing from df, rather than
# silently producing a wrong input_dim.
d = len(df.columns.drop(exclude_columns))

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" rebuilds the
# same index and therefore reproduces the same query results.
# NOTE(review): assumes lshashpy3 draws its random projection planes from
# NumPy's global RNG - confirm against the installed version.
LSH_SEED = 42
np.random.seed(LSH_SEED)

lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
    storage_config={ 'dict': None },
    matrices_filename='lsh_weights.npz',
    hashtable_filename='lsh_hash.npz',
    overwrite=True)


### First time running code:

In [5]:
# Index one mean-centred feature vector per player.
#
# The *positional* row number is stored as extra data, because query results
# are resolved with df.iloc, which is position-based. iterrows() yields index
# *labels*; storing those would point at the wrong row (or raise IndexError)
# whenever the labels are not exactly 0..n-1.
for position, (_, row) in enumerate(df.iterrows()):
    vector = row.drop(exclude_columns).values
    # Centre out-of-place so the array backing `row` is never written to.
    vector = vector - np.mean(vector)
    lsh.index(vector, extra_data=str(position))  # position is used for retrieval from df

In [6]:
# Persist the populated index.
# NOTE(review): presumably this writes to the hashtable_filename
# ('lsh_hash.npz') given when `lsh` was constructed - confirm.
lsh.save()

### Querying for recommendation system:

In [7]:
# Feature vector of the first player: the row at position 0 once the
# excluded columns have been removed.
query_vector = (
    df.drop(columns=exclude_columns)
      .iloc[0]
      .values
)
# Display the complete first row (all columns) for reference.
df.iloc[0].values

array([212198, 'Bruno Fernandes', 26, 'Portugal', 'Manchester United',
       107.5, 'Right', 3.0, 3.0, 4.0, 'High/ High', 'Unique', 179, 69,
       87.0, 83.0, 64.0, 91.0, 87.0, 83.0, 87.0, 87.0, 88.0, 87.0, 77.0,
       73.0, 80.0, 91.0, 79.0, 89.0, 73.0, 91.0, 70.0, 89.0, 78.0, 66.0,
       87.0, 90.0, 91.0, 87.0, 73.0, 65.0, 'CAM', 72.0], dtype=object)

In [8]:
num_results = 5

# Centre into a new array instead of subtracting in place: `query_vector`
# was created in another cell, so mutating it here would be hidden state,
# and an in-place write fails if the array is read-only.
centered_query = query_vector - np.mean(query_vector)

query_results = lsh.query(centered_query, num_results=num_results, distance_func="cosine")

# Each result has the shape ((stored_vector, extra_data), distance);
# extra_data is the row reference that was stored when the index was built.
for (_, extra_data), distance in query_results:
    position = int(extra_data)
    print(df.iloc[position].values)
    print("Cosine similarity:", 1 - distance)

[212198 'Bruno Fernandes' 26 'Portugal' 'Manchester United' 107.5 'Right'
 3.0 3.0 4.0 'High/ High' 'Unique' 179 69 87.0 83.0 64.0 91.0 87.0 83.0
 87.0 87.0 88.0 87.0 77.0 73.0 80.0 91.0 79.0 89.0 73.0 91.0 70.0 89.0
 78.0 66.0 87.0 90.0 91.0 87.0 73.0 65.0 'CAM' 72.0]
Cosine similarity: 1.0
[192985 'K. De Bruyne' 30 'Belgium' 'Manchester City' 125.5 'Right' 4.0
 5.0 4.0 'High/ High' 'Unique' 181 70 94.0 82.0 55.0 94.0 82.0 88.0 85.0
 83.0 93.0 91.0 76.0 76.0 79.0 91.0 78.0 91.0 63.0 89.0 74.0 91.0 76.0
 66.0 88.0 94.0 83.0 89.0 65.0 53.0 'CM' 68.0]
Cosine similarity: 0.9870598763520336
[214097 'B. Bourigeaud' 27 'France' 'Stade Rennais FC' 16.0 'Right' 2.0
 3.0 3.0 'High/ High' 'Lean (170-185)' 178 68 80.0 72.0 63.0 80.0 68.0
 77.0 79.0 81.0 78.0 78.0 71.0 70.0 72.0 77.0 74.0 77.0 72.0 84.0 68.0
 76.0 73.0 73.0 78.0 78.0 76.0 78.0 68.0 58.0 'CM' 63.0]
Cosine similarity: 0.9827024626895919
[204923 'M. Sabitzer' 27 'Austria' 'FC Bayern München' 48.0 'Right' 3.0
 4.0 3.0 'High/ High' 'No

In [9]:
def get_recommendation_by_name(name, depth = 20):
    """Print the closest and the cheapest look-alike for the player `name`.

    The player's row is looked up in `df`, reduced to its feature columns,
    mean-centred and used to query `lsh` with the cosine distance. Results
    are printed; nothing is returned.

    Parameters
    ----------
    name : str
        Exact value of the 'Name' column. If several rows share the name,
        the one with the highest 'Value' is used.
    depth : int, default 20
        Number of results requested from the LSH index. The queried player
        is normally among them and is discarded.
    """
    rows = df[df['Name'] == name]
    if rows.empty:
        print("No player found with the given name.")
        return

    if len(rows) > 1:
        # If multiple players have the same name, pick the one with the highest value
        query_row = rows.loc[rows['Value'].idxmax()]
    else:
        query_row = rows.iloc[0]

    print(f'Data of {name}:\n{query_row}')

    query_vector = query_row.drop(exclude_columns).values
    # Centre out-of-place so the array backing the row is never written to.
    query_vector = query_vector - np.mean(query_vector)

    query_results = lsh.query(query_vector, num_results=depth, distance_func="cosine")

    # Keep every hit except the queried player. Filtering on 'ID' (rather than
    # blindly dropping the first hit) stays correct when another player ties
    # at distance 0 and the queried player is therefore not first.
    # NOTE(review): assumes 'ID' identifies a player uniquely - confirm.
    nearest_players = []
    for ((_vec, extra_data), distance) in query_results:
        player = df.iloc[int(extra_data)]
        if player['ID'] == query_row['ID']:
            continue
        nearest_players.append((player, distance))

    if not nearest_players:
        # Nothing left to compare against (e.g. depth=1); min() would raise.
        print("No similar players found.")
        return

    # Results are used in the order returned by the query, so the first
    # remaining entry is treated as the closest match.
    closest_player = nearest_players[0]
    cheapest_player = min(nearest_players, key=lambda x: x[0]['Value'])

    print(f"Closest player:  ({closest_player[0]['ID']}) {closest_player[0]['Name']}", 
        "\tCosine similarity:", 1 - closest_player[1])

    print(f"Cheapest player: ({cheapest_player[0]['ID']}) {cheapest_player[0]['Name']}", 
        "\tValue: ", cheapest_player[0]['Value'], "mil",
        "\tCosine similarity:", 1 - cheapest_player[1])

In [10]:
# Example: print the closest and the cheapest look-alike for M. Acuña,
# using the function's default depth.
get_recommendation_by_name("M. Acuña")

Data of M. Acuña:
ID                                    224334
Name                                M. Acuña
Age                                       29
Nationality                        Argentina
Club                              Sevilla FC
Value                                   37.0
Preferred Foot                          Left
International Reputation                 2.0
Weak Foot                                3.0
Skill Moves                              4.0
Work Rate                         High/ High
Body Type                   Stocky (170-185)
Height                                   172
Weight                                    69
Crossing                                87.0
Finishing                               66.0
HeadingAccuracy                         58.0
ShortPassing                            82.0
Volleys                                 68.0
Dribbling                               87.0
Curve                                   88.0
FKAccuracy                           

In [11]:
def estimate_value_by_name(name, column, depth=20):
    """Print a similarity-weighted estimate of `column` for the player `name`.

    The player's row is looked up in `df`, reduced to its feature columns,
    mean-centred and used to query `lsh` with the cosine distance. The
    estimate is the average of `column` over the returned neighbours,
    weighted by cosine similarity (1 - distance). The queried player is left
    out, so the estimate is not dominated by the value being estimated.
    Results are printed; nothing is returned.

    Parameters
    ----------
    name : str
        Exact value of the 'Name' column. If several rows share the name,
        the one with the highest 'Value' is used.
    column : str
        Name of the column in `df` to estimate.
    depth : int, default 20
        Number of results requested from the LSH index (the queried player
        is normally one of them).

    Raises
    ------
    KeyError
        If `column` is not a column of `df`.
    """
    rows = df[df['Name'] == name]
    if rows.empty:
        print("No player found with the given name.")
        return

    if column not in df.columns:
        raise KeyError(column)

    if len(rows) > 1:
        # If multiple players have the same name, pick the one with the highest value
        query_row = rows.loc[rows['Value'].idxmax()]
    else:
        query_row = rows.iloc[0]

    print(f'Estimating {column} for {name}:\n{query_row}')

    query_vector = query_row.drop(exclude_columns).values
    # Centre out-of-place so the array backing the row is never written to.
    query_vector = query_vector - np.mean(query_vector)

    query_results = lsh.query(query_vector, num_results=depth, distance_func="cosine")

    total_sim = 0.0
    weighted_sum = 0.0
    for ((_vec, extra_data), distance) in query_results:
        player = df.iloc[int(extra_data)]
        # Skip the queried player: its own value (similarity 1.0) must not
        # take part in estimating that very value.
        # NOTE(review): assumes 'ID' identifies a player uniquely - confirm.
        if player['ID'] == query_row['ID']:
            continue
        similarity = 1 - distance
        weighted_sum += player[column] * similarity
        total_sim += similarity

    if total_sim == 0:
        # No neighbours (or weights cancelling out): avoid dividing by zero.
        print(f'Cannot estimate {column} for {name}: no similar players found.')
        return

    estimate = weighted_sum / total_sim
    print(f'Estimate of {column} for {name}: {estimate}')


In [12]:
# Example: estimate the ShotPower column for M. Acuña from similar players,
# using the function's default depth.
estimate_value_by_name("M. Acuña", "ShotPower")

Estimating ShotPower for M. Acuña:
ID                                    224334
Name                                M. Acuña
Age                                       29
Nationality                        Argentina
Club                              Sevilla FC
Value                                   37.0
Preferred Foot                          Left
International Reputation                 2.0
Weak Foot                                3.0
Skill Moves                              4.0
Work Rate                         High/ High
Body Type                   Stocky (170-185)
Height                                   172
Weight                                    69
Crossing                                87.0
Finishing                               66.0
HeadingAccuracy                         58.0
ShortPassing                            82.0
Volleys                                 68.0
Dribbling                               87.0
Curve                                   88.0
FKAccuracy          

### Create Your Player

In [33]:
# Huzaifa
# Raw stats of the custom player; keys are column names of the player table.
your_player = dict(
    Age=21,
    Height=207,
    Weight=80,
    Acceleration=73,
    SprintSpeed=73,
    Stamina=80,
    Strength=88,
    Vision=77,
)

In [34]:
# Mean-centre the custom player's stats; the mean is taken over the supplied
# stats only.
stat_names = list(your_player)
data = np.array([your_player[stat_name] for stat_name in stat_names], dtype='float64')
data = data - data.mean()

# Store the centred values back under their original keys.
your_player.update(zip(stat_names, data))

your_player

{'Age': -66.375,
 'Height': 119.625,
 'Weight': -7.375,
 'Acceleration': -14.375,
 'SprintSpeed': -14.375,
 'Stamina': -7.375,
 'Strength': 0.625,
 'Vision': -10.375}

In [35]:
# Lay the custom player's (centred) stats out in the column order used for
# the indexed vectors; every stat that was not supplied is filled with 0.
# Building the vector by column name replaces 34 hand-counted positions,
# which silently break when a column is added, removed or reordered.
feature_columns = df.columns.drop(exclude_columns)

unknown_stats = set(your_player) - set(feature_columns)
if unknown_stats:
    # A misspelt or excluded stat would otherwise be dropped without notice.
    raise KeyError(f"Stats not present among the feature columns: {sorted(unknown_stats)}")

query_vector = [your_player.get(column_name, 0) for column_name in feature_columns]

In [36]:
name = "Huzaifa"
column = "Height"

query_results = lsh.query(query_vector, num_results=10, distance_func="cosine")

# The query vector was assembled by hand rather than taken from a row of df,
# so there is no "itself" entry to discard: every result is a neighbour.
# Similarity-weighted average of `column` over the results, with
# similarity = 1 - cosine distance.
total_sim = 0.0
weighted_sum = 0.0
for ((_vec, extra_data), distance) in query_results:
    player = df.iloc[int(extra_data)]
    similarity = 1 - distance
    weighted_sum += player[column] * similarity
    total_sim += similarity

if total_sim == 0:
    # No results (or weights cancelling out): avoid dividing by zero.
    print(f'Cannot estimate {column} for {name}: no similar players found.')
else:
    estimate = weighted_sum / total_sim
    print(f'Estimate of {column} for {name}: {estimate}')

Estimate of Height for Huzaifa: 185.4990358203076
