In [84]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
# Select numeric columns and apply Yeo-Johnson transformation
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
file_path = 'megafile.csv'  # Replace with the path to your file
df = pd.read_csv(file_path)

# Display basic information about the dataset
print(df.info())

# Display the first few rows
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2847 entries, 0 to 2846
Columns: 114 entries, Player to Receiving PrgR
dtypes: float64(27), int64(81), object(6)
memory usage: 2.5+ MB
None
            Player Nation      Squad   Age  Born  Playing Time MP  \
0  Aaron Cresswell    ENG   West Ham  33.0  1989               11   
1     Aaron Hickey    SCO  Brentford  21.0  2002                9   
2    Aaron Malouda    FRA      Lille  17.0  2005                1   
3   Aaron Ramsdale    ENG    Arsenal  25.0  1998                6   
4     Aaron Ramsey    ENG    Burnley  20.0  2003               14   

   Playing Time Starts  Playing Time Min  Playing Time 90s  Performance Gls  \
0                    4               436               4.8                0   
1                    9               713               7.9                0   
2                    0                 1               0.0                0   
3                    6               540               6.0                0   


In [85]:
df = df.drop(columns = ['Playing Time MP','Playing Time Starts','Playing Time Min', '90s', 'Age', 'Born', 'Nation'])

divider_column = 'Playing Time 90s'
exclude_columns = [col for col in df.columns if '%' in col or '90' in col or df[col].dtype == 'object']
# Create a mask for rows where the divisor is zero
zero_divisor_mask = df[divider_column] == 0

# Perform the division for the remaining columns
for col in df.columns:
    if col not in exclude_columns and col != divider_column:
        # Set the values to 0 where the divisor is zero
        df.loc[zero_divisor_mask, col] = 0
        # Perform the division for the remaining values
        df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]

df = df[df['Playing Time 90s'] >= 10]
df = df.drop(columns = ['Playing Time 90s'])

  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask, divider_column]
  df.loc[~zero_divisor_mask, col] = df.loc[~zero_divisor_mask, col] / df.loc[~zero_divisor_mask,

In [86]:
corr_matrix = df.corr(numeric_only = True)

# Unstack the correlation matrix and reset index
corr_pairs = corr_matrix.unstack().reset_index()

# Rename columns for clarity
corr_pairs.columns = ['Variable_1', 'Variable_2', 'Correlation']

# Remove self-correlations and duplicate pairs
corr_pairs = corr_pairs[corr_pairs['Variable_1'] != corr_pairs['Variable_2']]
corr_pairs = corr_pairs.drop_duplicates(subset=['Correlation'])

# Sort by the absolute value of correlation
corr_pairs['Abs_Correlation'] = corr_pairs['Correlation'].abs()
corr_pairs = corr_pairs.sort_values(by='Abs_Correlation', ascending=False)

In [87]:
# Filter pairs with a correlation of 1 (excluding self-correlations)
perfect_corr_pairs = corr_pairs[(corr_pairs['Correlation'] >= 0.975) & (corr_pairs['Variable_1'] != corr_pairs['Variable_2'])]

# Identify variables to remove (you can choose based on your criteria)
variables_to_remove = set()
for _, row in perfect_corr_pairs.iterrows():
    var1, var2 = row['Variable_1'], row['Variable_2']
    if var1 not in variables_to_remove and var2 not in variables_to_remove:
        # Add one of the variables to the removal set
        variables_to_remove.add(var2)  # Here, we're arbitrarily choosing var2

# Remove the identified variables from the DataFrame
df = df.drop(columns=variables_to_remove)

In [88]:
# Fill null values in 'secondary_position' with values from 'primary_position'
df['Position_2'] = df['Position_2'].fillna(df['Position'])

# Handle missing values (if any)
df = df.dropna()

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns

# Apply PowerTransformer to numeric columns
pt = PowerTransformer(method='yeo-johnson')
df[numeric_cols] = pt.fit_transform(df[numeric_cols])

# Standardize the data
scaler = MinMaxScaler()
scaled_numeric_data = scaler.fit_transform(df[numeric_cols])

# Convert scaled numeric data back to DataFrame
scaled_numeric_df = pd.DataFrame(scaled_numeric_data, columns=numeric_cols)

print(scaled_numeric_df.head())

   Performance Gls  Performance Ast  Performance G+A  Performance PK  \
0         0.000000         0.456064         0.267864         0.00000   
1         0.000000         0.328692         0.184700         0.00000   
2         0.417642         0.000000         0.266754         0.00000   
3         0.000000         0.000000         0.000000         0.00000   
4         0.739299         0.785990         0.773768         0.99966   

   Performance PKatt  Performance CrdY  Performance CrdR  Expected xG  \
0           0.000000          0.403633          0.000000     0.031677   
1           0.000000          0.490185          0.999129     0.079656   
2           0.000000          0.402053          0.000000     0.390857   
3           0.000000          0.643249          0.000000     0.115899   
4           0.998207          0.363606          0.000000     0.792774   

   Expected xAG  Expected npxG+xAG  ...  Carries Carries  Carries TotDist  \
0      0.374234           0.239858  ...         0.4

In [102]:
from sklearn.neural_network import BernoulliRBM

# Parameters for RBM
n_components = 20  # Number of hidden units (dimensionality of latent space)
learning_rate = 0.01
n_iter = 500
batch_size = 256

rbm = BernoulliRBM(n_components=n_components, learning_rate=learning_rate, n_iter=n_iter, batch_size=batch_size, random_state=42)
rbm.fit(scaled_numeric_df)

# Transform data using RBM
rbm_latent_space = rbm.transform(scaled_numeric_df)

In [103]:
# Apply k-means clustering to the latent space
kmeans = KMeans(n_clusters=22, random_state=42)
clusters = kmeans.fit_predict(rbm_latent_space)

# Calculate silhouette score
silhouette_avg = silhouette_score(rbm_latent_space, clusters)
print(f'Silhouette Score: {silhouette_avg}')


Silhouette Score: 0.5438521962598489


In [105]:
def find_closest_players(df, latent_space, player_name, top_n=5):
    # Find the player's index and positions
    player_row = df[df['Player'] == player_name].iloc[0]
    player_index = player_row.name
    player_positions = [player_row['Position'], player_row['Position_2']]

    # Subset the dataframe and latent space for players who play in one of the named player's positions
    position_filter = df['Position'].isin(player_positions) | df['Position_2'].isin(player_positions)
    filtered_df = df[position_filter]
    filtered_latent_space = latent_space[position_filter]

    # Get the latent space of the input player
    player_latent = latent_space[player_index]

    # Calculate Euclidean distances from the input player within the filtered latent space
    distances = np.linalg.norm(filtered_latent_space - player_latent, axis=1)

    # Get the indices of the closest players in the filtered dataframe
    closest_indices = np.argsort(distances)[1:top_n+1]  # Exclude the player itself

    return filtered_df.iloc[closest_indices][['Player', 'Position', 'Position_2']]


# Combine scaled numeric data with non-numeric columns
df = pd.concat([scaled_numeric_df, df[non_numeric_cols].reset_index(drop=True)], axis=1)
# Example usage
player_name = input("Please enter the name of the player you want to find similar to (all accents included): ")
closest_matches = find_closest_players(df, rbm_latent_space, player_name)
print(closest_matches)



               Player    Position  Position_2
1430  Stijn Spierings  Midfielder  Midfielder
1215      Oriol Romeu  Midfielder  Midfielder
710      Jonas Martin  Midfielder  Midfielder
545      Granit Xhaka  Midfielder  Midfielder
1284  Ramy Bensebaini    Defender    Defender


In [92]:
weights = rbm.components_ 

# Sum absolute weights for each feature across all hidden units
feature_importance = np.sum(np.abs(weights), axis=0)

# Rank features by importance
sorted_features = np.argsort(feature_importance)[::-1]

# Display top N important features
top_n = 10
for i in range(top_n):
    print(f"Feature {sorted_features[i]}: {feature_importance[sorted_features[i]]}")


Feature 42: 4.1216906555716895
Feature 3: 3.6649656035198652
Feature 4: 3.629080902932135
Feature 41: 3.433494380238551
Feature 40: 3.3561638499856024
Feature 39: 3.2068862879399256
Feature 6: 3.006624008233659
Feature 1: 2.414337914797591
Feature 30: 2.3021878572568726
Feature 77: 2.271121455896836
