In [1]:
import pandas as pd
import numpy as np
import json
import logging
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sentence_transformers import SentenceTransformer
import re

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Loading Data

- getting a snippet of what some of the data looks like

In [2]:
apps = pd.read_csv("data/appearances.csv")
club_games = pd.read_csv("data/club_games.csv")
clubs = pd.read_csv("data/clubs.csv")
comps = pd.read_csv("data/competitions.csv")
game_events = pd.read_csv("data/game_events.csv")
players = pd.read_csv("data/players.csv")

game_lineups = pd.read_csv("data/game_lineups.csv")
players_eval = pd.read_csv("data/player_valuations.csv")
transf = pd.read_csv("data/transfers.csv")

In [3]:
apps.head()

Unnamed: 0,appearance_id,game_id,player_id,player_club_id,player_current_club_id,date,player_name,competition_id,yellow_cards,red_cards,goals,assists,minutes_played
0,2231978_38004,2231978,38004,853,235,2012-07-03,Aurélien Joachim,CLQ,0,0,2,0,90
1,2233748_79232,2233748,79232,8841,2698,2012-07-05,Ruslan Abyshov,ELQ,0,0,0,0,90
2,2234413_42792,2234413,42792,6251,465,2012-07-05,Sander Puri,ELQ,0,0,0,0,45
3,2234418_73333,2234418,73333,1274,6646,2012-07-05,Vegar Hedenstad,ELQ,0,0,0,0,90
4,2234421_122011,2234421,122011,195,3008,2012-07-05,Markus Henriksen,ELQ,0,0,0,1,90


In [4]:
club_games.head()

Unnamed: 0,game_id,club_id,own_goals,own_position,own_manager_name,opponent_id,opponent_goals,opponent_position,opponent_manager_name,hosting,is_win
0,2320450,1468.0,0.0,,Holger Bachthaler,24.0,2.0,,Armin Veh,Home,0
1,2320454,222.0,0.0,,Volkan Uluc,79.0,2.0,,Bruno Labbadia,Home,0
2,2320460,1.0,3.0,,Jürgen Luginger,86.0,1.0,,Robin Dutt,Home,1
3,2320472,2036.0,4.0,,Frank Schmidt,72.0,5.0,,Alexander Schmidt,Home,0
4,2321027,33.0,3.0,8.0,Jens Keller,41.0,3.0,9.0,Thorsten Fink,Home,0


In [5]:
clubs.head()

Unnamed: 0,club_id,club_code,name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,last_season,filename,url
0,105,sv-darmstadt-98,SV Darmstadt 98,L1,,27,25.6,13,48.1,1,Merck-Stadion am Böllenfalltor,17810,+€3.05m,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/sv-darmstadt-9...
1,11127,ural-ekaterinburg,Ural Yekaterinburg,RU1,,30,26.5,11,36.7,3,Yekaterinburg Arena,23000,+€880k,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/ural-ekaterinb...
2,114,besiktas-istanbul,Beşiktaş Jimnastik Kulübü,TR1,,30,26.6,15,50.0,8,Beşiktaş Park,42445,€-25.26m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/besiktas-istan...
3,12,as-rom,Associazione Sportiva Roma,IT1,,26,26.3,18,69.2,17,Olimpico di Roma,70634,€-76.90m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/as-rom/startse...
4,148,tottenham-hotspur,Tottenham Hotspur Football Club,GB1,,30,25.5,21,70.0,18,Tottenham Hotspur Stadium,62850,€-120.05m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/tottenham-hots...


In [6]:
comps.head()

Unnamed: 0,competition_id,competition_code,name,sub_type,type,country_id,country_name,domestic_league_code,confederation,url,is_major_national_league
0,CIT,italy-cup,italy-cup,domestic_cup,domestic_cup,75,Italy,IT1,europa,https://www.transfermarkt.co.uk/italy-cup/star...,False
1,NLSC,johan-cruijff-schaal,johan-cruijff-schaal,domestic_super_cup,other,122,Netherlands,NL1,europa,https://www.transfermarkt.co.uk/johan-cruijff-...,False
2,GRP,kypello-elladas,kypello-elladas,domestic_cup,domestic_cup,56,Greece,GR1,europa,https://www.transfermarkt.co.uk/kypello-ellada...,False
3,POSU,supertaca-candido-de-oliveira,supertaca-candido-de-oliveira,domestic_super_cup,other,136,Portugal,PO1,europa,https://www.transfermarkt.co.uk/supertaca-cand...,False
4,RUSS,russian-super-cup,russian-super-cup,domestic_super_cup,other,141,Russia,RU1,europa,https://www.transfermarkt.co.uk/russian-super-...,False


In [7]:
players.head()

Unnamed: 0,player_id,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur,highest_market_value_in_eur
0,10,Miroslav,Klose,Miroslav Klose,2015,398,miroslav-klose,Poland,Opole,Germany,...,right,184.0,,ASBW Sport Marketing,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,IT1,Società Sportiva Lazio S.p.A.,1000000.0,30000000.0
1,26,Roman,Weidenfeller,Roman Weidenfeller,2017,16,roman-weidenfeller,Germany,Diez,Germany,...,left,190.0,,Neubauer 13 GmbH,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/roman-weidenfe...,L1,Borussia Dortmund,750000.0,8000000.0
2,65,Dimitar,Berbatov,Dimitar Berbatov,2015,1091,dimitar-berbatov,Bulgaria,Blagoevgrad,Bulgaria,...,,,,CSKA-AS-23 Ltd.,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/dimitar-berbat...,GR1,Panthessalonikios Athlitikos Omilos Konstantin...,1000000.0,34500000.0
3,77,,Lúcio,Lúcio,2012,506,lucio,Brazil,Brasília,Brazil,...,,,,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/lucio/profil/s...,IT1,Juventus Football Club,200000.0,24500000.0
4,80,Tom,Starke,Tom Starke,2017,27,tom-starke,East Germany (GDR),Freital,Germany,...,right,194.0,,IFM,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tom-starke/pro...,L1,FC Bayern München,100000.0,3000000.0


In [8]:
players_eval.head()

Unnamed: 0,player_id,date,market_value_in_eur,current_club_id,player_club_domestic_competition_id
0,405973,2000-01-20,150000,3057,BE1
1,342216,2001-07-20,100000,1241,SC1
2,3132,2003-12-09,400000,126,TR1
3,6893,2003-12-15,900000,984,GB1
4,10,2004-10-04,7000000,398,IT1


In [9]:
transf.head()

Unnamed: 0,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name
0,16136,2026-07-01,26/27,417,123,OGC Nice,Retired,,500000.0,Dante
1,1138758,2026-07-01,26/27,336,631,Sporting CP,Chelsea,52140000.0,45000000.0,Geovany Quenda
2,195778,2026-06-30,25/26,79,27,VfB Stuttgart,Bayern Munich,0.0,12000000.0,Alexander Nübel
3,569033,2026-06-30,25/26,39,27,1.FSV Mainz 05,Bayern Munich,0.0,4000000.0,Armindo Sieb
4,626913,2026-06-30,25/26,398,380,Lazio,Salernitana,0.0,15000000.0,Boulaye Dia


# Data Preprocessing

In [10]:
def clean_market_value(value):
    """Clean and standardize market value data"""
    if pd.isna(value) or value == '' or str(value).lower() == 'nan':
        return 0
    
    try:
        value_str = str(value).strip()
        value_str = re.sub(r'[€$£¥]', '', value_str)
        value_str = re.sub(r'[+\-±]', '', value_str)
        value_str = value_str.replace(',', '')
        
        if 'm' in value_str.lower():
            value_str = value_str.lower().replace('m', '')
            multiplier = 1_000_000
        elif 'k' in value_str.lower():
            value_str = value_str.lower().replace('k', '')
            multiplier = 1_000
        else:
            multiplier = 1
        
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', value_str)
        if numeric_match:
            numeric_value = float(numeric_match.group(1))
            final_value = numeric_value * multiplier
            if 0 <= final_value <= 500_000_000:
                return int(final_value)
        return 0
    except (ValueError, TypeError):
        return 0

def clean_club_name(name):
    if pd.isna(name) or name == '' or str(name).lower() == 'nan':
        return ''
    
    name_str = str(name).strip()
    suffixes_to_clean = [
        r'\s+\(.*?\)$', r'\s+FC$', r'\s+F\.C\.$', r'\s+CF$', r'\s+C\.F\.$', r'\s+SC$', r'\s+S\.C\.$',
    ]
    for suffix in suffixes_to_clean:
        name_str = re.sub(suffix, '', name_str, flags=re.IGNORECASE)
    name_str = ' '.join(name_str.split())
    return name_str

def clean_players_data(df):
    if 'market_value_in_eur' in df.columns:
        df['market_value_in_eur'] = df['market_value_in_eur'].apply(clean_market_value)
    if 'highest_market_value_in_eur' in df.columns:
        df['highest_market_value_in_eur'] = df['highest_market_value_in_eur'].apply(clean_market_value)
    if 'current_club_name' in df.columns:
        df['current_club_name'] = df['current_club_name'].apply(clean_club_name)
    for name_col in ['name', 'first_name', 'last_name']:
        if name_col in df.columns:
            df[name_col] = df[name_col].fillna('').astype(str).str.strip()
    text_fields = ['position', 'sub_position', 'foot', 'country_of_citizenship', 'city_of_birth', 'country_of_birth', 'agent_name']
    for field in text_fields:
        if field in df.columns:
            df[field] = df[field].fillna('').astype(str).str.strip()
    return df

def clean_clubs_data(df):
    if 'name' in df.columns:
        df['name'] = df['name'].apply(clean_club_name)
    if 'total_market_value' in df.columns:
        df['total_market_value'] = df['total_market_value'].apply(clean_market_value)
    if 'net_transfer_record' in df.columns:
        df['net_transfer_record'] = df['net_transfer_record'].apply(clean_market_value)
    return df

def calculate_age(birth_date):
    if pd.isna(birth_date) or birth_date == '' or str(birth_date).lower() == 'nan':
        return 0
    try:
        birth = pd.to_datetime(birth_date)
        today = pd.Timestamp.now()
        age = int((today - birth).days / 365.25)
        if 16 <= age <= 45:
            return age
        else:
            return 0
    except:
        return 0

def clean_birth_place(player_row):
    city = str(player_row.get('city_of_birth', '')).strip()
    country = str(player_row.get('country_of_birth', '')).strip()
    if city and country and city.lower() != 'nan' and country.lower() != 'nan':
        return f"{city}, {country}"
    elif country and country.lower() != 'nan':
        return country
    elif city and city.lower() != 'nan':
        return city
    else:
        return ""

## Data Preprocessing Examples using the functions above

In [11]:
print(clean_market_value("€3.5m"))      
print(clean_market_value("$700k"))     
print(clean_market_value("2,000,000"))  
print(clean_market_value("N/A"))        
print(clean_market_value(None))         
print(clean_market_value("3.2M+"))      

3500000
700000
2000000
0
0
3200000


In [12]:
print(clean_club_name("Arsenal FC"))           
print(clean_club_name("Real Madrid C.F."))     
print(clean_club_name("Manchester United (Women)")) 

Arsenal
Real Madrid
Manchester United


In [13]:
df_ex = pd.DataFrame({
    'market_value_in_eur': ['€3.5m', '1.2k', None],
    'current_club_name': ['Arsenal FC', 'Real Madrid C.F.', None],
    'name': [' Messi ', None, 'Mbappe'],
    'position': ['Forward ', None, 'Midfielder']
})
cleaned_df = clean_players_data(df_ex)
cleaned_df


Unnamed: 0,market_value_in_eur,current_club_name,name,position
0,3500000,Arsenal,Messi,Forward
1,1200,Real Madrid,,
2,0,,Mbappe,Midfielder


In [14]:
df_ex_2 = pd.DataFrame({
    'name': ['Liverpool FC', 'Chelsea (Women)'],
    'total_market_value': ['€890m', '€55.2m'],
    'net_transfer_record': ['-€120m', '€3.1m']
})
cleaned = clean_clubs_data(df_ex_2)
cleaned

Unnamed: 0,name,total_market_value,net_transfer_record
0,Liverpool,0,120000000
1,Chelsea,55200000,3100000


In [15]:
def get_club_context(club_id, dataframes):
    """Get enhanced club context"""
    if not club_id or 'clubs' not in dataframes:
        return {}
    club_info = dataframes['clubs'][dataframes['clubs']['club_id'] == club_id]
    if club_info.empty:
        return {}
    club_row = club_info.iloc[0]
    return {
        'club_name': clean_club_name(club_row.get('name', '')),
        'league': str(club_row.get('domestic_competition_id', '')),
        'club_market_value': clean_market_value(club_row.get('total_market_value', 0)),
        'squad_size': int(club_row.get('squad_size', 0)) if pd.notna(club_row.get('squad_size')) else 0,
        'stadium': str(club_row.get('stadium_name', '')).strip(),
        'coach': str(club_row.get('coach_name', '')).strip()
    }

def analyze_competition_experience(player_apps, competition_lookup):
    """Analyze player's competition experience with improved competition mapping"""
    comp_stats = []
    for comp_id, comp_data in player_apps.groupby('competition_id'):
        comp_info = competition_lookup.get(comp_id, {})
        comp_name = comp_info.get('name', str(comp_id))
        comp_type = comp_info.get('type', 'unknown')
        comp_country = comp_info.get('country', '')
        is_major = comp_info.get('is_major', False)
        comp_stat = {
            'competition_id': str(comp_id),
            'competition_name': str(comp_name),
            'competition_type': comp_type,
            'country': comp_country,
            'is_major_league': is_major,
            'appearances': len(comp_data),
            'goals': int(comp_data['goals'].sum()),
            'assists': int(comp_data['assists'].sum()),
            'minutes': int(comp_data['minutes_played'].sum()),
            'goals_per_game': round(comp_data['goals'].sum() / len(comp_data), 3),
            'assists_per_game': round(comp_data['assists'].sum() / len(comp_data), 3)
        }
        comp_stats.append(comp_stat)
    comp_stats.sort(key=lambda x: (x['is_major_league'], x['appearances'], x['goals']), reverse=True)
    return comp_stats

def extract_league_experience(competition_breakdown):
    """Extract league experience for RAG filtering with improved mapping"""
    league_experience = {
        'top_5_leagues': [], 'major_leagues': [], 'european_competitions': [], 'total_leagues': 0
    }
    for comp in competition_breakdown:
        comp_id = comp['competition_id']
        comp_name = comp['competition_name'].lower()
        comp_type = comp.get('competition_type', '')
        top_5_ids = ['GB1', 'ES1', 'L1', 'IT1', 'FR1']
        if comp_id in top_5_ids:
            league_name = {
                'GB1': 'premier league', 'ES1': 'la liga', 'L1': 'bundesliga',
                'IT1': 'serie a', 'FR1': 'ligue 1'
            }.get(comp_id, comp_name)
            league_experience['top_5_leagues'].append({'league': league_name, 'appearances': comp['appearances'], 'goals': comp['goals'], 'assists': comp['assists']})
        elif comp_type in ['uefa_champions_league', 'europa_league', 'uefa_europa_conference_league'] or comp_id in ['CL', 'EL', 'UCOL']:
            comp_name_clean = {
                'CL': 'champions league', 'EL': 'europa league', 'UCOL': 'conference league'
            }.get(comp_id, comp_name)
            league_experience['european_competitions'].append({'competition': comp_name_clean, 'appearances': comp['appearances'], 'goals': comp['goals']})
        elif comp_type == 'domestic_league' and comp['appearances'] >= 5:
            league_experience['major_leagues'].append({'league': comp_name, 'appearances': comp['appearances'], 'goals': comp['goals']})
    league_experience['total_leagues'] = len(set([item['league'] for item in league_experience['top_5_leagues'] + league_experience['major_leagues']]))
    return league_experience

def analyze_recent_form(player_apps):
    """Analyze recent form with date sorting"""
    if 'date' in player_apps.columns:
        recent_apps = player_apps.sort_values('date').tail(10)
    else:
        recent_apps = player_apps.tail(10)
    if len(recent_apps) == 0:
        return {}
    return {
        'appearances': len(recent_apps), 'goals': int(recent_apps['goals'].sum()), 'assists': int(recent_apps['assists'].sum()),
        'minutes': int(recent_apps['minutes_played'].sum()), 'goals_per_game': round(recent_apps['goals'].sum() / len(recent_apps), 3),
        'assists_per_game': round(recent_apps['assists'].sum() / len(recent_apps), 3), 'avg_minutes': round(recent_apps['minutes_played'].mean(), 1)
    }

def analyze_performance_trends(player_apps):
    """Analyze performance trends over time"""
    if len(player_apps) < 10:
        return {'trend': 'insufficient_data'}
    if 'date' in player_apps.columns:
        apps_sorted = player_apps.sort_values('date')
    else:
        apps_sorted = player_apps
    mid_point = len(apps_sorted) // 2
    first_half = apps_sorted.iloc[:mid_point]
    second_half = apps_sorted.iloc[mid_point:]
    first_goals_per_game = first_half['goals'].sum() / len(first_half) if len(first_half) > 0 else 0
    second_goals_per_game = second_half['goals'].sum() / len(second_half) if len(second_half) > 0 else 0
    if second_goals_per_game > first_goals_per_game * 1.2:
        trend = 'improving'
    elif second_goals_per_game < first_goals_per_game * 0.8:
        trend = 'declining'
    else:
        trend = 'stable'
    return {
        'trend': trend, 'early_career_goals_per_game': round(first_goals_per_game, 3),
        'recent_career_goals_per_game': round(second_goals_per_game, 3),
        'improvement_factor': round(second_goals_per_game / max(first_goals_per_game, 0.001), 2)
    }

def calculate_player_career_stats(player_id, dataframes, competition_lookup):
    """Calculate comprehensive career statistics for a player"""
    if 'appearances' not in dataframes:
        return {}
    player_apps = dataframes['appearances'][dataframes['appearances']['player_id'] == player_id]
    if player_apps.empty:
        return {}
    current_year = datetime.now().year
    if 'date' in player_apps.columns:
        player_apps = player_apps[pd.to_datetime(player_apps['date'], errors='coerce').dt.year >= (current_year - 5)]
    total_appearances = len(player_apps)
    total_goals = player_apps['goals'].sum()
    total_assists = player_apps['assists'].sum()
    total_minutes = player_apps['minutes_played'].sum()
    total_yellow_cards = player_apps['yellow_cards'].sum()
    total_red_cards = player_apps['red_cards'].sum()
    career_stats = {
        'total_appearances': int(total_appearances), 'total_goals': int(total_goals), 'total_assists': int(total_assists),
        'total_minutes': int(total_minutes), 'total_yellow_cards': int(total_yellow_cards), 'total_red_cards': int(total_red_cards),
        'goals_per_appearance': 0.0, 'assists_per_appearance': 0.0, 'minutes_per_appearance': 0.0,
        'goal_contributions_per_90': 0.0, 'discipline_score': 0.0
    }
    if total_appearances > 0:
        career_stats['goals_per_appearance'] = round(total_goals / total_appearances, 4)
        career_stats['assists_per_appearance'] = round(total_assists / total_appearances, 4)
        career_stats['minutes_per_appearance'] = round(total_minutes / total_appearances, 1)
        career_stats['discipline_score'] = round((total_yellow_cards + (total_red_cards * 2)) / total_appearances, 3)
    if total_minutes > 0:
        goal_contributions = total_goals + total_assists
        career_stats['goal_contributions_per_90'] = round((goal_contributions * 90) / total_minutes, 3)
    career_stats['competition_breakdown'] = analyze_competition_experience(player_apps, competition_lookup)
    career_stats['league_experience'] = extract_league_experience(career_stats['competition_breakdown'])
    career_stats['recent_form'] = analyze_recent_form(player_apps)
    career_stats['performance_trends'] = analyze_performance_trends(player_apps)
    return career_stats

def calculate_transfer_history(player_id, dataframes):
    """Calculate transfer history and patterns"""
    if 'transfers' not in dataframes:
        return {}
    player_transfers = dataframes['transfers'][dataframes['transfers']['player_id'] == player_id].sort_values('transfer_date')
    if player_transfers.empty:
        return {}
    current_year = datetime.now().year
    player_transfers = player_transfers[pd.to_datetime(player_transfers['transfer_date'], errors='coerce').dt.year >= (current_year - 10)]
    transfer_fees = player_transfers['transfer_fee'].apply(clean_market_value)
    market_values = player_transfers['market_value_in_eur'].apply(clean_market_value)
    transfer_info = {
        'total_transfers': len(player_transfers), 'total_transfer_fees': float(transfer_fees.sum()),
        'average_transfer_fee': float(transfer_fees.mean()) if len(transfer_fees) > 0 else 0,
        'highest_transfer_fee': float(transfer_fees.max()) if len(transfer_fees) > 0 else 0,
        'transfer_frequency': 0.0, 'career_trajectory': 'unknown', 'recent_transfers': []
    }
    if len(player_transfers) > 1:
        try:
            first_transfer = pd.to_datetime(player_transfers['transfer_date'].iloc[0])
            last_transfer = pd.to_datetime(player_transfers['transfer_date'].iloc[-1])
            years_span = (last_transfer - first_transfer).days / 365.25
            transfer_info['transfer_frequency'] = round(years_span / (len(player_transfers) - 1), 2) if years_span > 0 else 0
        except:
            transfer_info['transfer_frequency'] = 0
    valid_market_values = market_values[market_values > 0]
    if len(valid_market_values) > 1:
        if valid_market_values.iloc[-1] > valid_market_values.iloc[0] * 1.3:
            transfer_info['career_trajectory'] = 'upward'
        elif valid_market_values.iloc[-1] < valid_market_values.iloc[0] * 0.7:
            transfer_info['career_trajectory'] = 'downward'
        else:
            transfer_info['career_trajectory'] = 'stable'
    recent = player_transfers.tail(3)
    for _, transfer in recent.iterrows():
        transfer_info['recent_transfers'].append({
            'date': str(transfer['transfer_date']), 'from_club': clean_club_name(transfer.get('from_club_name', '')),
            'to_club': clean_club_name(transfer.get('to_club_name', '')),
            'fee': float(transfer.get('transfer_fee', 0)) if pd.notna(transfer.get('transfer_fee')) else 0,
            'market_value': float(transfer.get('market_value_in_eur', 0)) if pd.notna(transfer.get('market_value_in_eur')) else 0
        })
    return transfer_info

def calculate_market_value_trends(player_id, dataframes):
    """Calculate market value trends and patterns"""
    if 'player_valuations' not in dataframes:
        return {}
    valuations = dataframes['player_valuations'][dataframes['player_valuations']['player_id'] == player_id].sort_values('date')
    if valuations.empty:
        return {}
    current_year = datetime.now().year
    valuations = valuations[pd.to_datetime(valuations['date'], errors='coerce').dt.year >= (current_year - 5)]
    clean_values = valuations['market_value_in_eur'].apply(clean_market_value)
    clean_values = clean_values[clean_values > 0]
    if len(clean_values) == 0:
        return {}
    market_info = {
        'current_market_value': float(clean_values.iloc[-1]), 'peak_market_value': float(clean_values.max()),
        'lowest_market_value': float(clean_values.min()), 'value_trend': 'stable', 'value_volatility': 0.0,
        'recent_value_change': 0.0, 'value_growth_rate': 0.0
    }
    if len(clean_values) > 1:
        recent_values = clean_values.tail(min(3, len(clean_values)))
        early_values = clean_values.head(min(3, len(clean_values)))
        if len(recent_values) > 0 and len(early_values) > 0:
            recent_avg = recent_values.mean()
            early_avg = early_values.mean()
            trend_change = (recent_avg - early_avg) / early_avg
            if trend_change > 0.15:
                market_info['value_trend'] = 'rising'
            elif trend_change < -0.15:
                market_info['value_trend'] = 'declining'
            else:
                market_info['value_trend'] = 'stable'
            market_info['value_growth_rate'] = round(trend_change, 3)
        market_info['value_volatility'] = round(clean_values.std() / clean_values.mean(), 3)
        if len(clean_values) >= 2:
            market_info['recent_value_change'] = round((clean_values.iloc[-1] - clean_values.iloc[-2]) / clean_values.iloc[-2], 3)
    return market_info

def get_playing_style_indicators(player_id, dataframes, competition_lookup):
    """Derive playing style indicators from performance data"""
    career_stats = calculate_player_career_stats(player_id, dataframes, competition_lookup)
    if not career_stats:
        return {}
    style_indicators = {
        'attacking_threat': 0.0, 'creativity': 0.0, 'consistency': 0.0, 'experience_level': 'unknown',
        'goal_scoring_ability': 'unknown', 'discipline': 'unknown', 'versatility': 0,
        'big_game_experience': False, 'international_experience': False
    }
    style_indicators['attacking_threat'] = career_stats.get('goal_contributions_per_90', 0)
    total_goals = career_stats.get('total_goals', 0)
    total_assists = career_stats.get('total_assists', 0)
    if total_goals + total_assists > 0:
        style_indicators['creativity'] = round(total_assists / (total_goals + total_assists), 3)
    minutes_per_app = career_stats.get('minutes_per_appearance', 0)
    style_indicators['consistency'] = round(min(minutes_per_app / 90, 1.0), 3)
    appearances = career_stats.get('total_appearances', 0)
    if appearances > 250: style_indicators['experience_level'] = 'veteran'
    elif appearances > 100: style_indicators['experience_level'] = 'experienced'
    elif appearances > 50: style_indicators['experience_level'] = 'developing'
    elif appearances > 15: style_indicators['experience_level'] = 'emerging'
    else: style_indicators['experience_level'] = 'young'
    goals_per_game = career_stats.get('goals_per_appearance', 0)
    if goals_per_game > 0.8: style_indicators['goal_scoring_ability'] = 'world_class'
    elif goals_per_game > 0.6: style_indicators['goal_scoring_ability'] = 'prolific'
    elif goals_per_game > 0.4: style_indicators['goal_scoring_ability'] = 'regular'
    elif goals_per_game > 0.2: style_indicators['goal_scoring_ability'] = 'occasional'
    elif goals_per_game > 0.05: style_indicators['goal_scoring_ability'] = 'rare'
    else: style_indicators['goal_scoring_ability'] = 'non_scorer'
    discipline_score = career_stats.get('discipline_score', 0)
    if discipline_score > 0.4: style_indicators['discipline'] = 'poor'
    elif discipline_score > 0.25: style_indicators['discipline'] = 'questionable'
    elif discipline_score > 0.15: style_indicators['discipline'] = 'average'
    elif discipline_score > 0.05: style_indicators['discipline'] = 'good'
    else: style_indicators['discipline'] = 'excellent'
    comp_breakdown = career_stats.get('competition_breakdown', [])
    style_indicators['versatility'] = len(comp_breakdown)
    league_exp = career_stats.get('league_experience', {})
    european_comps = league_exp.get('european_competitions', [])
    top_5_leagues = league_exp.get('top_5_leagues', [])
    style_indicators['big_game_experience'] = len(european_comps) > 0 or len(top_5_leagues) > 0
    style_indicators['international_experience'] = len(top_5_leagues) > 0
    return style_indicators

def create_scouting_text(profile):
    """Create rich text description for embedding"""
    text_parts = []
    name = profile.get('name', 'Unknown Player')
    age = profile.get('age', 0)
    nationality = profile.get('nationality', 'Unknown')
    position = profile.get('position', 'Unknown')
    sub_position = profile.get('sub_position', '')
    position_desc = f"{position}"
    if sub_position and sub_position != position and sub_position.lower() != 'nan':
        position_desc += f" ({sub_position})"
    text_parts.append(f"{name} is a {age}-year-old {position_desc} from {nationality}")
    club = profile.get('current_club_name', '')
    if club and club.lower() != 'nan':
        text_parts.append(f"currently playing for {club}")
    height = profile.get('height_cm', 0)
    foot = profile.get('preferred_foot', '')
    physical_desc = []
    if height > 0:
        if height > 190: physical_desc.append(f"tall {height}cm")
        elif height < 170: physical_desc.append(f"compact {height}cm")
        else: physical_desc.append(f"{height}cm")
    if foot and foot.lower() not in ['nan', '']: physical_desc.append(f"{foot}-footed")
    if physical_desc: text_parts.append(f"Physical: {', '.join(physical_desc)}")
    career_stats = profile.get('career_stats', {})
    if career_stats:
        appearances = career_stats.get('total_appearances', 0)
        goals = career_stats.get('total_goals', 0)
        assists = career_stats.get('total_assists', 0)
        if appearances > 0:
            text_parts.append(f"Career: {appearances} apps, {goals} goals, {assists} assists")
            goals_per_game = career_stats.get('goals_per_appearance', 0)
            assists_per_game = career_stats.get('assists_per_appearance', 0)
            goal_contrib_per_90 = career_stats.get('goal_contributions_per_90', 0)
            performance_desc = f"{goals_per_game:.2f} goals/game, {assists_per_game:.2f} assists/game"
            if goal_contrib_per_90 > 0: performance_desc += f", {goal_contrib_per_90:.2f} contributions/90min"
            text_parts.append(f"Performance: {performance_desc}")
    playing_style = profile.get('playing_style', {})
    if playing_style:
        experience = playing_style.get('experience_level', '')
        goal_ability = playing_style.get('goal_scoring_ability', '')
        discipline = playing_style.get('discipline', '')
        big_game_exp = playing_style.get('big_game_experience', False)
        style_desc = []
        if experience and experience != 'unknown': style_desc.append(f"{experience} player")
        if goal_ability and goal_ability != 'unknown': style_desc.append(f"{goal_ability} goalscorer")
        if discipline and discipline != 'unknown': style_desc.append(f"{discipline} discipline")
        if big_game_exp: style_desc.append("big game experience")
        if style_desc: text_parts.append(f"Profile: {', '.join(style_desc)}")
    league_exp = career_stats.get('league_experience', {})
    if league_exp:
        top_5_leagues = league_exp.get('top_5_leagues', [])
        european_comps = league_exp.get('european_competitions', [])
        experience_desc = []
        if top_5_leagues:
            leagues = [league['league'] for league in top_5_leagues]
            experience_desc.append(f"Top 5 leagues: {', '.join(leagues)}")
        if european_comps:
            comps = [comp['competition'] for comp in european_comps]
            experience_desc.append(f"European: {', '.join(comps)}")
        if experience_desc: text_parts.append(f"Experience: {'; '.join(experience_desc)}")
    current_value = profile.get('current_market_value', 0)
    peak_value = profile.get('highest_market_value', 0)
    if current_value > 0:
        if current_value >= 100_000_000: text_parts.append(f"Elite market value: €{current_value:,}")
        elif current_value >= 50_000_000: text_parts.append(f"High market value: €{current_value:,}")
        elif current_value >= 10_000_000: text_parts.append(f"Significant market value: €{current_value:,}")
        else: text_parts.append(f"Market value: €{current_value:,}")
    if peak_value > current_value * 1.5 and peak_value > 0: text_parts.append(f"Peak value: €{peak_value:,}")
    market_trends = profile.get('market_value_trends', {})
    if market_trends:
        trend = market_trends.get('value_trend', '')
        if trend == 'rising': text_parts.append("Value trending upward")
        elif trend == 'declining': text_parts.append("Value declining")
    contract_end = profile.get('contract_expiration', '')
    if contract_end and contract_end != '' and 'nan' not in str(contract_end).lower():
        try:
            contract_date = pd.to_datetime(contract_end)
            current_date = pd.Timestamp.now()
            months_remaining = (contract_date - current_date).days / 30
            if months_remaining < 6: text_parts.append("Contract expiring soon")
            elif months_remaining < 18: text_parts.append("Contract expires within 18 months")
        except: pass
    agent = profile.get('agent_name', '')
    if agent and agent != '' and 'nan' not in str(agent).lower(): text_parts.append(f"Agent: {agent}")
    return '. '.join(text_parts) + '.'

In [16]:
DATA_DIR = Path("data")  # Adjust this path as needed for your data files
OUTPUT_DIR = Path("embeddings") # Adjust this path as needed for your output
MODEL_NAME = "all-MiniLM-L6-v2"
LIMIT_PLAYERS = None # Set to an integer like 1000 for testing, or None for all players

OUTPUT_DIR.mkdir(exist_ok=True)

dataframes = {}
csv_files = {
    'players': 'players.csv',
    'appearances': 'appearances.csv', 
    'clubs': 'clubs.csv',
    'competitions': 'competitions.csv',
    'games': 'games.csv',
    'transfers': 'transfers.csv',
    'player_valuations': 'player_valuations.csv',
    'game_events': 'game_events.csv',
    'game_lineups': 'game_lineups.csv',
    'club_games': 'club_games.csv'
}

for name, filename in csv_files.items():
    file_path = DATA_DIR / filename
    if file_path.exists():
        try:
            df = pd.read_csv(file_path)
            if name == 'players':
                df = clean_players_data(df)
            elif name == 'clubs':
                df = clean_clubs_data(df)
            dataframes[name] = df
            logger.info(f"Loaded {len(df)} records from {filename}")
        except Exception as e:
            logger.error(f"Failed loading {filename}: {e}")
logger.info(f"Loaded {len(dataframes)} datasets")

# looking up competitoins
competition_lookup = {}
if 'competitions' in dataframes:
    for _, row in dataframes['competitions'].iterrows():
        comp_id = row.get('competition_id', '')
        comp_name = row.get('name', '')
        comp_code = row.get('competition_code', '')
        if comp_id:
            competition_lookup[comp_id] = {
                'name': comp_name,
                'code': comp_code,
                'type': row.get('type', ''),
                'country': row.get('country_name', ''),
                'is_major': row.get('is_major_national_league', False)
            }
    logger.info(f"Created lookup for {len(competition_lookup)} competitions")

# filtering players
if 'players' not in dataframes:
    logger.error("Players dataset not loaded! Cannot filter current players.")
else:
    original_player_count = len(dataframes['players'])
    current_players_df = dataframes['players'][dataframes['players']['last_season'] == 2024].copy()
    current_players_df = current_players_df[
        (current_players_df['name'].notna()) & 
        (current_players_df['name'] != '') &
        (current_players_df['position'].notna()) &
        (current_players_df['position'] != '')
    ].copy()
    if 'date_of_birth' in current_players_df.columns:
        current_players_df = current_players_df[
            (current_players_df['date_of_birth'].isna()) |  # Keep if no DOB data
            (pd.to_datetime(current_players_df['date_of_birth'], errors='coerce').notna())  # Valid dates only
        ].copy()
    if 'market_value_in_eur' in current_players_df.columns:
        current_players_df = current_players_df[
            (current_players_df['market_value_in_eur'].isna()) |  # Keep if no market value data
            (current_players_df['market_value_in_eur'] >= 25000)  # Minimum €25k market value
        ].copy()
    dataframes['players'] = current_players_df # Update the players dataframe to only contain current players
    logger.info(f"Filtered players: {original_player_count:,} → {len(current_players_df):,} currently active players")
    logger.info(f"Removed {original_player_count - len(current_players_df):,} retired/inactive players")

2025-06-26 13:52:46,668 - INFO - Loaded 32601 records from players.csv
2025-06-26 13:52:47,853 - INFO - Loaded 1706806 records from appearances.csv
2025-06-26 13:52:47,860 - INFO - Loaded 439 records from clubs.csv
2025-06-26 13:52:47,863 - INFO - Loaded 44 records from competitions.csv
2025-06-26 13:52:48,050 - INFO - Loaded 74026 records from games.csv
2025-06-26 13:52:48,100 - INFO - Loaded 79646 records from transfers.csv
2025-06-26 13:52:48,192 - INFO - Loaded 496606 records from player_valuations.csv
2025-06-26 13:52:48,890 - INFO - Loaded 1035043 records from game_events.csv
2025-06-26 13:52:50,864 - INFO - Loaded 2191911 records from game_lineups.csv
2025-06-26 13:52:50,936 - INFO - Loaded 148052 records from club_games.csv
2025-06-26 13:52:50,937 - INFO - Loaded 10 datasets
2025-06-26 13:52:50,938 - INFO - Created lookup for 44 competitions
2025-06-26 13:52:50,960 - INFO - Filtered players: 32,601 → 6,296 currently active players
2025-06-26 13:52:50,960 - INFO - Removed 26,305

In [None]:
logger.info(f"Creating comprehensive profiles for {len(dataframes['players']):,} players...")

processed_players = []
error_count = 0

players_to_process = dataframes['players'].head(LIMIT_PLAYERS) if LIMIT_PLAYERS else dataframes['players']

for idx, (_, player_row) in enumerate(players_to_process.iterrows()):
    if idx % 1000 == 0:
        logger.info(f"Progress: {idx:,}/{len(players_to_process):,} players processed ({len(processed_players):,} successful)")
    
    try:
        player_id = player_row['player_id']
        
        player_info = dataframes['players'][dataframes['players']['player_id'] == player_id].iloc[0]
        
        profile = {
            'player_id': int(player_id),
            'name': str(player_info.get('name', '')).strip(),
            'first_name': str(player_info.get('first_name', '')).strip(),
            'last_name': str(player_info.get('last_name', '')).strip(),
            'date_of_birth': str(player_info.get('date_of_birth', '')),
            'age': calculate_age(player_info.get('date_of_birth')),
            'nationality': str(player_info.get('country_of_citizenship', '')).strip(),
            'birth_place': clean_birth_place(player_info),
            'position': str(player_info.get('position', '')).strip(),
            'sub_position': str(player_info.get('sub_position', '')).strip(),
            'preferred_foot': str(player_info.get('foot', '')).strip(),
            'height_cm': float(player_info.get('height_in_cm', 0)) if pd.notna(player_info.get('height_in_cm')) else 0,
            'current_club_id': player_info.get('current_club_id'),
            'current_club_name': clean_club_name(player_info.get('current_club_name', '')),
            'contract_expiration': str(player_info.get('contract_expiration_date', '')),
            'agent_name': str(player_info.get('agent_name', '')).strip(),
            'current_market_value': clean_market_value(player_info.get('market_value_in_eur', 0)),
            'highest_market_value': clean_market_value(player_info.get('highest_market_value_in_eur', 0)),
            'career_stats': calculate_player_career_stats(player_id, dataframes, competition_lookup),
            'transfer_history': calculate_transfer_history(player_id, dataframes),
            'market_value_trends': calculate_market_value_trends(player_id, dataframes),
            'playing_style': get_playing_style_indicators(player_id, dataframes, competition_lookup),
        }
        profile['club_context'] = get_club_context(profile['current_club_id'], dataframes)
        
        if profile and profile.get('name'):
            career_stats = profile.get('career_stats', {})
            appearances = career_stats.get('total_appearances', 0)
            
            if appearances >= 5 or profile.get('current_market_value', 0) >= 100000:
                embedding_text = create_scouting_text(profile)
                player_record = {
                    'player_id': int(player_id),
                    'name': profile.get('name', ''),
                    'position': profile.get('position', ''),
                    'age': profile.get('age', 0),
                    'nationality': profile.get('nationality', ''),
                    'current_club': profile.get('current_club_name', ''),
                    'market_value': profile.get('current_market_value', 0),
                    'embedding_text': embedding_text,
                    'metadata': profile
                }
                processed_players.append(player_record)
                
    except Exception as e:
        error_count += 1
        if error_count <= 10:
            logger.warning(f"Error processing player {player_row.get('player_id', 'unknown')}: {e}")
        continue

if error_count > 10:
    logger.warning(f"Total errors: {error_count} (showing first 10 only)")

logger.info(f"Successfully created {len(processed_players):,} comprehensive player profiles")

# Display a sample profile
if processed_players:
    print("\nSample Player Profile:")
    print(json.dumps(processed_players[0], indent=2, default=str))
else:
    logger.error("No player profiles generated.")

2025-06-26 13:52:50,975 - INFO - Creating comprehensive profiles for 6,296 players...
2025-06-26 13:52:50,979 - INFO - Progress: 0/6,296 players processed (0 successful)
2025-06-26 13:52:57,329 - INFO - Progress: 1,000/6,296 players processed (997 successful)
2025-06-26 13:53:02,340 - INFO - Progress: 2,000/6,296 players processed (1,992 successful)


In [None]:
# --- Initialize Embedding Model ---
logger.info(f"Loading embedding model: {MODEL_NAME}")
embedding_model = SentenceTransformer(MODEL_NAME)

# --- Generate Embeddings ---
logger.info(f"Generating embeddings for {len(processed_players):,} players...")

texts_for_embedding = []
valid_indices_for_embedding = []

for i, record in enumerate(processed_players):
    text = record.get('embedding_text', '').strip()
    if text and len(text) > 10:
        texts_for_embedding.append(text)
        valid_indices_for_embedding.append(i)
    else:
        logger.warning(f"Skipping player {record.get('name', 'unknown')} - insufficient text for embedding")

logger.info(f"Processing {len(texts_for_embedding):,} valid texts for embedding")

if not texts_for_embedding:
    logger.error("No valid texts to process for embedding generation.")
else:
    batch_size = 32
    all_embeddings = []
    
    try:
        for i in range(0, len(texts_for_embedding), batch_size):
            batch_texts = texts_for_embedding[i:i + batch_size]
            batch_num = i // batch_size + 1
            total_batches = (len(texts_for_embedding) + batch_size - 1) // batch_size
            
            logger.info(f"Processing embedding batch {batch_num}/{total_batches}")
            
            try:
                batch_embeddings = embedding_model.encode(
                    batch_texts,
                    show_progress_bar=False,
                    batch_size=min(16, len(batch_texts)),
                    convert_to_numpy=True,
                    normalize_embeddings=True
                )
                all_embeddings.extend(batch_embeddings.tolist())
                
            except Exception as e:
                logger.error(f"Embedding batch {batch_num} failed: {e}")
                embedding_dim = 384 # Assuming this is the dimension of all-MiniLM-L6-v2
                for _ in batch_texts:
                    all_embeddings.append([0.0] * embedding_dim)
        
        embedding_idx = 0
        for record_idx in valid_indices_for_embedding:
            if embedding_idx < len(all_embeddings):
                processed_players[record_idx]['embedding'] = all_embeddings[embedding_idx]
                embedding_idx += 1
            else:
                # Fallback for any records that somehow missed an embedding
                processed_players[record_idx]['embedding'] = [0.0] * 384
        
        logger.info(f"Generated embeddings for {len(all_embeddings):,} players")
        
    except Exception as e:
        logger.error(f"Overall embedding generation failed: {e}")

2025-06-26 11:16:14,381 - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-06-26 11:16:14,418 - INFO - Use pytorch device_name: mps
2025-06-26 11:16:14,419 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-26 11:16:16,596 - INFO - Generating embeddings for 6,148 players...
2025-06-26 11:16:16,605 - INFO - Processing 6,148 valid texts for embedding
2025-06-26 11:16:16,605 - INFO - Processing embedding batch 1/193
2025-06-26 11:16:17,627 - INFO - Processing embedding batch 2/193
2025-06-26 11:16:17,811 - INFO - Processing embedding batch 3/193
2025-06-26 11:16:17,973 - INFO - Processing embedding batch 4/193
2025-06-26 11:16:18,103 - INFO - Processing embedding batch 5/193
2025-06-26 11:16:18,258 - INFO - Processing embedding batch 6/193
2025-06-26 11:16:18,415 - INFO - Processing embedding batch 7/193
2025-06-26 11:16:18,549 - INFO - Processing embedding batch 8/193
2025-06-26 11:16:18,676 - INFO - Processing embedding batch 9/193
2025-06-26 11:16:18,809 - INFO

In [None]:
# --- Save Embeddings and Metadata ---
logger.info("Saving embeddings and metadata...")

valid_records_with_embeddings = [
    record for record in processed_players 
    if 'embedding' in record and record['embedding'] and len(record['embedding']) > 0
]

if not valid_records_with_embeddings:
    logger.error("No valid embeddings to save.")
else:
    logger.info(f"Saving {len(valid_records_with_embeddings):,} valid player records with embeddings")
    
    # Save embeddings as numpy arrays
    embeddings_array = np.array([record['embedding'] for record in valid_records_with_embeddings])
    np.save(OUTPUT_DIR / "player_embeddings.npy", embeddings_array)
    
    # Prepare metadata
    metadata = []
    for record in valid_records_with_embeddings:
        metadata.append({
            'player_id': record['player_id'],
            'name': record['name'],
            'position': record['position'],
            'age': record['age'],
            'nationality': record['nationality'],
            'current_club': record['current_club'],
            'market_value': record['market_value'],
            'embedding_text': record['embedding_text']
        })
    
    # Save metadata as JSON
    with open(OUTPUT_DIR / "player_metadata.json", 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    
    # Save metadata as CSV
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(OUTPUT_DIR / "player_metadata.csv", index=False)
    
    # Save detailed profiles
    detailed_profiles = [record['metadata'] for record in valid_records_with_embeddings]
    with open(OUTPUT_DIR / "detailed_player_profiles.json", 'w', encoding='utf-8') as f:
        json.dump(detailed_profiles, f, indent=2, ensure_ascii=False, default=str)
    
    # Create enhanced summary statistics
    summary_stats = {
        'total_players': len(valid_records_with_embeddings),
        'embedding_dimension': len(valid_records_with_embeddings[0]['embedding']) if valid_records_with_embeddings else 0,
        'data_quality': {
            'avg_text_length': np.mean([len(r['embedding_text']) for r in valid_records_with_embeddings]),
            'players_with_stats': len([r for r in valid_records_with_embeddings if r['metadata'].get('career_stats', {}).get('total_appearances', 0) > 0]),
            'players_with_market_value': len([r for r in valid_records_with_embeddings if r['market_value'] > 0]),
            'active_players_only': True,
            'last_season_filter': 2024
        },
        'position_distribution': metadata_df['position'].value_counts().to_dict(),
        'top_clubs': metadata_df['current_club'].value_counts().head(20).to_dict(),
        'age_distribution': {
            'mean': round(metadata_df['age'].mean(), 1),
            'median': int(metadata_df['age'].median()),
            'min': int(metadata_df['age'].min()),
            'max': int(metadata_df['age'].max()),
            'std': round(metadata_df['age'].std(), 1)
        },
        'market_value_stats': {
            'mean': round(metadata_df['market_value'].mean(), 0),
            'median': round(metadata_df['market_value'].median(), 0),
            'max': int(metadata_df['market_value'].max()),
            'players_over_1m': len(metadata_df[metadata_df['market_value'] > 1_000_000]),
            'players_over_10m': len(metadata_df[metadata_df['market_value'] > 10_000_000]),
            'players_over_50m': len(metadata_df[metadata_df['market_value'] > 50_000_000])
        },
        'nationality_distribution': metadata_df['nationality'].value_counts().head(20).to_dict(),
        'created_at': datetime.now().isoformat(),
        'model_used': MODEL_NAME
    }
    
    # Save summary
    with open(OUTPUT_DIR / "embedding_summary.json", 'w') as f:
        json.dump(summary_stats, f, indent=2, default=str)
    
    logger.info(f"Summary: {summary_stats['total_players']:,} current players, {summary_stats['embedding_dimension']} dimensions")
    logger.info(f"Data quality: {summary_stats['data_quality']['players_with_stats']:,} players with stats")
    logger.info(f"Market values: {summary_stats['market_value_stats']['players_over_1m']:,} players over €1M")
    logger.info("Embedding generation and saving complete.")

# Final check of the first few records with embeddings
if valid_records_with_embeddings:
    print("\nFirst 5 final player records (with embeddings):")
    for i, record in enumerate(valid_records_with_embeddings[:5]):
        print(f"Player ID: {record['player_id']}, Name: {record['name']}, Embedding Length: {len(record.get('embedding', []))}")
        if 'embedding_text' in record:
            print(f"  Embedding Text: {record['embedding_text'][:100]}...") # Print first 100 chars
        print("-" * 20)
else:
    print("\nNo valid player embeddings were saved.")

2025-06-26 11:16:41,658 - INFO - Saving embeddings and metadata...
2025-06-26 11:16:41,662 - INFO - Saving 6,148 valid player records with embeddings
2025-06-26 11:16:42,424 - INFO - Summary: 6,148 current players, 384 dimensions
2025-06-26 11:16:42,424 - INFO - Data quality: 5,968 players with stats
2025-06-26 11:16:42,425 - INFO - Market values: 3,469 players over €1M
2025-06-26 11:16:42,425 - INFO - Embedding generation and saving complete.



First 5 final player records (with embeddings):
Player ID: 3333, Name: James Milner, Embedding Length: 384
  Embedding Text: James Milner is a 39-year-old Midfield (Central Midfield) from England. currently playing for Bright...
--------------------
Player ID: 5336, Name: Anastasios Tsokanis, Embedding Length: 384
  Embedding Text: Anastasios Tsokanis is a 34-year-old Midfield (Defensive Midfield) from Greece. currently playing fo...
--------------------
Player ID: 7161, Name: Jonas Hofmann, Embedding Length: 384
  Embedding Text: Jonas Hofmann is a 32-year-old Midfield (Attacking Midfield) from Germany. currently playing for Bay...
--------------------
Player ID: 7825, Name: Pepe Reina, Embedding Length: 384
  Embedding Text: Pepe Reina is a 42-year-old Goalkeeper from Spain. currently playing for Calcio Como. Physical: 188....
--------------------
Player ID: 11530, Name: Lionel Carole, Embedding Length: 384
  Embedding Text: Lionel Carole is a 34-year-old Defender (Left-Back) from F