In [1]:
import pandas as pd
import numpy as np
import json
import logging
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sentence_transformers import SentenceTransformer
import re

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from typing import Dict, List, Optional, Any, Tuple
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, CrossEncoder
import google.generativeai as genai

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

torch.set_num_threads(1)

# Part 1) Embedding Generator

## Loading Data

- getting a snippet of what some of the data looks like

In [2]:
apps = pd.read_csv("data/appearances.csv")
club_games = pd.read_csv("data/club_games.csv")
clubs = pd.read_csv("data/clubs.csv")
comps = pd.read_csv("data/competitions.csv")
game_events = pd.read_csv("data/game_events.csv")
players = pd.read_csv("data/players.csv")

game_lineups = pd.read_csv("data/game_lineups.csv")
players_eval = pd.read_csv("data/player_valuations.csv")
transf = pd.read_csv("data/transfers.csv")

In [3]:
apps.head()

Unnamed: 0,appearance_id,game_id,player_id,player_club_id,player_current_club_id,date,player_name,competition_id,yellow_cards,red_cards,goals,assists,minutes_played
0,2231978_38004,2231978,38004,853,235,2012-07-03,Aurélien Joachim,CLQ,0,0,2,0,90
1,2233748_79232,2233748,79232,8841,2698,2012-07-05,Ruslan Abyshov,ELQ,0,0,0,0,90
2,2234413_42792,2234413,42792,6251,465,2012-07-05,Sander Puri,ELQ,0,0,0,0,45
3,2234418_73333,2234418,73333,1274,6646,2012-07-05,Vegar Hedenstad,ELQ,0,0,0,0,90
4,2234421_122011,2234421,122011,195,3008,2012-07-05,Markus Henriksen,ELQ,0,0,0,1,90


In [4]:
club_games.head()

Unnamed: 0,game_id,club_id,own_goals,own_position,own_manager_name,opponent_id,opponent_goals,opponent_position,opponent_manager_name,hosting,is_win
0,2320450,1468.0,0.0,,Holger Bachthaler,24.0,2.0,,Armin Veh,Home,0
1,2320454,222.0,0.0,,Volkan Uluc,79.0,2.0,,Bruno Labbadia,Home,0
2,2320460,1.0,3.0,,Jürgen Luginger,86.0,1.0,,Robin Dutt,Home,1
3,2320472,2036.0,4.0,,Frank Schmidt,72.0,5.0,,Alexander Schmidt,Home,0
4,2321027,33.0,3.0,8.0,Jens Keller,41.0,3.0,9.0,Thorsten Fink,Home,0


In [5]:
clubs.head()

Unnamed: 0,club_id,club_code,name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,last_season,filename,url
0,105,sv-darmstadt-98,SV Darmstadt 98,L1,,27,25.6,13,48.1,1,Merck-Stadion am Böllenfalltor,17810,+€3.05m,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/sv-darmstadt-9...
1,11127,ural-ekaterinburg,Ural Yekaterinburg,RU1,,30,26.5,11,36.7,3,Yekaterinburg Arena,23000,+€880k,,2023,../data/raw/transfermarkt-scraper/2023/clubs.j...,https://www.transfermarkt.co.uk/ural-ekaterinb...
2,114,besiktas-istanbul,Beşiktaş Jimnastik Kulübü,TR1,,30,26.6,15,50.0,8,Beşiktaş Park,42445,€-25.26m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/besiktas-istan...
3,12,as-rom,Associazione Sportiva Roma,IT1,,26,26.3,18,69.2,17,Olimpico di Roma,70634,€-76.90m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/as-rom/startse...
4,148,tottenham-hotspur,Tottenham Hotspur Football Club,GB1,,30,25.5,21,70.0,18,Tottenham Hotspur Stadium,62850,€-120.05m,,2024,../data/raw/transfermarkt-scraper/2024/clubs.j...,https://www.transfermarkt.co.uk/tottenham-hots...


In [6]:
comps.head()

Unnamed: 0,competition_id,competition_code,name,sub_type,type,country_id,country_name,domestic_league_code,confederation,url,is_major_national_league
0,CIT,italy-cup,italy-cup,domestic_cup,domestic_cup,75,Italy,IT1,europa,https://www.transfermarkt.co.uk/italy-cup/star...,False
1,NLSC,johan-cruijff-schaal,johan-cruijff-schaal,domestic_super_cup,other,122,Netherlands,NL1,europa,https://www.transfermarkt.co.uk/johan-cruijff-...,False
2,GRP,kypello-elladas,kypello-elladas,domestic_cup,domestic_cup,56,Greece,GR1,europa,https://www.transfermarkt.co.uk/kypello-ellada...,False
3,POSU,supertaca-candido-de-oliveira,supertaca-candido-de-oliveira,domestic_super_cup,other,136,Portugal,PO1,europa,https://www.transfermarkt.co.uk/supertaca-cand...,False
4,RUSS,russian-super-cup,russian-super-cup,domestic_super_cup,other,141,Russia,RU1,europa,https://www.transfermarkt.co.uk/russian-super-...,False


In [7]:
players.head()

Unnamed: 0,player_id,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur,highest_market_value_in_eur
0,10,Miroslav,Klose,Miroslav Klose,2015,398,miroslav-klose,Poland,Opole,Germany,...,right,184.0,,ASBW Sport Marketing,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,IT1,Società Sportiva Lazio S.p.A.,1000000.0,30000000.0
1,26,Roman,Weidenfeller,Roman Weidenfeller,2017,16,roman-weidenfeller,Germany,Diez,Germany,...,left,190.0,,Neubauer 13 GmbH,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/roman-weidenfe...,L1,Borussia Dortmund,750000.0,8000000.0
2,65,Dimitar,Berbatov,Dimitar Berbatov,2015,1091,dimitar-berbatov,Bulgaria,Blagoevgrad,Bulgaria,...,,,,CSKA-AS-23 Ltd.,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/dimitar-berbat...,GR1,Panthessalonikios Athlitikos Omilos Konstantin...,1000000.0,34500000.0
3,77,,Lúcio,Lúcio,2012,506,lucio,Brazil,Brasília,Brazil,...,,,,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/lucio/profil/s...,IT1,Juventus Football Club,200000.0,24500000.0
4,80,Tom,Starke,Tom Starke,2017,27,tom-starke,East Germany (GDR),Freital,Germany,...,right,194.0,,IFM,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tom-starke/pro...,L1,FC Bayern München,100000.0,3000000.0


In [8]:
players_eval.head()

Unnamed: 0,player_id,date,market_value_in_eur,current_club_id,player_club_domestic_competition_id
0,405973,2000-01-20,150000,3057,BE1
1,342216,2001-07-20,100000,1241,SC1
2,3132,2003-12-09,400000,126,TR1
3,6893,2003-12-15,900000,984,GB1
4,10,2004-10-04,7000000,398,IT1


In [9]:
transf.head()

Unnamed: 0,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name
0,16136,2026-07-01,26/27,417,123,OGC Nice,Retired,,500000.0,Dante
1,1138758,2026-07-01,26/27,336,631,Sporting CP,Chelsea,52140000.0,45000000.0,Geovany Quenda
2,195778,2026-06-30,25/26,79,27,VfB Stuttgart,Bayern Munich,0.0,12000000.0,Alexander Nübel
3,569033,2026-06-30,25/26,39,27,1.FSV Mainz 05,Bayern Munich,0.0,4000000.0,Armindo Sieb
4,626913,2026-06-30,25/26,398,380,Lazio,Salernitana,0.0,15000000.0,Boulaye Dia


## Data Preprocessing

In [10]:
def clean_market_value(value):
    """Clean and standardize market value data"""
    if pd.isna(value) or value == '' or str(value).lower() == 'nan':
        return 0
    
    try:
        value_str = str(value).strip()
        value_str = re.sub(r'[€$£¥]', '', value_str)
        value_str = re.sub(r'[+\-±]', '', value_str)
        value_str = value_str.replace(',', '')
        
        if 'm' in value_str.lower():
            value_str = value_str.lower().replace('m', '')
            multiplier = 1_000_000
        elif 'k' in value_str.lower():
            value_str = value_str.lower().replace('k', '')
            multiplier = 1_000
        else:
            multiplier = 1
        
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', value_str)
        if numeric_match:
            numeric_value = float(numeric_match.group(1))
            final_value = numeric_value * multiplier
            if 0 <= final_value <= 500_000_000:
                return int(final_value)
        return 0
    except (ValueError, TypeError):
        return 0

def clean_club_name(name):
    if pd.isna(name) or name == '' or str(name).lower() == 'nan':
        return ''
    
    name_str = str(name).strip()
    suffixes_to_clean = [
        r'\s+\(.*?\)$', r'\s+FC$', r'\s+F\.C\.$', r'\s+CF$', r'\s+C\.F\.$', r'\s+SC$', r'\s+S\.C\.$',
    ]
    for suffix in suffixes_to_clean:
        name_str = re.sub(suffix, '', name_str, flags=re.IGNORECASE)
    name_str = ' '.join(name_str.split())
    return name_str

def clean_players_data(df):
    if 'market_value_in_eur' in df.columns:
        df['market_value_in_eur'] = df['market_value_in_eur'].apply(clean_market_value)
    if 'highest_market_value_in_eur' in df.columns:
        df['highest_market_value_in_eur'] = df['highest_market_value_in_eur'].apply(clean_market_value)
    if 'current_club_name' in df.columns:
        df['current_club_name'] = df['current_club_name'].apply(clean_club_name)
    for name_col in ['name', 'first_name', 'last_name']:
        if name_col in df.columns:
            df[name_col] = df[name_col].fillna('').astype(str).str.strip()
    text_fields = ['position', 'sub_position', 'foot', 'country_of_citizenship', 'city_of_birth', 'country_of_birth', 'agent_name']
    for field in text_fields:
        if field in df.columns:
            df[field] = df[field].fillna('').astype(str).str.strip()
    return df

def clean_clubs_data(df):
    if 'name' in df.columns:
        df['name'] = df['name'].apply(clean_club_name)
    if 'total_market_value' in df.columns:
        df['total_market_value'] = df['total_market_value'].apply(clean_market_value)
    if 'net_transfer_record' in df.columns:
        df['net_transfer_record'] = df['net_transfer_record'].apply(clean_market_value)
    return df

def calculate_age(birth_date):
    if pd.isna(birth_date) or birth_date == '' or str(birth_date).lower() == 'nan':
        return 0
    try:
        birth = pd.to_datetime(birth_date)
        today = pd.Timestamp.now()
        age = int((today - birth).days / 365.25)
        if 16 <= age <= 45:
            return age
        else:
            return 0
    except:
        return 0

def clean_birth_place(player_row):
    city = str(player_row.get('city_of_birth', '')).strip()
    country = str(player_row.get('country_of_birth', '')).strip()
    if city and country and city.lower() != 'nan' and country.lower() != 'nan':
        return f"{city}, {country}"
    elif country and country.lower() != 'nan':
        return country
    elif city and city.lower() != 'nan':
        return city
    else:
        return ""

### Data Preprocessing Examples using the functions above

In [11]:
print(clean_market_value("€3.5m"))      
print(clean_market_value("$700k"))     
print(clean_market_value("2,000,000"))  
print(clean_market_value("N/A"))        
print(clean_market_value(None))         
print(clean_market_value("3.2M+"))      

3500000
700000
2000000
0
0
3200000


In [12]:
print(clean_club_name("Arsenal FC"))           
print(clean_club_name("Real Madrid C.F."))     
print(clean_club_name("Manchester United (Women)")) 

Arsenal
Real Madrid
Manchester United


In [13]:
df_ex = pd.DataFrame({
    'market_value_in_eur': ['€3.5m', '1.2k', None],
    'current_club_name': ['Arsenal FC', 'Real Madrid C.F.', None],
    'name': [' Messi ', None, 'Mbappe'],
    'position': ['Forward ', None, 'Midfielder']
})
cleaned_df = clean_players_data(df_ex)
cleaned_df


Unnamed: 0,market_value_in_eur,current_club_name,name,position
0,3500000,Arsenal,Messi,Forward
1,1200,Real Madrid,,
2,0,,Mbappe,Midfielder


In [14]:
df_ex_2 = pd.DataFrame({
    'name': ['Liverpool FC', 'Chelsea (Women)'],
    'total_market_value': ['€890m', '€55.2m'],
    'net_transfer_record': ['-€120m', '€3.1m']
})
cleaned = clean_clubs_data(df_ex_2)
cleaned

Unnamed: 0,name,total_market_value,net_transfer_record
0,Liverpool,0,120000000
1,Chelsea,55200000,3100000


In [15]:
def get_club_context(club_id, dataframes):
    """Get enhanced club context"""
    if not club_id or 'clubs' not in dataframes:
        return {}
    club_info = dataframes['clubs'][dataframes['clubs']['club_id'] == club_id]
    if club_info.empty:
        return {}
    club_row = club_info.iloc[0]
    return {
        'club_name': clean_club_name(club_row.get('name', '')),
        'league': str(club_row.get('domestic_competition_id', '')),
        'club_market_value': clean_market_value(club_row.get('total_market_value', 0)),
        'squad_size': int(club_row.get('squad_size', 0)) if pd.notna(club_row.get('squad_size')) else 0,
        'stadium': str(club_row.get('stadium_name', '')).strip(),
        'coach': str(club_row.get('coach_name', '')).strip()
    }

def analyze_competition_experience(player_apps, competition_lookup):
    """Analyze player's competition experience with improved competition mapping"""
    comp_stats = []
    for comp_id, comp_data in player_apps.groupby('competition_id'):
        comp_info = competition_lookup.get(comp_id, {})
        comp_name = comp_info.get('name', str(comp_id))
        comp_type = comp_info.get('type', 'unknown')
        comp_country = comp_info.get('country', '')
        is_major = comp_info.get('is_major', False)
        comp_stat = {
            'competition_id': str(comp_id),
            'competition_name': str(comp_name),
            'competition_type': comp_type,
            'country': comp_country,
            'is_major_league': is_major,
            'appearances': len(comp_data),
            'goals': int(comp_data['goals'].sum()),
            'assists': int(comp_data['assists'].sum()),
            'minutes': int(comp_data['minutes_played'].sum()),
            'goals_per_game': round(comp_data['goals'].sum() / len(comp_data), 3),
            'assists_per_game': round(comp_data['assists'].sum() / len(comp_data), 3)
        }
        comp_stats.append(comp_stat)
    comp_stats.sort(key=lambda x: (x['is_major_league'], x['appearances'], x['goals']), reverse=True)
    return comp_stats

def extract_league_experience(competition_breakdown):
    """Extract league experience for RAG filtering with improved mapping"""
    league_experience = {
        'top_5_leagues': [], 'major_leagues': [], 'european_competitions': [], 'total_leagues': 0
    }
    for comp in competition_breakdown:
        comp_id = comp['competition_id']
        comp_name = comp['competition_name'].lower()
        comp_type = comp.get('competition_type', '')
        top_5_ids = ['GB1', 'ES1', 'L1', 'IT1', 'FR1']
        if comp_id in top_5_ids:
            league_name = {
                'GB1': 'premier league', 'ES1': 'la liga', 'L1': 'bundesliga',
                'IT1': 'serie a', 'FR1': 'ligue 1'
            }.get(comp_id, comp_name)
            league_experience['top_5_leagues'].append({'league': league_name, 'appearances': comp['appearances'], 'goals': comp['goals'], 'assists': comp['assists']})
        elif comp_type in ['uefa_champions_league', 'europa_league', 'uefa_europa_conference_league'] or comp_id in ['CL', 'EL', 'UCOL']:
            comp_name_clean = {
                'CL': 'champions league', 'EL': 'europa league', 'UCOL': 'conference league'
            }.get(comp_id, comp_name)
            league_experience['european_competitions'].append({'competition': comp_name_clean, 'appearances': comp['appearances'], 'goals': comp['goals']})
        elif comp_type == 'domestic_league' and comp['appearances'] >= 5:
            league_experience['major_leagues'].append({'league': comp_name, 'appearances': comp['appearances'], 'goals': comp['goals']})
    league_experience['total_leagues'] = len(set([item['league'] for item in league_experience['top_5_leagues'] + league_experience['major_leagues']]))
    return league_experience

def analyze_recent_form(player_apps):
    """Analyze recent form with date sorting"""
    if 'date' in player_apps.columns:
        recent_apps = player_apps.sort_values('date').tail(10)
    else:
        recent_apps = player_apps.tail(10)
    if len(recent_apps) == 0:
        return {}
    return {
        'appearances': len(recent_apps), 'goals': int(recent_apps['goals'].sum()), 'assists': int(recent_apps['assists'].sum()),
        'minutes': int(recent_apps['minutes_played'].sum()), 'goals_per_game': round(recent_apps['goals'].sum() / len(recent_apps), 3),
        'assists_per_game': round(recent_apps['assists'].sum() / len(recent_apps), 3), 'avg_minutes': round(recent_apps['minutes_played'].mean(), 1)
    }

def analyze_performance_trends(player_apps):
    """Analyze performance trends over time"""
    if len(player_apps) < 10:
        return {'trend': 'insufficient_data'}
    if 'date' in player_apps.columns:
        apps_sorted = player_apps.sort_values('date')
    else:
        apps_sorted = player_apps
    mid_point = len(apps_sorted) // 2
    first_half = apps_sorted.iloc[:mid_point]
    second_half = apps_sorted.iloc[mid_point:]
    first_goals_per_game = first_half['goals'].sum() / len(first_half) if len(first_half) > 0 else 0
    second_goals_per_game = second_half['goals'].sum() / len(second_half) if len(second_half) > 0 else 0
    if second_goals_per_game > first_goals_per_game * 1.2:
        trend = 'improving'
    elif second_goals_per_game < first_goals_per_game * 0.8:
        trend = 'declining'
    else:
        trend = 'stable'
    return {
        'trend': trend, 'early_career_goals_per_game': round(first_goals_per_game, 3),
        'recent_career_goals_per_game': round(second_goals_per_game, 3),
        'improvement_factor': round(second_goals_per_game / max(first_goals_per_game, 0.001), 2)
    }

def calculate_player_career_stats(player_id, dataframes, competition_lookup):
    """Calculate comprehensive career statistics for a player"""
    if 'appearances' not in dataframes:
        return {}
    player_apps = dataframes['appearances'][dataframes['appearances']['player_id'] == player_id]
    if player_apps.empty:
        return {}
    current_year = datetime.now().year
    if 'date' in player_apps.columns:
        player_apps = player_apps[pd.to_datetime(player_apps['date'], errors='coerce').dt.year >= (current_year - 5)]
    total_appearances = len(player_apps)
    total_goals = player_apps['goals'].sum()
    total_assists = player_apps['assists'].sum()
    total_minutes = player_apps['minutes_played'].sum()
    total_yellow_cards = player_apps['yellow_cards'].sum()
    total_red_cards = player_apps['red_cards'].sum()
    career_stats = {
        'total_appearances': int(total_appearances), 'total_goals': int(total_goals), 'total_assists': int(total_assists),
        'total_minutes': int(total_minutes), 'total_yellow_cards': int(total_yellow_cards), 'total_red_cards': int(total_red_cards),
        'goals_per_appearance': 0.0, 'assists_per_appearance': 0.0, 'minutes_per_appearance': 0.0,
        'goal_contributions_per_90': 0.0, 'discipline_score': 0.0
    }
    if total_appearances > 0:
        career_stats['goals_per_appearance'] = round(total_goals / total_appearances, 4)
        career_stats['assists_per_appearance'] = round(total_assists / total_appearances, 4)
        career_stats['minutes_per_appearance'] = round(total_minutes / total_appearances, 1)
        career_stats['discipline_score'] = round((total_yellow_cards + (total_red_cards * 2)) / total_appearances, 3)
    if total_minutes > 0:
        goal_contributions = total_goals + total_assists
        career_stats['goal_contributions_per_90'] = round((goal_contributions * 90) / total_minutes, 3)
    career_stats['competition_breakdown'] = analyze_competition_experience(player_apps, competition_lookup)
    career_stats['league_experience'] = extract_league_experience(career_stats['competition_breakdown'])
    career_stats['recent_form'] = analyze_recent_form(player_apps)
    career_stats['performance_trends'] = analyze_performance_trends(player_apps)
    return career_stats

def calculate_transfer_history(player_id, dataframes):
    """Calculate transfer history and patterns"""
    if 'transfers' not in dataframes:
        return {}
    player_transfers = dataframes['transfers'][dataframes['transfers']['player_id'] == player_id].sort_values('transfer_date')
    if player_transfers.empty:
        return {}
    current_year = datetime.now().year
    player_transfers = player_transfers[pd.to_datetime(player_transfers['transfer_date'], errors='coerce').dt.year >= (current_year - 10)]
    transfer_fees = player_transfers['transfer_fee'].apply(clean_market_value)
    market_values = player_transfers['market_value_in_eur'].apply(clean_market_value)
    transfer_info = {
        'total_transfers': len(player_transfers), 'total_transfer_fees': float(transfer_fees.sum()),
        'average_transfer_fee': float(transfer_fees.mean()) if len(transfer_fees) > 0 else 0,
        'highest_transfer_fee': float(transfer_fees.max()) if len(transfer_fees) > 0 else 0,
        'transfer_frequency': 0.0, 'career_trajectory': 'unknown', 'recent_transfers': []
    }
    if len(player_transfers) > 1:
        try:
            first_transfer = pd.to_datetime(player_transfers['transfer_date'].iloc[0])
            last_transfer = pd.to_datetime(player_transfers['transfer_date'].iloc[-1])
            years_span = (last_transfer - first_transfer).days / 365.25
            transfer_info['transfer_frequency'] = round(years_span / (len(player_transfers) - 1), 2) if years_span > 0 else 0
        except:
            transfer_info['transfer_frequency'] = 0
    valid_market_values = market_values[market_values > 0]
    if len(valid_market_values) > 1:
        if valid_market_values.iloc[-1] > valid_market_values.iloc[0] * 1.3:
            transfer_info['career_trajectory'] = 'upward'
        elif valid_market_values.iloc[-1] < valid_market_values.iloc[0] * 0.7:
            transfer_info['career_trajectory'] = 'downward'
        else:
            transfer_info['career_trajectory'] = 'stable'
    recent = player_transfers.tail(3)
    for _, transfer in recent.iterrows():
        transfer_info['recent_transfers'].append({
            'date': str(transfer['transfer_date']), 'from_club': clean_club_name(transfer.get('from_club_name', '')),
            'to_club': clean_club_name(transfer.get('to_club_name', '')),
            'fee': float(transfer.get('transfer_fee', 0)) if pd.notna(transfer.get('transfer_fee')) else 0,
            'market_value': float(transfer.get('market_value_in_eur', 0)) if pd.notna(transfer.get('market_value_in_eur')) else 0
        })
    return transfer_info

def calculate_market_value_trends(player_id, dataframes):
    """Calculate market value trends and patterns"""
    if 'player_valuations' not in dataframes:
        return {}
    valuations = dataframes['player_valuations'][dataframes['player_valuations']['player_id'] == player_id].sort_values('date')
    if valuations.empty:
        return {}
    current_year = datetime.now().year
    valuations = valuations[pd.to_datetime(valuations['date'], errors='coerce').dt.year >= (current_year - 5)]
    clean_values = valuations['market_value_in_eur'].apply(clean_market_value)
    clean_values = clean_values[clean_values > 0]
    if len(clean_values) == 0:
        return {}
    market_info = {
        'current_market_value': float(clean_values.iloc[-1]), 'peak_market_value': float(clean_values.max()),
        'lowest_market_value': float(clean_values.min()), 'value_trend': 'stable', 'value_volatility': 0.0,
        'recent_value_change': 0.0, 'value_growth_rate': 0.0
    }
    if len(clean_values) > 1:
        recent_values = clean_values.tail(min(3, len(clean_values)))
        early_values = clean_values.head(min(3, len(clean_values)))
        if len(recent_values) > 0 and len(early_values) > 0:
            recent_avg = recent_values.mean()
            early_avg = early_values.mean()
            trend_change = (recent_avg - early_avg) / early_avg
            if trend_change > 0.15:
                market_info['value_trend'] = 'rising'
            elif trend_change < -0.15:
                market_info['value_trend'] = 'declining'
            else:
                market_info['value_trend'] = 'stable'
            market_info['value_growth_rate'] = round(trend_change, 3)
        market_info['value_volatility'] = round(clean_values.std() / clean_values.mean(), 3)
        if len(clean_values) >= 2:
            market_info['recent_value_change'] = round((clean_values.iloc[-1] - clean_values.iloc[-2]) / clean_values.iloc[-2], 3)
    return market_info

def get_playing_style_indicators(player_id, dataframes, competition_lookup):
    """Derive playing style indicators from performance data"""
    career_stats = calculate_player_career_stats(player_id, dataframes, competition_lookup)
    if not career_stats:
        return {}
    style_indicators = {
        'attacking_threat': 0.0, 'creativity': 0.0, 'consistency': 0.0, 'experience_level': 'unknown',
        'goal_scoring_ability': 'unknown', 'discipline': 'unknown', 'versatility': 0,
        'big_game_experience': False, 'international_experience': False
    }
    style_indicators['attacking_threat'] = career_stats.get('goal_contributions_per_90', 0)
    total_goals = career_stats.get('total_goals', 0)
    total_assists = career_stats.get('total_assists', 0)
    if total_goals + total_assists > 0:
        style_indicators['creativity'] = round(total_assists / (total_goals + total_assists), 3)
    minutes_per_app = career_stats.get('minutes_per_appearance', 0)
    style_indicators['consistency'] = round(min(minutes_per_app / 90, 1.0), 3)
    appearances = career_stats.get('total_appearances', 0)
    if appearances > 250: style_indicators['experience_level'] = 'veteran'
    elif appearances > 100: style_indicators['experience_level'] = 'experienced'
    elif appearances > 50: style_indicators['experience_level'] = 'developing'
    elif appearances > 15: style_indicators['experience_level'] = 'emerging'
    else: style_indicators['experience_level'] = 'young'
    goals_per_game = career_stats.get('goals_per_appearance', 0)
    if goals_per_game > 0.8: style_indicators['goal_scoring_ability'] = 'world_class'
    elif goals_per_game > 0.6: style_indicators['goal_scoring_ability'] = 'prolific'
    elif goals_per_game > 0.4: style_indicators['goal_scoring_ability'] = 'regular'
    elif goals_per_game > 0.2: style_indicators['goal_scoring_ability'] = 'occasional'
    elif goals_per_game > 0.05: style_indicators['goal_scoring_ability'] = 'rare'
    else: style_indicators['goal_scoring_ability'] = 'non_scorer'
    discipline_score = career_stats.get('discipline_score', 0)
    if discipline_score > 0.4: style_indicators['discipline'] = 'poor'
    elif discipline_score > 0.25: style_indicators['discipline'] = 'questionable'
    elif discipline_score > 0.15: style_indicators['discipline'] = 'average'
    elif discipline_score > 0.05: style_indicators['discipline'] = 'good'
    else: style_indicators['discipline'] = 'excellent'
    comp_breakdown = career_stats.get('competition_breakdown', [])
    style_indicators['versatility'] = len(comp_breakdown)
    league_exp = career_stats.get('league_experience', {})
    european_comps = league_exp.get('european_competitions', [])
    top_5_leagues = league_exp.get('top_5_leagues', [])
    style_indicators['big_game_experience'] = len(european_comps) > 0 or len(top_5_leagues) > 0
    style_indicators['international_experience'] = len(top_5_leagues) > 0
    return style_indicators

def create_scouting_text(profile):
    """Create rich text description for embedding"""
    text_parts = []
    name = profile.get('name', 'Unknown Player')
    age = profile.get('age', 0)
    nationality = profile.get('nationality', 'Unknown')
    position = profile.get('position', 'Unknown')
    sub_position = profile.get('sub_position', '')
    position_desc = f"{position}"
    if sub_position and sub_position != position and sub_position.lower() != 'nan':
        position_desc += f" ({sub_position})"
    text_parts.append(f"{name} is a {age}-year-old {position_desc} from {nationality}")
    club = profile.get('current_club_name', '')
    if club and club.lower() != 'nan':
        text_parts.append(f"currently playing for {club}")
    height = profile.get('height_cm', 0)
    foot = profile.get('preferred_foot', '')
    physical_desc = []
    if height > 0:
        if height > 190: physical_desc.append(f"tall {height}cm")
        elif height < 170: physical_desc.append(f"compact {height}cm")
        else: physical_desc.append(f"{height}cm")
    if foot and foot.lower() not in ['nan', '']: physical_desc.append(f"{foot}-footed")
    if physical_desc: text_parts.append(f"Physical: {', '.join(physical_desc)}")
    career_stats = profile.get('career_stats', {})
    if career_stats:
        appearances = career_stats.get('total_appearances', 0)
        goals = career_stats.get('total_goals', 0)
        assists = career_stats.get('total_assists', 0)
        if appearances > 0:
            text_parts.append(f"Career: {appearances} apps, {goals} goals, {assists} assists")
            goals_per_game = career_stats.get('goals_per_appearance', 0)
            assists_per_game = career_stats.get('assists_per_appearance', 0)
            goal_contrib_per_90 = career_stats.get('goal_contributions_per_90', 0)
            performance_desc = f"{goals_per_game:.2f} goals/game, {assists_per_game:.2f} assists/game"
            if goal_contrib_per_90 > 0: performance_desc += f", {goal_contrib_per_90:.2f} contributions/90min"
            text_parts.append(f"Performance: {performance_desc}")
    playing_style = profile.get('playing_style', {})
    if playing_style:
        experience = playing_style.get('experience_level', '')
        goal_ability = playing_style.get('goal_scoring_ability', '')
        discipline = playing_style.get('discipline', '')
        big_game_exp = playing_style.get('big_game_experience', False)
        style_desc = []
        if experience and experience != 'unknown': style_desc.append(f"{experience} player")
        if goal_ability and goal_ability != 'unknown': style_desc.append(f"{goal_ability} goalscorer")
        if discipline and discipline != 'unknown': style_desc.append(f"{discipline} discipline")
        if big_game_exp: style_desc.append("big game experience")
        if style_desc: text_parts.append(f"Profile: {', '.join(style_desc)}")
    league_exp = career_stats.get('league_experience', {})
    if league_exp:
        top_5_leagues = league_exp.get('top_5_leagues', [])
        european_comps = league_exp.get('european_competitions', [])
        experience_desc = []
        if top_5_leagues:
            leagues = [league['league'] for league in top_5_leagues]
            experience_desc.append(f"Top 5 leagues: {', '.join(leagues)}")
        if european_comps:
            comps = [comp['competition'] for comp in european_comps]
            experience_desc.append(f"European: {', '.join(comps)}")
        if experience_desc: text_parts.append(f"Experience: {'; '.join(experience_desc)}")
    current_value = profile.get('current_market_value', 0)
    peak_value = profile.get('highest_market_value', 0)
    if current_value > 0:
        if current_value >= 100_000_000: text_parts.append(f"Elite market value: €{current_value:,}")
        elif current_value >= 50_000_000: text_parts.append(f"High market value: €{current_value:,}")
        elif current_value >= 10_000_000: text_parts.append(f"Significant market value: €{current_value:,}")
        else: text_parts.append(f"Market value: €{current_value:,}")
    if peak_value > current_value * 1.5 and peak_value > 0: text_parts.append(f"Peak value: €{peak_value:,}")
    market_trends = profile.get('market_value_trends', {})
    if market_trends:
        trend = market_trends.get('value_trend', '')
        if trend == 'rising': text_parts.append("Value trending upward")
        elif trend == 'declining': text_parts.append("Value declining")
    contract_end = profile.get('contract_expiration', '')
    if contract_end and contract_end != '' and 'nan' not in str(contract_end).lower():
        try:
            contract_date = pd.to_datetime(contract_end)
            current_date = pd.Timestamp.now()
            months_remaining = (contract_date - current_date).days / 30
            if months_remaining < 6: text_parts.append("Contract expiring soon")
            elif months_remaining < 18: text_parts.append("Contract expires within 18 months")
        except: pass
    agent = profile.get('agent_name', '')
    if agent and agent != '' and 'nan' not in str(agent).lower(): text_parts.append(f"Agent: {agent}")
    return '. '.join(text_parts) + '.'

In [16]:
DATA_DIR = Path("data")  # Adjust this path as needed for your data files
OUTPUT_DIR = Path("embeddings") # Adjust this path as needed for your output
MODEL_NAME = "all-MiniLM-L6-v2"
LIMIT_PLAYERS = None # Set to an integer like 1000 for testing, or None for all players

OUTPUT_DIR.mkdir(exist_ok=True)

dataframes = {}
csv_files = {
    'players': 'players.csv',
    'appearances': 'appearances.csv', 
    'clubs': 'clubs.csv',
    'competitions': 'competitions.csv',
    'games': 'games.csv',
    'transfers': 'transfers.csv',
    'player_valuations': 'player_valuations.csv',
    'game_events': 'game_events.csv',
    'game_lineups': 'game_lineups.csv',
    'club_games': 'club_games.csv'
}

for name, filename in csv_files.items():
    file_path = DATA_DIR / filename
    if file_path.exists():
        try:
            df = pd.read_csv(file_path)
            if name == 'players':
                df = clean_players_data(df)
            elif name == 'clubs':
                df = clean_clubs_data(df)
            dataframes[name] = df
            logger.info(f"Loaded {len(df)} records from {filename}")
        except Exception as e:
            logger.error(f"Failed loading {filename}: {e}")
logger.info(f"Loaded {len(dataframes)} datasets")

# looking up competitoins
competition_lookup = {}
if 'competitions' in dataframes:
    for _, row in dataframes['competitions'].iterrows():
        comp_id = row.get('competition_id', '')
        comp_name = row.get('name', '')
        comp_code = row.get('competition_code', '')
        if comp_id:
            competition_lookup[comp_id] = {
                'name': comp_name,
                'code': comp_code,
                'type': row.get('type', ''),
                'country': row.get('country_name', ''),
                'is_major': row.get('is_major_national_league', False)
            }
    logger.info(f"Created lookup for {len(competition_lookup)} competitions")

# filtering players
if 'players' not in dataframes:
    logger.error("Players dataset not loaded! Cannot filter current players.")
else:
    original_player_count = len(dataframes['players'])
    current_players_df = dataframes['players'][dataframes['players']['last_season'] == 2024].copy()
    current_players_df = current_players_df[
        (current_players_df['name'].notna()) & 
        (current_players_df['name'] != '') &
        (current_players_df['position'].notna()) &
        (current_players_df['position'] != '')
    ].copy()
    if 'date_of_birth' in current_players_df.columns:
        current_players_df = current_players_df[
            (current_players_df['date_of_birth'].isna()) |  # Keep if no DOB data
            (pd.to_datetime(current_players_df['date_of_birth'], errors='coerce').notna())  # Valid dates only
        ].copy()
    if 'market_value_in_eur' in current_players_df.columns:
        current_players_df = current_players_df[
            (current_players_df['market_value_in_eur'].isna()) |  # Keep if no market value data
            (current_players_df['market_value_in_eur'] >= 25000)  # Minimum €25k market value
        ].copy()
    dataframes['players'] = current_players_df # Update the players dataframe to only contain current players
    logger.info(f"Filtered players: {original_player_count:,} → {len(current_players_df):,} currently active players")
    logger.info(f"Removed {original_player_count - len(current_players_df):,} retired/inactive players")

2025-06-26 14:53:09,195 - INFO - Loaded 32601 records from players.csv
2025-06-26 14:53:10,260 - INFO - Loaded 1706806 records from appearances.csv
2025-06-26 14:53:10,267 - INFO - Loaded 439 records from clubs.csv
2025-06-26 14:53:10,268 - INFO - Loaded 44 records from competitions.csv
2025-06-26 14:53:10,432 - INFO - Loaded 74026 records from games.csv
2025-06-26 14:53:10,484 - INFO - Loaded 79646 records from transfers.csv
2025-06-26 14:53:10,578 - INFO - Loaded 496606 records from player_valuations.csv
2025-06-26 14:53:11,269 - INFO - Loaded 1035043 records from game_events.csv
2025-06-26 14:53:13,112 - INFO - Loaded 2191911 records from game_lineups.csv
2025-06-26 14:53:13,194 - INFO - Loaded 148052 records from club_games.csv
2025-06-26 14:53:13,194 - INFO - Loaded 10 datasets
2025-06-26 14:53:13,196 - INFO - Created lookup for 44 competitions
2025-06-26 14:53:13,258 - INFO - Filtered players: 32,601 → 6,296 currently active players
2025-06-26 14:53:13,259 - INFO - Removed 26,305

In [17]:
logger.info(f"Creating comprehensive profiles for {len(dataframes['players']):,} players...")

processed_players = []
error_count = 0

players_to_process = dataframes['players'].head(LIMIT_PLAYERS) if LIMIT_PLAYERS else dataframes['players']

for idx, (_, player_row) in enumerate(players_to_process.iterrows()):
    if idx % 1000 == 0:
        logger.info(f"Progress: {idx:,}/{len(players_to_process):,} players processed ({len(processed_players):,} successful)")
    
    try:
        player_id = player_row['player_id']
        
        player_info = dataframes['players'][dataframes['players']['player_id'] == player_id].iloc[0]
        
        profile = {
            'player_id': int(player_id),
            'name': str(player_info.get('name', '')).strip(),
            'first_name': str(player_info.get('first_name', '')).strip(),
            'last_name': str(player_info.get('last_name', '')).strip(),
            'date_of_birth': str(player_info.get('date_of_birth', '')),
            'age': calculate_age(player_info.get('date_of_birth')),
            'nationality': str(player_info.get('country_of_citizenship', '')).strip(),
            'birth_place': clean_birth_place(player_info),
            'position': str(player_info.get('position', '')).strip(),
            'sub_position': str(player_info.get('sub_position', '')).strip(),
            'preferred_foot': str(player_info.get('foot', '')).strip(),
            'height_cm': float(player_info.get('height_in_cm', 0)) if pd.notna(player_info.get('height_in_cm')) else 0,
            'current_club_id': player_info.get('current_club_id'),
            'current_club_name': clean_club_name(player_info.get('current_club_name', '')),
            'contract_expiration': str(player_info.get('contract_expiration_date', '')),
            'agent_name': str(player_info.get('agent_name', '')).strip(),
            'current_market_value': clean_market_value(player_info.get('market_value_in_eur', 0)),
            'highest_market_value': clean_market_value(player_info.get('highest_market_value_in_eur', 0)),
            'career_stats': calculate_player_career_stats(player_id, dataframes, competition_lookup),
            'transfer_history': calculate_transfer_history(player_id, dataframes),
            'market_value_trends': calculate_market_value_trends(player_id, dataframes),
            'playing_style': get_playing_style_indicators(player_id, dataframes, competition_lookup),
        }
        profile['club_context'] = get_club_context(profile['current_club_id'], dataframes)
        
        if profile and profile.get('name'):
            career_stats = profile.get('career_stats', {})
            appearances = career_stats.get('total_appearances', 0)
            
            if appearances >= 5 or profile.get('current_market_value', 0) >= 100000:
                embedding_text = create_scouting_text(profile)
                player_record = {
                    'player_id': int(player_id),
                    'name': profile.get('name', ''),
                    'position': profile.get('position', ''),
                    'age': profile.get('age', 0),
                    'nationality': profile.get('nationality', ''),
                    'current_club': profile.get('current_club_name', ''),
                    'market_value': profile.get('current_market_value', 0),
                    'embedding_text': embedding_text,
                    'metadata': profile
                }
                processed_players.append(player_record)
                
    except Exception as e:
        error_count += 1
        if error_count <= 10:
            logger.warning(f"Error processing player {player_row.get('player_id', 'unknown')}: {e}")
        continue

if error_count > 10:
    logger.warning(f"Total errors: {error_count} (showing first 10 only)")

logger.info(f"Successfully created {len(processed_players):,} comprehensive player profiles")

# Display a sample profile
if processed_players:
    print("\nSample Player Profile:")
    print(json.dumps(processed_players[0], indent=2, default=str))
else:
    logger.error("No player profiles generated.")

2025-06-26 14:53:13,269 - INFO - Creating comprehensive profiles for 6,296 players...
2025-06-26 14:53:13,272 - INFO - Progress: 0/6,296 players processed (0 successful)
2025-06-26 14:53:19,340 - INFO - Progress: 1,000/6,296 players processed (997 successful)
2025-06-26 14:53:25,162 - INFO - Progress: 2,000/6,296 players processed (1,992 successful)
2025-06-26 14:53:30,455 - INFO - Progress: 3,000/6,296 players processed (2,987 successful)
2025-06-26 14:53:35,339 - INFO - Progress: 4,000/6,296 players processed (3,973 successful)
2025-06-26 14:53:40,176 - INFO - Progress: 5,000/6,296 players processed (4,944 successful)
2025-06-26 14:53:44,576 - INFO - Progress: 6,000/6,296 players processed (5,883 successful)
2025-06-26 14:53:45,885 - INFO - Successfully created 6,148 comprehensive player profiles



Sample Player Profile:
{
  "player_id": 3333,
  "name": "James Milner",
  "position": "Midfield",
  "age": 39,
  "nationality": "England",
  "current_club": "Brighton and Hove Albion Football Club",
  "market_value": 1000000,
  "embedding_text": "James Milner is a 39-year-old Midfield (Central Midfield) from England. currently playing for Brighton and Hove Albion Football Club. Physical: 175.0cm, right-footed. Career: 146 apps, 0 goals, 6 assists. Performance: 0.00 goals/game, 0.04 assists/game, 0.10 contributions/90min. Profile: experienced player, non_scorer goalscorer, good discipline, big game experience. Experience: Top 5 leagues: premier league; European: champions league, europa league. Market value: \u20ac1,000,000. Peak value: \u20ac21,000,000. Value declining. Contract expiring soon.",
  "metadata": {
    "player_id": 3333,
    "name": "James Milner",
    "first_name": "James",
    "last_name": "Milner",
    "date_of_birth": "1986-01-04 00:00:00",
    "age": 39,
    "nationa

In [18]:
# initilaize embedding model
logger.info(f"Loading embedding model: {MODEL_NAME}")
embedding_model = SentenceTransformer(MODEL_NAME)

# create embeddings
logger.info(f"Generating embeddings for {len(processed_players):,} players...")

texts_for_embedding = []
valid_indices_for_embedding = []

for i, record in enumerate(processed_players):
    text = record.get('embedding_text', '').strip()
    if text and len(text) > 10:
        texts_for_embedding.append(text)
        valid_indices_for_embedding.append(i)
    else:
        logger.warning(f"Skipping player {record.get('name', 'unknown')} - insufficient text for embedding")

logger.info(f"Processing {len(texts_for_embedding):,} valid texts for embedding")

if not texts_for_embedding:
    logger.error("No valid texts to process for embedding generation.")
else:
    batch_size = 32
    all_embeddings = []
    
    try:
        for i in range(0, len(texts_for_embedding), batch_size):
            batch_texts = texts_for_embedding[i:i + batch_size]
            batch_num = i // batch_size + 1
            total_batches = (len(texts_for_embedding) + batch_size - 1) // batch_size
            
            logger.info(f"Processing embedding batch {batch_num}/{total_batches}")
            
            try:
                batch_embeddings = embedding_model.encode(
                    batch_texts,
                    show_progress_bar=False,
                    batch_size=min(16, len(batch_texts)),
                    convert_to_numpy=True,
                    normalize_embeddings=True
                )
                all_embeddings.extend(batch_embeddings.tolist())
                
            except Exception as e:
                logger.error(f"Embedding batch {batch_num} failed: {e}")
                embedding_dim = 384 # Assuming this is the dimension of all-MiniLM-L6-v2
                for _ in batch_texts:
                    all_embeddings.append([0.0] * embedding_dim)
        
        embedding_idx = 0
        for record_idx in valid_indices_for_embedding:
            if embedding_idx < len(all_embeddings):
                processed_players[record_idx]['embedding'] = all_embeddings[embedding_idx]
                embedding_idx += 1
            else:
                # Fallback for any records that somehow missed an embedding
                processed_players[record_idx]['embedding'] = [0.0] * 384
        
        logger.info(f"Generated embeddings for {len(all_embeddings):,} players")
        
    except Exception as e:
        logger.error(f"Overall embedding generation failed: {e}")

2025-06-26 14:53:45,896 - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-06-26 14:53:45,925 - INFO - Use pytorch device_name: mps
2025-06-26 14:53:45,926 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-26 14:53:47,927 - INFO - Generating embeddings for 6,148 players...
2025-06-26 14:53:47,937 - INFO - Processing 6,148 valid texts for embedding
2025-06-26 14:53:47,937 - INFO - Processing embedding batch 1/193
2025-06-26 14:53:48,242 - INFO - Processing embedding batch 2/193
2025-06-26 14:53:48,487 - INFO - Processing embedding batch 3/193
2025-06-26 14:53:48,675 - INFO - Processing embedding batch 4/193
2025-06-26 14:53:48,835 - INFO - Processing embedding batch 5/193
2025-06-26 14:53:49,009 - INFO - Processing embedding batch 6/193
2025-06-26 14:53:49,190 - INFO - Processing embedding batch 7/193
2025-06-26 14:53:49,341 - INFO - Processing embedding batch 8/193
2025-06-26 14:53:49,478 - INFO - Processing embedding batch 9/193
2025-06-26 14:53:49,616 - INFO

In [19]:
# saving embeddings and metadata
logger.info("Saving embeddings and metadata...")

valid_records_with_embeddings = [
    record for record in processed_players 
    if 'embedding' in record and record['embedding'] and len(record['embedding']) > 0
]

if not valid_records_with_embeddings:
    logger.error("No valid embeddings to save.")
else:
    logger.info(f"Saving {len(valid_records_with_embeddings):,} valid player records with embeddings")
    
    # embeddings as an array
    embeddings_array = np.array([record['embedding'] for record in valid_records_with_embeddings])
    np.save(OUTPUT_DIR / "player_embeddings.npy", embeddings_array)
    
    # metadata
    metadata = []
    for record in valid_records_with_embeddings:
        metadata.append({
            'player_id': record['player_id'],
            'name': record['name'],
            'position': record['position'],
            'age': record['age'],
            'nationality': record['nationality'],
            'current_club': record['current_club'],
            'market_value': record['market_value'],
            'embedding_text': record['embedding_text']
        })
    
    # json save
    with open(OUTPUT_DIR / "player_metadata.json", 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    
    # csv save
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(OUTPUT_DIR / "player_metadata.csv", index=False)
    
    # Save detailed profiles
    detailed_profiles = [record['metadata'] for record in valid_records_with_embeddings]
    with open(OUTPUT_DIR / "detailed_player_profiles.json", 'w', encoding='utf-8') as f:
        json.dump(detailed_profiles, f, indent=2, ensure_ascii=False, default=str)
    
    # summary stats for players
    summary_stats = {
        'total_players': len(valid_records_with_embeddings),
        'embedding_dimension': len(valid_records_with_embeddings[0]['embedding']) if valid_records_with_embeddings else 0,
        'data_quality': {
            'avg_text_length': np.mean([len(r['embedding_text']) for r in valid_records_with_embeddings]),
            'players_with_stats': len([r for r in valid_records_with_embeddings if r['metadata'].get('career_stats', {}).get('total_appearances', 0) > 0]),
            'players_with_market_value': len([r for r in valid_records_with_embeddings if r['market_value'] > 0]),
            'active_players_only': True,
            'last_season_filter': 2024
        },
        'position_distribution': metadata_df['position'].value_counts().to_dict(),
        'top_clubs': metadata_df['current_club'].value_counts().head(20).to_dict(),
        'age_distribution': {
            'mean': round(metadata_df['age'].mean(), 1),
            'median': int(metadata_df['age'].median()),
            'min': int(metadata_df['age'].min()),
            'max': int(metadata_df['age'].max()),
            'std': round(metadata_df['age'].std(), 1)
        },
        'market_value_stats': {
            'mean': round(metadata_df['market_value'].mean(), 0),
            'median': round(metadata_df['market_value'].median(), 0),
            'max': int(metadata_df['market_value'].max()),
            'players_over_1m': len(metadata_df[metadata_df['market_value'] > 1_000_000]),
            'players_over_10m': len(metadata_df[metadata_df['market_value'] > 10_000_000]),
            'players_over_50m': len(metadata_df[metadata_df['market_value'] > 50_000_000])
        },
        'nationality_distribution': metadata_df['nationality'].value_counts().head(20).to_dict(),
        'created_at': datetime.now().isoformat(),
        'model_used': MODEL_NAME
    }
    
    # save summary stats as json
    with open(OUTPUT_DIR / "embedding_summary.json", 'w') as f:
        json.dump(summary_stats, f, indent=2, default=str)
    
    logger.info(f"Summary: {summary_stats['total_players']:,} current players, {summary_stats['embedding_dimension']} dimensions")
    logger.info(f"Data quality: {summary_stats['data_quality']['players_with_stats']:,} players with stats")
    logger.info(f"Market values: {summary_stats['market_value_stats']['players_over_1m']:,} players over €1M")
    logger.info("Embedding generation and saving complete.")

# check if it work
if valid_records_with_embeddings:
    print("\nFirst 5 final player records (with embeddings):")
    for i, record in enumerate(valid_records_with_embeddings[:5]):
        print(f"Player ID: {record['player_id']}, Name: {record['name']}, Embedding Length: {len(record.get('embedding', []))}")
        if 'embedding_text' in record:
            print(f"  Embedding Text: {record['embedding_text'][:100]}...") # Print first 100 chars
        print("-" * 20)
else:
    print("\nNo valid player embeddings were saved.")

2025-06-26 14:54:09,510 - INFO - Saving embeddings and metadata...
2025-06-26 14:54:09,514 - INFO - Saving 6,148 valid player records with embeddings
2025-06-26 14:54:10,324 - INFO - Summary: 6,148 current players, 384 dimensions
2025-06-26 14:54:10,324 - INFO - Data quality: 5,968 players with stats
2025-06-26 14:54:10,324 - INFO - Market values: 3,469 players over €1M
2025-06-26 14:54:10,324 - INFO - Embedding generation and saving complete.



First 5 final player records (with embeddings):
Player ID: 3333, Name: James Milner, Embedding Length: 384
  Embedding Text: James Milner is a 39-year-old Midfield (Central Midfield) from England. currently playing for Bright...
--------------------
Player ID: 5336, Name: Anastasios Tsokanis, Embedding Length: 384
  Embedding Text: Anastasios Tsokanis is a 34-year-old Midfield (Defensive Midfield) from Greece. currently playing fo...
--------------------
Player ID: 7161, Name: Jonas Hofmann, Embedding Length: 384
  Embedding Text: Jonas Hofmann is a 32-year-old Midfield (Attacking Midfield) from Germany. currently playing for Bay...
--------------------
Player ID: 7825, Name: Pepe Reina, Embedding Length: 384
  Embedding Text: Pepe Reina is a 42-year-old Goalkeeper from Spain. currently playing for Calcio Como. Physical: 188....
--------------------
Player ID: 11530, Name: Lionel Carole, Embedding Length: 384
  Embedding Text: Lionel Carole is a 34-year-old Defender (Left-Back) from F

# Part 2) RAG System

## Helper Functions for rag
- helps match up league/competition names and some cleaning of club names to match 

In [20]:
def initialize_league_mappings() -> Dict[str, List[str]]:
    """Initialize comprehensive league mappings based on actual competition data."""
    # This mapping is crucial for parsing league-related queries.
    return {
        'top 5 leagues': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        'big 5 leagues': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        'top leagues': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        'major leagues': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        'big five': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        'european top leagues': ['GB1', 'ES1', 'L1', 'IT1', 'FR1'],
        
        'premier league': ['GB1', 'premier-league'],
        'epl': ['GB1'], 'english premier league': ['GB1'], 'premier': ['GB1'],
        'england': ['GB1'], 'english league': ['GB1'],
        
        'la liga': ['ES1', 'laliga'],
        'spanish league': ['ES1'], 'spain': ['ES1'], 'laliga': ['ES1'], 'spanish primera': ['ES1'],
        
        'bundesliga': ['L1', 'bundesliga'],
        'german league': ['L1'], 'germany': ['L1'], 'german bundesliga': ['L1'],
        
        'serie a': ['IT1', 'serie-a'],
        'italian league': ['IT1'], 'italy': ['IT1'], 'italian serie a': ['IT1'],
        
        'ligue 1': ['FR1', 'ligue-1'],
        'french league': ['FR1'], 'france': ['FR1'], 'ligue un': ['FR1'],
        
        'eredivisie': ['NL1', 'eredivisie'],
        'dutch league': ['NL1'], 'netherlands': ['NL1'],
        
        'primeira liga': ['PO1', 'liga-portugal-bwin'],
        'portuguese league': ['PO1'], 'portugal': ['PO1'],
        
        'super lig': ['TR1', 'super-lig'],
        'turkish league': ['TR1'], 'turkey': ['TR1'],
        
        'scottish premiership': ['SC1', 'scottish-premiership'],
        'scottish league': ['SC1'], 'scotland': ['SC1'],
        
        'belgian pro league': ['BE1', 'jupiler-pro-league'],
        'belgian league': ['BE1'], 'belgium': ['BE1'],
        
        'super league greece': ['GR1', 'super-league-1'],
        'greek league': ['GR1'], 'greece': ['GR1'],
        
        'superligaen': ['DK1', 'superligaen'],
        'danish league': ['DK1'], 'denmark': ['DK1'],
        
        'premier liga ukraine': ['UKR1', 'premier-liga'],
        'ukrainian league': ['UKR1'], 'ukraine': ['UKR1'],
        
        'russian premier league': ['RU1', 'premier-liga'],
        'russian league': ['RU1'], 'russia': ['RU1'],
        
        'champions league': ['CL', 'uefa-champions-league'],
        'ucl': ['CL'],
        'europa league': ['EL', 'uefa-europa-league'],
        'uel': ['EL'],
        'conference league': ['UCOL', 'uefa-conference-league'],
        'european cups': ['CL', 'EL', 'UCOL']
    }

def clean_market_value_string(value_str: str) -> int:
    """Clean market value strings from queries to integer euros."""
    if not value_str:
        return 0
    try:
        value_str = re.sub(r'[€$£¥]', '', str(value_str))
        value_str = re.sub(r'[+\-±]', '', value_str)
        value_str = value_str.replace(',', '').strip()
        
        if 'm' in value_str.lower() or 'million' in value_str.lower():
            value_str = re.sub(r'(million|mil|m)', '', value_str.lower())
            multiplier = 1_000_000
        elif 'k' in value_str.lower() or 'thousand' in value_str.lower():
            value_str = re.sub(r'(thousand|k)', '', value_str.lower())
            multiplier = 1_000
        else:
            multiplier = 1
        
        numeric_match = re.search(r'(\d+(?:\.\d+)?)', value_str)
        if numeric_match:
            numeric_value = float(numeric_match.group(1))
            return int(numeric_value * multiplier)
        
        return 0
    except (ValueError, TypeError):
        return 0

# club_league_mapping is also a static lookup
club_league_mapping = {
    'manchester': 'gb1', 'liverpool': 'gb1', 'arsenal': 'gb1', 'chelsea': 'gb1',
    'tottenham': 'gb1', 'manchester city': 'gb1', 'manchester united': 'gb1',
    'newcastle': 'gb1', 'brighton': 'gb1', 'aston villa': 'gb1', 'west ham': 'gb1',
    'everton': 'gb1', 'crystal palace': 'gb1', 'fulham': 'gb1', 'brentford': 'gb1',
    'wolverhampton': 'gb1', 'nottingham': 'gb1', 'bournemouth': 'gb1',
    'sunderland': 'gb1', 'burnley': 'gb1', 'leeds': 'gb1',
    'real madrid': 'es1', 'barcelona': 'es1', 'atletico': 'es1', 'sevilla': 'es1',
    'villarreal': 'es1', 'real sociedad': 'es1', 'athletic': 'es1', 'valencia': 'es1',
    'betis': 'es1', 'osasuna': 'es1', 'getafe': 'es1', 'alaves': 'es1',
    'rayo vallecano': 'es1', 'mallorca': 'es1', 'las palmas': 'es1', 'cadiz': 'es1',
    'celta vigo': 'es1', 'espanyol': 'es1', 'leganes': 'es1', 'valladolid': 'es1',
    'bayern': 'l1', 'dortmund': 'l1', 'leipzig': 'l1', 'bayer leverkusen': 'l1',
    'eintracht frankfurt': 'l1', 'wolfsburg': 'l1', 'borussia': 'l1', 'stuttgart': 'l1',
    'hoffenheim': 'l1', 'mainz': 'l1', 'augsburg': 'l1', 'heidenheim': 'l1',
    'werder bremen': 'l1', 'freiburg': 'l1', 'union berlin': 'l1', 'cologne': 'l1',
    'hertha': 'l1', 'schalke': 'l1', 'hamburg': 'l1', 'hannover': 'l1',
    'juventus': 'it1', 'milan': 'it1', 'inter': 'it1', 'napoli': 'it1',
    'roma': 'it1', 'lazio': 'it1', 'atalanta': 'it1', 'fiorentina': 'it1',
    'bologna': 'it1', 'torino': 'it1', 'udinese': 'it1', 'sampdoria': 'it1',
    'genoa': 'it1', 'cagliari': 'it1', 'lecce': 'it1', 'verona': 'it1',
    'empoli': 'it1', 'monza': 'it1', 'como': 'it1', 'parma': 'it1',
    'paris': 'fr1', 'marseille': 'fr1', 'lyon': 'fr1', 'monaco': 'fr1',
    'nice': 'fr1', 'lille': 'fr1', 'rennes': 'fr1', 'strasbourg': 'fr1',
    'montpellier': 'fr1', 'nantes': 'fr1', 'toulouse': 'fr1', 'reims': 'fr1',
    'lens': 'fr1', 'brest': 'fr1', 'angers': 'fr1', 'lorient': 'fr1'
}

## query parsing

In [21]:
def parse_query(query: str, league_mappings: Dict[str, List[str]]) -> Dict[str, Any]:
    """Enhanced query parser with comprehensive filter support."""
    query_lower = query.lower().strip()
    filters = {}
    
    logger.info(f"Parsing query: '{query}'")
    
    position_mappings = {
        'centre-forward': ['centre-forward', 'center-forward', 'striker', 'cf', 'st', 'forward'],
        'second striker': ['second striker', 'support striker', 'false 9', 'ss', 'false nine'],
        'left winger': ['left winger', 'left wing', 'lw', 'left-winger'],
        'right winger': ['right winger', 'right wing', 'rw', 'right-winger'],
        'attacking midfield': ['attacking midfielder', 'attacking midfield', 'cam', 'playmaker', 'am', 'number 10', '10'],
        'central midfield': ['central midfielder', 'central midfield', 'midfielder', 'midfield', 'cm', 'box to box'],
        'defensive midfield': ['defensive midfielder', 'defensive midfield', 'cdm', 'holding midfielder', 'dm', '6', 'anchor'],
        'left midfield': ['left midfielder', 'left midfield', 'lm'],
        'right midfield': ['right midfielder', 'right midfield', 'rm'],
        'centre-back': ['centre-back', 'center-back', 'central defender', 'cb', 'centre back', 'center back'],
        'left-back': ['left-back', 'left back', 'lb', 'left back'],
        'right-back': ['right-back', 'right back', 'rb', 'right back'],
        'goalkeeper': ['goalkeeper', 'keeper', 'gk', 'goalie', 'shot stopper'],
        'forward': ['forward', 'striker', 'attacker', 'attack'],
        'winger': ['winger', 'wing', 'wide player', 'wide forward'],
        'midfielder': ['midfielder', 'midfield', 'middle'],
        'defender': ['defender', 'defence', 'defense', 'back'],
    }
    
    position_found = False
    for position_key, position_terms in position_mappings.items():
        if not position_found:
            for term in position_terms:
                if term in query_lower:
                    filters['position'] = position_key
                    logger.info(f"Detected position: {position_key}")
                    position_found = True
                    break
    
    for league_term, league_list in league_mappings.items():
        if league_term in query_lower:
            filters['target_leagues'] = league_list
            logger.info(f"Detected league filter: {league_term} -> {league_list}")
            break
    
    nationality_mappings = {
        'brazilian': 'Brazil', 'argentinian': 'Argentina', 'argentine': 'Argentina',
        'spanish': 'Spain', 'german': 'Germany', 'french': 'France', 'english': 'England',
        'italian': 'Italy', 'portuguese': 'Portugal', 'dutch': 'Netherlands',
        'belgian': 'Belgium', 'croatian': 'Croatia', 'serbian': 'Serbia',
        'polish': 'Poland', 'mexican': 'Mexico', 'colombian': 'Colombia',
        'american': 'United States', 'canadian': 'Canada', 'turkish': 'Turkey', 
        'greek': 'Greece', 'danish': 'Denmark', 'swedish': 'Sweden', 
        'norwegian': 'Norway', 'ukrainian': 'Ukraine', 'russian': 'Russia', 
        'austrian': 'Austria', 'swiss': 'Switzerland', 'uruguayan': 'Uruguay',
        'chilean': 'Chile', 'peruvian': 'Peru', 'japanese': 'Japan',
        'korean': 'South Korea', 'south korean': 'South Korea'
    }
    
    nationality_found = False
    for nat_term, nat_country in nationality_mappings.items():
        if nat_term in query_lower:
            filters['nationality'] = nat_country
            nationality_found = True
            logger.info(f"Detected nationality (adjective): {nat_term} -> {nat_country}")
            break
    
    if not nationality_found:
        country_names = [
            'brazil', 'argentina', 'spain', 'germany', 'france', 'england',
            'italy', 'portugal', 'netherlands', 'belgium', 'croatia', 'serbia',
            'poland', 'mexico', 'colombia', 'united states', 'usa', 'canada', 
            'turkey', 'greece', 'denmark', 'sweden', 'norway', 'ukraine', 
            'russia', 'uruguay', 'chile', 'peru', 'japan', 'south korea'
        ]
        
        for country in country_names:
            if country in query_lower:
                if country == 'usa':
                    filters['nationality'] = 'United States'
                elif country == 'south korea':
                    filters['nationality'] = 'South Korea'
                elif country == 'united states':
                    filters['nationality'] = 'United States'
                else:
                    filters['nationality'] = country.title()
                logger.info(f"Detected nationality (country): {country} -> {filters['nationality']}")
                break
    
    age_patterns = [
        r'under (\d+)', r'younger than (\d+)', r'below (\d+)', r'less than (\d+)',
        r'between (\d+) and (\d+)', r'(\d+) to (\d+) years?', r'(\d+)-(\d+) years?',
        r'(\d+) years? old', r'age (\d+)', r'aged (\d+)', r'over (\d+)', r'above (\d+)',
        r'older than (\d+)', r'more than (\d+) years?', r'(\d+)\+', r'(\d+) plus'
    ]
    
    value_extracted = False
    value_patterns = [
        r'under (\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'below (\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'less than (\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'budget.*?(\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'(\d+(?:\.\d+)?)m?\s*budget',
        r'(\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'€(\d+(?:\.\d+)?)\s*(?:million|mil|m)?',
        r'\$(\d+(?:\.\d+)?)\s*(?:million|mil|m)?',
        r'£(\d+(?:\.\d+)?)\s*(?:million|mil|m)?',
        r'cheap.*?(\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'bargain.*?(\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?',
        r'affordable.*?(\d+(?:\.\d+)?)\s*(?:million|mil|m)(?:\s*euros?|\s*€)?'
    ]
    
    for pattern in value_patterns:
        match = re.search(pattern, query_lower)
        if match:
            value_str = match.group(1)
            value = clean_market_value_string(f"{value_str}m")
            if any(word in query_lower for word in ['under', 'below', 'less', 'budget', 'cheap', 'bargain', 'affordable']):
                filters['max_market_value'] = value
                logger.info(f"Detected max market value: €{value:,}")
            elif any(word in query_lower for word in ['over', 'above', 'more']):
                filters['min_market_value'] = value
                logger.info(f"Detected min market value: €{value:,}")
            value_extracted = True
            break
    
    if not value_extracted:
        for pattern in age_patterns:
            match = re.search(pattern, query_lower)
            if match:
                if 'between' in pattern or 'to' in pattern or '-' in pattern:
                    ages = [int(x) for x in match.groups() if x]
                    filters['min_age'] = min(ages)
                    filters['max_age'] = max(ages)
                    logger.info(f"Detected age range: {min(ages)}-{max(ages)}")
                else:
                    age = int(match.group(1))
                    if any(word in query_lower for word in ['under', 'below', 'younger', 'less than']):
                        filters['max_age'] = age
                        logger.info(f"Detected max age: {age}")
                    elif any(word in query_lower for word in ['over', 'above', 'older', 'more than', '+', 'plus']):
                        filters['min_age'] = age
                        logger.info(f"Detected min age: {age}")
                    else:
                        filters['target_age'] = age
                        logger.info(f"Detected target age: {age}")
                break
    
    if any(word in query_lower for word in ['young', 'youth']) and 'max_age' not in filters and 'min_age' not in filters:
        filters['max_age'] = 25
        logger.info("Detected 'young' keyword -> max age 25")
    
    performance_keywords = {
        'goalscorer': {'min_goals_per_game': 0.3}, 'prolific': {'min_goals_per_game': 0.4},
        'clinical': {'min_goals_per_game': 0.25}, 'creative': {'min_assists_per_game': 0.2},
        'playmaker': {'min_assists_per_game': 0.25}, 'experienced': {'min_appearances': 100},
        'veteran': {'min_appearances': 200}, 'young talent': {'max_age': 23},
        'prospect': {'max_age': 21}, 'emerging': {'max_age': 23, 'min_appearances': 20},
        'established': {'min_appearances': 150}, 'proven': {'min_appearances': 100},
    }
    
    for keyword, filter_dict in performance_keywords.items():
        if keyword in query_lower:
            filters.update(filter_dict)
            logger.info(f"Detected performance keyword: {keyword}")
    
    if any(word in query_lower for word in ['top', 'best', 'elite', 'world class', 'star', 'superstar']):
        if 'min_market_value' not in filters:
            filters['min_market_value'] = 20_000_000
            logger.info("Detected quality indicator -> min market value €20M")
    
    if any(word in query_lower for word in ['cheap', 'affordable', 'budget', 'low cost', 'bargain', 'value']):
        if 'max_market_value' not in filters:
            filters['max_market_value'] = 15_000_000
            logger.info("Detected budget indicator -> max market value €15M")
    
    if any(word in query_lower for word in ['free agent', 'contract expiring', 'out of contract', 'expiring']):
        filters['contract_status'] = 'expiring'
        logger.info("Detected contract status filter")
    
    logger.info(f"Final extracted filters: {filters}")
    return filters

def create_fallback_strategies(original_filters: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Create smart fallback strategies that prioritize important filters."""
    fallback_strategies = []
    
    if len(original_filters) > 2:
        strategy1 = {}
        for key in ['position', 'min_age', 'max_age', 'target_age', 'min_market_value', 'max_market_value']:
            if key in original_filters:
                strategy1[key] = original_filters[key]
        if strategy1:
            fallback_strategies.append(strategy1)
    
    if 'position' in original_filters:
        strategy2 = {'position': original_filters['position']}
        for key in ['min_market_value', 'max_market_value']:
            if key in original_filters:
                strategy2[key] = original_filters[key]
        fallback_strategies.append(strategy2)
    
    if 'position' in original_filters and 'nationality' in original_filters:
        strategy3 = {
            'position': original_filters['position'],
            'nationality': original_filters['nationality']
        }
        fallback_strategies.append(strategy3)
    
    if 'position' in original_filters:
        strategy4 = {'position': original_filters['position']}
        fallback_strategies.append(strategy4)
    
    strategy5 = {}
    for key in ['nationality', 'min_age', 'max_age', 'target_age', 'min_market_value', 'max_market_value', 'target_leagues']:
        if key in original_filters:
            strategy5[key] = original_filters[key]
    if strategy5:
        fallback_strategies.append(strategy5)
    
    return fallback_strategies

## filtering player dataset based on requirements

In [22]:
def get_filtered_dataset_indices(filters: Dict[str, Any], player_metadata: List[Dict], detailed_profiles: List[Dict], league_mappings: Dict[str, List[str]]) -> List[int]:
    """Filter the entire dataset first and return valid indices."""
    logger.info(f"Step 1: Pre-filtering dataset with filters: {filters}")
    
    if not filters:
        all_indices = list(range(len(player_metadata)))
        logger.info(f"No filters applied - using all {len(all_indices)} players")
        return all_indices
    
    filtered_indices = []
    filter_stats = {
        'position_rejected': 0, 'league_rejected': 0, 'age_rejected': 0,
        'value_rejected': 0, 'nationality_rejected': 0, 'performance_rejected': 0,
        'total_processed': 0
    }
    
    for idx in range(len(player_metadata)):
        try:
            filter_stats['total_processed'] += 1
            metadata = player_metadata[idx]
            detailed = detailed_profiles[idx] if idx < len(detailed_profiles) else {}
            passed_all_filters = True
            
            # Position filtering
            if 'position' in filters:
                filter_pos = filters['position'].lower()
                position_match = False
                player_sub_position = detailed.get('sub_position', '').lower() or metadata.get('sub_position', '').lower()
                
                if player_sub_position:
                    if filter_pos in player_sub_position or player_sub_position in filter_pos:
                        position_match = True
                    elif filter_pos.replace('-', ' ') in player_sub_position.replace('-', ' '):
                        position_match = True
                    elif player_sub_position.replace('-', ' ') in filter_pos.replace('-', ' '):
                        position_match = True
                    # Specific sub-position matches
                    elif filter_pos in ['striker', 'centre-forward', 'center-forward'] and 'centre-forward' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['second striker', 'support striker', 'false 9'] and 'second striker' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['left winger', 'left wing'] and 'left winger' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['right winger', 'right wing'] and 'right winger' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['attacking midfielder', 'cam', 'playmaker'] and 'attacking midfield' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['central midfielder', 'cm'] and 'central midfield' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['defensive midfielder', 'cdm', 'holding midfielder'] and 'defensive midfield' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['left midfielder'] and 'left midfield' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['right midfielder'] and 'right midfield' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['centre-back', 'center-back', 'central defender', 'cb'] and 'centre-back' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['left-back', 'left back', 'lb'] and 'left-back' in player_sub_position:
                        position_match = True
                    elif filter_pos in ['right-back', 'right back', 'rb'] and 'right-back' in player_sub_position:
                        position_match = True
                    # Category matching for broad queries
                    elif filter_pos == 'forward' and any(term in player_sub_position for term in ['centre-forward', 'second striker']):
                        position_match = True
                    elif filter_pos == 'winger' and any(term in player_sub_position for term in ['left winger', 'right winger']):
                        position_match = True
                    elif filter_pos == 'midfielder' and any(term in player_sub_position for term in ['attacking midfield', 'central midfield', 'defensive midfield', 'left midfield', 'right midfield']):
                        position_match = True
                    elif filter_pos == 'defender' and any(term in player_sub_position for term in ['centre-back', 'left-back', 'right-back']):
                        position_match = True
                    elif filter_pos == 'goalkeeper' and 'goalkeeper' in player_sub_position:
                        position_match = True
                
                if not position_match:
                    filter_stats['position_rejected'] += 1
                    passed_all_filters = False
                    continue
            
            # Age filtering
            player_age = metadata.get('age', 0)
            if player_age > 0:
                if 'min_age' in filters and player_age < filters['min_age']:
                    filter_stats['age_rejected'] += 1
                    passed_all_filters = False
                    continue
                if 'max_age' in filters and player_age > filters['max_age']:
                    filter_stats['age_rejected'] += 1
                    passed_all_filters = False
                    continue
                if 'target_age' in filters and abs(player_age - filters['target_age']) > 2:
                    filter_stats['age_rejected'] += 1
                    passed_all_filters = False
                    continue
            
            # Market value filtering
            player_value = metadata.get('market_value', 0)
            if 'min_market_value' in filters and player_value < filters['min_market_value']:
                filter_stats['value_rejected'] += 1
                passed_all_filters = False
                continue
            if 'max_market_value' in filters and player_value > filters['max_market_value']:
                filter_stats['value_rejected'] += 1
                passed_all_filters = False
                continue
            
            # Nationality filtering
            if 'nationality' in filters:
                player_nat = metadata.get('nationality', '').lower()
                filter_nat = filters['nationality'].lower()
                if not (filter_nat == player_nat or filter_nat in player_nat or player_nat in filter_nat):
                    filter_stats['nationality_rejected'] += 1
                    passed_all_filters = False
                    continue
            
            # League filtering
            if 'target_leagues' in filters:
                league_match = False
                current_club = metadata.get('current_club', '').lower()
                
                # Check current club against static club-league mapping
                for club_keyword, league_code in club_league_mapping.items():
                    if club_keyword in current_club:
                        for target_league in filters['target_leagues']:
                            if target_league.lower() == league_code or target_league.lower() in league_code:
                                league_match = True
                                break
                        if league_match: break
                
                # If no club match, check detailed profile's league experience
                if not league_match:
                    career_stats = detailed.get('career_stats', {})
                    league_experience = career_stats.get('league_experience', {})
                    top_5_leagues = league_experience.get('top_5_leagues', [])
                    european_comps = league_experience.get('european_competitions', [])
                    
                    for league_info in top_5_leagues:
                        league_name = league_info.get('league', '').lower()
                        for target_league in filters['target_leagues']:
                            target_lower = target_league.lower()
                            if (target_lower in league_name or league_name in target_lower or
                                (target_lower == 'gb1' and 'premier' in league_name) or
                                (target_lower == 'es1' and 'liga' in league_name) or
                                (target_lower == 'l1' and 'bundesliga' in league_name) or
                                (target_lower == 'it1' and 'serie' in league_name) or
                                (target_lower == 'fr1' and 'ligue' in league_name)):
                                league_match = True
                                break
                        if league_match: break
                    
                    # Also consider European competitions if broad "top/european leagues" is queried
                    if not league_match and any('top' in str(tl).lower() or 'european' in str(tl).lower() for tl in filters['target_leagues']):
                        if len(european_comps) > 0 or len(top_5_leagues) > 0:
                            league_match = True
                
                if not league_match:
                    filter_stats['league_rejected'] += 1
                    passed_all_filters = False
                    continue
            
            # Performance filtering
            if any(key in filters for key in ['min_goals_per_game', 'min_assists_per_game', 'min_appearances']):
                career_stats = detailed.get('career_stats', {})
                
                if 'min_goals_per_game' in filters:
                    goals_per_game = career_stats.get('goals_per_appearance', 0)
                    if goals_per_game < filters['min_goals_per_game']:
                        filter_stats['performance_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                if 'min_assists_per_game' in filters:
                    assists_per_game = career_stats.get('assists_per_appearance', 0)
                    if assists_per_game < filters['min_assists_per_game']:
                        filter_stats['performance_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                if 'min_appearances' in filters:
                    appearances = career_stats.get('total_appearances', 0)
                    if appearances < filters['min_appearances']:
                        filter_stats['performance_rejected'] += 1
                        passed_all_filters = False
                        continue
            
            if passed_all_filters:
                filtered_indices.append(idx)
                
        except Exception as e:
            logger.warning(f"Error filtering player {idx}: {e}")
            continue
    
    logger.info(f"Pre-filtering results:")
    logger.info(f"  Total players processed: {filter_stats['total_processed']}")
    logger.info(f"  Players passing all filters: {len(filtered_indices)}")
    for filter_type, count in filter_stats.items():
        if count > 0 and filter_type != 'total_processed':
            logger.info(f"  {filter_type}: {count} players rejected")
    
    if len(filtered_indices) == 0:
        logger.warning("No players found with all filters! Applying fallback strategy...")
        return apply_fallback_strategies(filters, player_metadata, detailed_profiles, league_mappings)
    
    return filtered_indices

def apply_fallback_strategies(original_filters: Dict[str, Any], player_metadata: List[Dict], detailed_profiles: List[Dict], league_mappings: Dict[str, List[str]]) -> List[int]:
    """Apply fallback strategies without recursion."""
    fallback_strategies = create_fallback_strategies(original_filters)
    
    for attempt_num, fallback_filters in enumerate(fallback_strategies, 1):
        logger.info(f"Fallback attempt {attempt_num}: {fallback_filters}")
        
        fallback_indices = []
        filter_stats = { # Reset stats for each fallback attempt
            'position_rejected': 0, 'league_rejected': 0, 'age_rejected': 0,
            'value_rejected': 0, 'nationality_rejected': 0, 'performance_rejected': 0,
            'total_processed': 0
        }
        
        for idx in range(len(player_metadata)):
            try:
                filter_stats['total_processed'] += 1
                metadata = player_metadata[idx]
                detailed = detailed_profiles[idx] if idx < len(detailed_profiles) else {}
                passed_all_filters = True
                
                # Filtering logic (simplified for fallbacks, especially for league/performance)
                if 'position' in fallback_filters:
                    filter_pos = fallback_filters['position'].lower()
                    position_match = False
                    player_sub_position = detailed.get('sub_position', '').lower() or metadata.get('sub_position', '').lower()
                    
                    if player_sub_position:
                        if filter_pos in player_sub_position or player_sub_position in filter_pos:
                            position_match = True
                        elif filter_pos.replace('-', ' ') in player_sub_position.replace('-', ' '):
                            position_match = True
                        elif player_sub_position.replace('-', ' ') in filter_pos.replace('-', ' '):
                            position_match = True
                        elif filter_pos in ['striker', 'centre-forward', 'center-forward'] and 'centre-forward' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['second striker', 'support striker', 'false 9'] and 'second striker' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['left winger', 'left wing'] and 'left winger' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['right winger', 'right wing'] and 'right winger' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['attacking midfielder', 'cam', 'playmaker'] and 'attacking midfield' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['central midfielder', 'cm'] and 'central midfield' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['defensive midfielder', 'cdm', 'holding midfielder'] and 'defensive midfield' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['left midfielder'] and 'left midfield' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['right midfielder'] and 'right midfield' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['centre-back', 'center-back', 'central defender', 'cb'] and 'centre-back' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['left-back', 'left back', 'lb'] and 'left-back' in player_sub_position:
                            position_match = True
                        elif filter_pos in ['right-back', 'right back', 'rb'] and 'right-back' in player_sub_position:
                            position_match = True
                        elif filter_pos == 'forward' and any(term in player_sub_position for term in ['centre-forward', 'second striker']):
                            position_match = True
                        elif filter_pos == 'winger' and any(term in player_sub_position for term in ['left winger', 'right winger']):
                            position_match = True
                        elif filter_pos == 'midfielder' and any(term in player_sub_position for term in ['attacking midfield', 'central midfield', 'defensive midfield', 'left midfield', 'right midfield']):
                            position_match = True
                        elif filter_pos == 'defender' and any(term in player_sub_position for term in ['centre-back', 'left-back', 'right-back']):
                            position_match = True
                        elif filter_pos == 'goalkeeper' and 'goalkeeper' in player_sub_position:
                            position_match = True
                    
                    if not position_match:
                        filter_stats['position_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                player_age = metadata.get('age', 0)
                if player_age > 0:
                    if 'min_age' in fallback_filters and player_age < fallback_filters['min_age']:
                        filter_stats['age_rejected'] += 1
                        passed_all_filters = False
                        continue
                    if 'max_age' in fallback_filters and player_age > fallback_filters['max_age']:
                        filter_stats['age_rejected'] += 1
                        passed_all_filters = False
                        continue
                    if 'target_age' in fallback_filters and abs(player_age - fallback_filters['target_age']) > 2:
                        filter_stats['age_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                player_value = metadata.get('market_value', 0)
                if 'min_market_value' in fallback_filters and player_value < fallback_filters['min_market_value']:
                    filter_stats['value_rejected'] += 1
                    passed_all_filters = False
                    continue
                if 'max_market_value' in fallback_filters and player_value > fallback_filters['max_market_value']:
                    filter_stats['value_rejected'] += 1
                    passed_all_filters = False
                    continue
                
                if 'nationality' in fallback_filters:
                    player_nat = metadata.get('nationality', '').lower()
                    filter_nat = fallback_filters['nationality'].lower()
                    if not (filter_nat == player_nat or filter_nat in player_nat or player_nat in filter_nat):
                        filter_stats['nationality_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                # League and performance filters are generally handled by removal in create_fallback_strategies
                # or looser checking here. The crucial part is that they don't block aggressively if a fallback is used.
                if 'target_leagues' in fallback_filters: # Only if this filter is specifically kept in the fallback strategy
                    league_match = False
                    current_club = metadata.get('current_club', '').lower()
                    for club_keyword, league_code in club_league_mapping.items():
                        if club_keyword in current_club:
                            for target_league in fallback_filters['target_leagues']:
                                if target_league.lower() == league_code or target_league.lower() in league_code:
                                    league_match = True
                                    break
                            if league_match: break
                    if not league_match:
                        career_stats = detailed.get('career_stats', {})
                        league_experience = career_stats.get('league_experience', {})
                        top_5_leagues = league_experience.get('top_5_leagues', [])
                        european_comps = league_experience.get('european_competitions', [])
                        for league_info in top_5_leagues:
                            league_name = league_info.get('league', '').lower()
                            for target_league in fallback_filters['target_leagues']:
                                target_lower = target_league.lower()
                                if (target_lower in league_name or league_name in target_lower or
                                    (target_lower == 'gb1' and 'premier' in league_name) or
                                    (target_lower == 'es1' and 'liga' in league_name) or
                                    (target_lower == 'l1' and 'bundesliga' in league_name) or
                                    (target_lower == 'it1' and 'serie' in league_name) or
                                    (target_lower == 'fr1' and 'ligue' in league_name)):
                                    league_match = True
                                    break
                            if league_match: break
                        if not league_match and any('top' in str(tl).lower() or 'european' in str(tl).lower() for tl in fallback_filters['target_leagues']):
                            if len(european_comps) > 0 or len(top_5_leagues) > 0:
                                league_match = True
                    if not league_match:
                        filter_stats['league_rejected'] += 1
                        passed_all_filters = False
                        continue
                
                if any(key in fallback_filters for key in ['min_goals_per_game', 'min_assists_per_game', 'min_appearances']):
                    career_stats = detailed.get('career_stats', {})
                    if 'min_goals_per_game' in fallback_filters:
                        goals_per_game = career_stats.get('goals_per_appearance', 0)
                        if goals_per_game < fallback_filters['min_goals_per_game']:
                            filter_stats['performance_rejected'] += 1
                            passed_all_filters = False
                            continue
                    if 'min_assists_per_game' in fallback_filters:
                        assists_per_game = career_stats.get('assists_per_appearance', 0)
                        if assists_per_game < fallback_filters['min_assists_per_game']:
                            filter_stats['performance_rejected'] += 1
                            passed_all_filters = False
                            continue
                    if 'min_appearances' in fallback_filters:
                        appearances = career_stats.get('total_appearances', 0)
                        if appearances < fallback_filters['min_appearances']:
                            filter_stats['performance_rejected'] += 1
                            passed_all_filters = False
                            continue

                if passed_all_filters:
                    fallback_indices.append(idx)
                    
            except Exception as e:
                logger.warning(f"Error in fallback filtering for player {idx}: {e}")
                continue
        
        logger.info(f"Fallback attempt {attempt_num} results: {len(fallback_indices)} players found")
        
        if len(fallback_indices) > 0:
            logger.info(f"Fallback attempt {attempt_num} successful!")
            return fallback_indices
    
    logger.warning("All fallback attempts failed! Using entire dataset.")
    return list(range(len(player_metadata)))

## ranking and getting players

In [23]:
def vector_similarity_search_on_filtered_dataset(query: str, filtered_indices: List[int], player_embeddings: np.ndarray, embedding_model: SentenceTransformer, top_k: int = 100) -> List[Tuple[int, float]]:
    """Perform vector similarity search only on pre-filtered dataset."""
    try:
        logger.info(f"Step 2: Vector similarity search on {len(filtered_indices)} pre-filtered players...")
        
        if len(filtered_indices) == 0:
            logger.warning("No players in filtered dataset!")
            return []
        
        with torch.no_grad():
            query_embedding = embedding_model.encode(
                query,
                convert_to_numpy=True,
                show_progress_bar=False,
                batch_size=1,
                device='cpu',
                normalize_embeddings=True
            ).reshape(1, -1)
        
        filtered_embeddings = player_embeddings[filtered_indices]
        
        similarities = cosine_similarity(query_embedding, filtered_embeddings)[0]
        
        top_k_actual = min(top_k, len(filtered_indices))
        top_local_indices = np.argsort(similarities)[::-1][:top_k_actual]
        
        results = [
            (filtered_indices[local_idx], float(similarities[local_idx])) 
            for local_idx in top_local_indices
        ]
        
        logger.info(f"Found {len(results)} candidates from filtered dataset")
        logger.info(f"Similarity range: {min(similarities):.3f} to {max(similarities):.3f}")
        
        return results
    except Exception as e:
        logger.error(f"Filtered vector search failed: {e}")
        if filtered_indices:
            indices = np.random.choice(
                filtered_indices, 
                size=min(top_k, len(filtered_indices)), 
                replace=False
            )
            return [(int(idx), 0.5) for idx in indices]
        return []

def rerank_candidates(query: str, candidates: List[Tuple[int, float]], player_metadata: List[Dict], reranker_model: CrossEncoder, top_k: int = 30) -> List[Tuple[int, float]]:
    """Rerank candidates using cross-encoder with fallback logic."""
    try:
        logger.info(f"Step 3: Reranking {len(candidates)} candidates with cross-encoder...")
        
        if len(candidates) == 0:
            return []
        
        query_doc_pairs = []
        candidate_indices = []
        original_scores = []
        
        for idx, similarity_score in candidates:
            try:
                player_text = player_metadata[idx]['embedding_text']
                query_doc_pairs.append([query, player_text])
                candidate_indices.append(idx)
                original_scores.append(similarity_score)
            except (IndexError, KeyError) as e:
                logger.warning(f"Skipping invalid candidate {idx}: {e}")
                continue
        
        if not query_doc_pairs:
            logger.warning("No valid query-document pairs for reranking")
            return candidates[:top_k]
        
        try:
            logger.info(f"Reranking {len(query_doc_pairs)} candidates...")
            rerank_scores = reranker_model.predict(query_doc_pairs)
            
            reranked_results = [
                (candidate_indices[i], float(rerank_scores[i]))
                for i in range(len(candidate_indices))
            ]
            
            reranked_results.sort(key=lambda x: x[1], reverse=True)
            
            top_reranked = reranked_results[:top_k]
            
            logger.info(f"Reranked to top {len(top_reranked)} most relevant players")
            logger.info(f"Rerank score range: {reranked_results[-1][1]:.3f} to {reranked_results[0][1]:.3f}")
            
            return top_reranked
        except Exception as rerank_error:
            logger.error(f"Reranking failed: {rerank_error}")
            logger.warning("Falling back to similarity scores...")
            
            fallback_results = [
                (candidate_indices[i], original_scores[i])
                for i in range(len(candidate_indices))
            ]
            
            fallback_results.sort(key=lambda x: x[1], reverse=True)
            
            top_fallback = fallback_results[:top_k]
            
            logger.info(f"📋 Using top {len(top_fallback)} candidates by similarity")
            return top_fallback
            
    except Exception as e:
        logger.error(f"Complete reranking failure: {e}")
        logger.warning("Using original candidate order...")
        return candidates[:top_k]

def prepare_player_data(ranked_players: List[Tuple[int, float]], player_metadata: List[Dict], detailed_profiles: List[Dict]) -> List[Dict[str, Any]]:
    """Prepare structured player data from ranked indices."""
    logger.info(f"Step 4: Preparing player data for {len(ranked_players)} players...")
    player_data = []
    
    for rank, (idx, relevance_score) in enumerate(ranked_players, 1):
        try:
            metadata = player_metadata[idx]
            detailed = detailed_profiles[idx] if idx < len(detailed_profiles) else {}
            
            career_stats = detailed.get('career_stats', {})
            playing_style = detailed.get('playing_style', {})
            transfer_history = detailed.get('transfer_history', {})
            market_trends = detailed.get('market_value_trends', {})
            
            sub_position = detailed.get('sub_position', '') or metadata.get('sub_position', '')
            
            player_info = {
                'rank': rank,
                'relevance_score': round(relevance_score, 3),
                'basic_info': {
                    'name': metadata.get('name', 'Unknown'),
                    'age': metadata.get('age', 0),
                    'position': sub_position,
                    'sub_position': sub_position,
                    'nationality': metadata.get('nationality', 'Unknown'),
                    'current_club': metadata.get('current_club', 'Free Agent'),
                    'market_value': metadata.get('market_value', 0)
                },
                'performance': {
                    'total_appearances': career_stats.get('total_appearances', 0),
                    'total_goals': career_stats.get('total_goals', 0),
                    'total_assists': career_stats.get('total_assists', 0),
                    'goals_per_game': round(career_stats.get('goals_per_appearance', 0), 3),
                    'assists_per_game': round(career_stats.get('assists_per_appearance', 0), 3),
                    'minutes_played': career_stats.get('total_minutes', 0)
                },
                'profile': {
                    'experience_level': playing_style.get('experience_level', 'unknown'),
                    'goal_scoring_ability': playing_style.get('goal_scoring_ability', 'unknown'),
                    'discipline': playing_style.get('discipline', 'unknown'),
                    'transfer_count': transfer_history.get('total_transfers', 0),
                    'career_trajectory': transfer_history.get('career_trajectory', 'unknown')
                },
                'market_info': {
                    'peak_value': market_trends.get('peak_market_value', 0),
                    'value_trend': market_trends.get('value_trend', 'stable'),
                    'value_change': market_trends.get('recent_value_change', 0)
                },
                'scouting_summary': metadata.get('embedding_text', '')
            }
            player_data.append(player_info)
        except Exception as e:
            logger.warning(f"Error preparing data for player {idx}: {e}")
            continue
    
    logger.info(f"Prepared data for {len(player_data)} players")
    return player_data

## gemini integration
- this is to get our final scouting report

In [24]:
def create_gemini_prompt(query: str, top_players: List[Dict[str, Any]]) -> str:
    """Create optimized prompt for Gemini."""
    prompt = f"""You are a professional soccer scout with 20+ years of experience analyzing players for top clubs worldwide. Provide expert analysis for this scouting request.

SCOUTING REQUEST: "{query}"

TOP RECOMMENDED PLAYERS:
"""
    for i, player in enumerate(top_players, 1):
        basic = player['basic_info']
        perf = player['performance']
        profile = player['profile']
        market = player['market_info']
        
        prompt += f"""
Player {i}: {basic['name']}
• Position: {basic['position']}{f" (detailed: {basic['sub_position']})" if basic.get('sub_position') and basic['sub_position'] != basic['position'] else ""} | Age: {basic['age']} | Nation: {basic['nationality']}
• Current Club: {basic['current_club']} | Market Value: €{basic['market_value']:,}
• Career Stats: {perf['total_goals']} goals, {perf['total_assists']} assists in {perf['total_appearances']} appearances
• Performance Ratios: {perf['goals_per_game']:.3f} goals/game, {perf['assists_per_game']:.3f} assists/game
• Player Profile: {profile['experience_level']} player, {profile['goal_scoring_ability']} goalscorer, {profile['discipline']} discipline
• Career: {profile['transfer_count']} transfers, {profile['career_trajectory']} trajectory
• Market: Peak value €{market['peak_value']:,}, trend: {market['value_trend']}
• Relevance Score: {player['relevance_score']:.3f}/1.0
"""
    prompt += f"""
REQUIRED ANALYSIS:
1. **Match Assessment** (2-3 sentences): How well do these players meet the specific criteria?
2. **Top Recommendation** (3-4 sentences): Deep dive on #1 pick - strengths, style, why they're perfect
3. **Alternative Options** (2-3 sentences): Quick analysis of players 2-3 and their unique value
4. **Key Statistics** (2 sentences): Most important performance metrics that stand out
5. **Scouting Concerns** (1-2 sentences): Any potential risks or areas to investigate
6. **Final Verdict** (1-2 sentences): Clear recommendation for the scout

Keep the analysis professional, data-driven, and actionable. Focus on insights that help make informed scouting decisions. Write in a confident, expert tone. Total length: 350-450 words.
"""
    return prompt

def generate_gemini_response(query: str, player_data: List[Dict[str, Any]], gemini_model: Optional[genai.GenerativeModel], top_n: int = 5) -> Dict[str, Any]:
    """Generate response using Gemini or fallback to template."""
    logger.info("Step 5: Generating Gemini response...")
    
    top_players = player_data[:top_n]
    
    if not gemini_model:
        logger.warning("Gemini not available - using template response")
        return generate_template_response(query, top_players)
    
    try:
        prompt = create_gemini_prompt(query, top_players)
        response = gemini_model.generate_content(prompt)
        ai_response = response.text
        
        logger.info("Gemini response generated successfully")
        
        return {
            'success': True,
            'query': query,
            'ai_response': ai_response,
            'top_players': top_players,
            'total_candidates_found': len(player_data),
            'search_metadata': {
                'timestamp': datetime.now().isoformat(),
                'model_used': 'gemini-1.5-flash',
                'reranking_applied': True,
                'filters_applied': True,
                'filter_first_approach': True
            }
        }
    except Exception as e:
        logger.error(f"Gemini API failed: {e}")
        return generate_template_response(query, top_players)

def generate_template_response(query: str, top_players: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate fallback template response."""
    if not top_players:
        ai_response = f"I couldn't find any players matching your criteria: '{query}'. Try adjusting your search parameters."
    else:
        top_player = top_players[0]
        basic = top_player['basic_info']
        perf = top_player['performance']
        
        response_parts = [
            f"Based on your query '{query}', I've identified {len(top_players)} potential candidates.",
            f"My top recommendation is {basic['name']}, a {basic['age']}-year-old {basic['position']} from {basic['nationality']}.",
            f"Currently at {basic['current_club']}, they have scored {perf['total_goals']} goals in {perf['total_appearances']} appearances ({perf['goals_per_game']:.2f} per game).",
            f"With a market value of €{basic['market_value']:,}, they represent excellent value for your scouting requirements."
        ]
        
        if len(top_players) > 1:
            response_parts.append(f"Other strong candidates include {top_players[1]['basic_info']['name']} and {top_players[2]['basic_info']['name'] if len(top_players) > 2 else 'others'}, each bringing unique strengths to consider.")
        
        ai_response = " ".join(response_parts)
    
    return {
        'success': True,
        'query': query,
        'ai_response': ai_response,
        'top_players': top_players,
        'total_candidates_found': len(top_players), # Adjusted for template to reflect only top_n
        'search_metadata': {
            'timestamp': datetime.now().isoformat(),
            'model_used': 'template_fallback',
            'reranking_applied': False, # Reranking wasn't explicitly used if Gemini failed
            'filters_applied': True,
            'filter_first_approach': True
        }
    }

In [25]:
def get_player_details(player_name: str, player_metadata: List[Dict], detailed_profiles: List[Dict]) -> Dict[str, Any]:
    """Get detailed information about a specific player."""
    logger.info(f"Getting details for: {player_name}")
    
    for i, metadata in enumerate(player_metadata):
        if player_name.lower() in metadata['name'].lower():
            detailed = (detailed_profiles[i] if i < len(detailed_profiles) else {})
            
            # Prioritize detailed sub_position, fallback to metadata's sub_position, then 'Unknown'
            sub_position_val = detailed.get('sub_position', metadata.get('sub_position', 'Unknown'))
            
            return {
                'success': True,
                'player': {
                    'basic_info': {
                        'name': metadata.get('name'),
                        'age': metadata.get('age'),
                        'position': sub_position_val,
                        'sub_position': sub_position_val,
                        'nationality': metadata.get('nationality'),
                        'current_club': metadata.get('current_club'),
                        'market_value': metadata.get('market_value')
                    },
                    'detailed_profile': detailed,
                    'embedding_text': metadata.get('embedding_text', '')
                }
            }
    
    return {
        'success': False,
        'error': f"Player '{player_name}' not found in database"
    }

def get_similar_players(player_name: str, player_embeddings: np.ndarray, player_metadata: List[Dict], detailed_profiles: List[Dict], top_n: int = 10) -> Dict[str, Any]:
    """Find players similar to a given player."""
    logger.info(f"Finding players similar to: {player_name}")
    
    target_idx = None
    for i, metadata in enumerate(player_metadata):
        if player_name.lower() in metadata['name'].lower():
            target_idx = i
            break
    
    if target_idx is None:
        return {
            'success': False,
            'error': f"Player '{player_name}' not found"
        }
    
    try:
        target_embedding = player_embeddings[target_idx].reshape(1, -1)
        
        similarities = cosine_similarity(target_embedding, player_embeddings)[0]
        
        similarities[target_idx] = -1 # Exclude self
        
        top_indices = np.argsort(similarities)[::-1][:top_n]
        similar_players_raw = [(int(idx), float(similarities[idx])) for idx in top_indices]
        
        player_data = prepare_player_data(similar_players_raw, player_metadata, detailed_profiles)
        
        return {
            'success': True,
            'target_player': player_name,
            'similar_players': player_data,
            'search_metadata': {
                'timestamp': datetime.now().isoformat(),
                'similarity_method': 'cosine_similarity'
            }
        }
    except Exception as e:
        logger.error(f"Similarity search failed: {e}")
        return {
            'success': False,
            'error': f"Failed to find similar players: {str(e)}"
        }

def get_system_stats(player_metadata: List[Dict], player_embeddings: np.ndarray, gemini_model: Optional[genai.GenerativeModel], embedding_model_name: str, reranker_model_name: str) -> Dict[str, Any]:
    """Get system statistics and health."""
    return {
        'success': True,
        'stats': {
            'total_players': len(player_metadata),
            'embedding_dimension': player_embeddings.shape[1] if player_embeddings is not None else 0,
            'models': {
                'embedding_model': embedding_model_name,
                'reranker_model': reranker_model_name,
                'gemini_available': gemini_model is not None
            },
            'system_health': 'operational',
            'approach': 'filter_first_rag',
            'last_updated': datetime.now().isoformat()
        }
    }

## loading models for embeddings and gemini key iniitalization

In [None]:
EMBEDDINGS_DIR = Path("embeddings") # might need to change this depending on where you put the embeddings folder
GEMINI_API_KEY = "put key here"  # change this to your api key
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

logger.info("Loading embeddings and metadata...")
player_embeddings: Optional[np.ndarray] = None
player_metadata: Optional[List[Dict]] = None
detailed_profiles: Optional[List[Dict]] = None

try:
    embeddings_file = EMBEDDINGS_DIR / "player_embeddings.npy"
    if embeddings_file.exists():
        player_embeddings = np.load(embeddings_file)
        logger.info(f"Loaded {player_embeddings.shape[0]} player embeddings")
    else:
        raise FileNotFoundError(f"Embeddings file not found: {embeddings_file}")
    
    metadata_file = EMBEDDINGS_DIR / "player_metadata.json"
    if metadata_file.exists():
        with open(metadata_file, 'r', encoding='utf-8') as f:
            player_metadata = json.load(f)
        logger.info(f"Loaded metadata for {len(player_metadata)} players")
    else:
        raise FileNotFoundError(f"Metadata file not found: {metadata_file}")
    
    profiles_file = EMBEDDINGS_DIR / "detailed_player_profiles.json"
    if profiles_file.exists():
        with open(profiles_file, 'r', encoding='utf-8') as f:
            detailed_profiles = json.load(f)
        logger.info(f"Loaded detailed profiles for {len(detailed_profiles)} players")
    else:
        logger.warning(f"Detailed profiles file not found at {profiles_file}. Some player details might be missing.")
        detailed_profiles = [{}] * len(player_metadata) # Initialize with empty dicts to match length
        
except Exception as e:
    logger.error(f"Failed to load embeddings and/or metadata: {e}")
    # Exit or handle gracefully if data is missing
    raise

# initialize models
logger.info("Initializing ML models...")
embedding_model: Optional[SentenceTransformer] = None
reranker_model: Optional[CrossEncoder] = None

try:
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cpu')
    logger.info(f"Loaded embedding model: {EMBEDDING_MODEL_NAME}")
    
    reranker_model = CrossEncoder(RERANKER_MODEL_NAME, device='cpu')
    logger.info(f"Loaded reranker model: {RERANKER_MODEL_NAME}")
    
except Exception as e:
    logger.error(f"Failed to initialize embedding/reranker models: {e}")
    # Handle gracefully; search functions will need to check if models are None
    embedding_model = None
    reranker_model = None

# gemini setup
print(f"Current GEMINI_API_KEY: {GEMINI_API_KEY}")
gemini_model: Optional[genai.GenerativeModel] = None
if GEMINI_API_KEY and GEMINI_API_KEY != "change this":
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel('gemini-1.5-flash')
        logger.info("Gemini API configured successfully")
    except Exception as e:
        logger.error(f"Gemini setup failed: {e}. Ensure API key is valid and has access.")
        gemini_model = None
else:
    logger.warning("No Gemini API key provided or placeholder used. Gemini response will be unavailable.")

logger.info("System initialization complete!")


league_mappings = initialize_league_mappings()

2025-06-26 14:58:20,727 - INFO - Loading embeddings and metadata...
2025-06-26 14:58:20,775 - INFO - Loaded 6148 player embeddings
2025-06-26 14:58:20,807 - INFO - Loaded metadata for 6148 players
2025-06-26 14:58:20,998 - INFO - Loaded detailed profiles for 6148 players
2025-06-26 14:58:20,998 - INFO - Initializing ML models...
2025-06-26 14:58:21,004 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-06-26 14:58:22,384 - INFO - Loaded embedding model: all-MiniLM-L6-v2
2025-06-26 14:58:23,727 - INFO - Loaded reranker model: cross-encoder/ms-marco-MiniLM-L-6-v2
2025-06-26 14:58:23,728 - INFO - System initialization complete!


Current GEMINI_API_KEY: AIzaSyAV8wl_U6ArV7Xc2i-D-42SC0JfZ1h-kd8


## getting our players

In [27]:
def run_search(query: str, top_k_initial: int = 100, top_k_rerank: int = 30, top_n_final: int = 5) -> Dict[str, Any]:
    """
    Main search function with filter-first approach.
    Assumes player_embeddings, player_metadata, detailed_profiles, embedding_model,
    reranker_model, gemini_model, and league_mappings are available in the global scope.
    """
    
    logger.info(f"Starting FILTER-FIRST RAG pipeline for query: '{query}'")
    
    if player_metadata is None or player_embeddings is None or embedding_model is None or reranker_model is None:
        return {
            'success': False,
            'query': query,
            'error': 'RAG system not fully initialized. Missing data or models.',
            'ai_response': 'The scouting system is not fully operational. Please check data and model loading.',
            'top_players': [],
            'total_candidates_found': 0,
            'search_metadata': {
                'timestamp': datetime.now().isoformat(),
                'filter_first_approach': True,
                'filtered_dataset_size': 0
            }
        }
        
    try:
        # Step 1: Parse query and extract filters
        filters = parse_query(query, league_mappings)
        
        # Step 2: Pre-filter the entire dataset based on extracted filters
        filtered_indices = get_filtered_dataset_indices(filters, player_metadata, detailed_profiles, league_mappings)
        
        if len(filtered_indices) == 0:
            logger.error("No players found after filtering! Returning empty results.")
            return {
                'success': False,
                'query': query,
                'error': 'No players match the specified criteria after strict filtering.',
                'ai_response': f"No players found matching your specific criteria: '{query}'. Please broaden your search or adjust filters.",
                'top_players': [],
                'total_candidates_found': 0,
                'search_metadata': {
                    'timestamp': datetime.now().isoformat(),
                    'filter_first_approach': True,
                    'filtered_dataset_size': 0
                }
            }
        
        # Step 3: Perform vector similarity search ONLY on filtered dataset
        vector_candidates = vector_similarity_search_on_filtered_dataset(
            query, filtered_indices, player_embeddings, embedding_model, top_k_initial
        )
        
        # Step 4: Rerank the candidates from filtered dataset
        reranked_candidates = rerank_candidates(query, vector_candidates, player_metadata, reranker_model, top_k_rerank)
        
        # Step 5: Prepare player data for LLM
        player_data = prepare_player_data(reranked_candidates, player_metadata, detailed_profiles)
        
        # Step 6: Generate final response using Gemini
        final_response = generate_gemini_response(query, player_data, gemini_model, top_n_final)
        
        # Add metadata about the filtering process
        final_response['search_metadata'].update({
            'filter_first_approach': True,
            'original_dataset_size': len(player_metadata),
            'filtered_dataset_size': len(filtered_indices),
            'filters_applied': filters,
            'filtering_efficiency': f"{len(filtered_indices)}/{len(player_metadata)} players retained"
        })
        
        logger.info("FILTER-FIRST RAG pipeline completed successfully!")
        logger.info(f"Dataset reduction: {len(player_metadata)} → {len(filtered_indices)} players ({len(filtered_indices)/len(player_metadata)*100:.1f}% retained)")
        
        return final_response
        
    except Exception as e:
        logger.error(f"FILTER-FIRST RAG pipeline failed: {e}")
        return {
            'success': False,
            'query': query,
            'error': str(e),
            'ai_response': f"I encountered an error processing your query: {str(e)}",
            'top_players': [],
            'total_candidates_found': 0,
            'search_metadata': {
                'timestamp': datetime.now().isoformat(),
                'error': True,
                'filter_first_approach': True
            }
        }

### example usage

In [40]:
QUERY = "young brazilian strikers under 25 from all european leagues"
TOP_N_RESULTS = 10
OUTPUT_FILE_PATH = "outputs/search_results.json" # saving results, might need to change path if needed

# Ensure output directory exists for search results
Path("outputs").mkdir(exist_ok=True)

start_time = datetime.now()
search_result = run_search(
    query=QUERY,
    top_k_initial=150, # Number of candidates from vector search before reranking
    top_k_rerank=40,   # Number of candidates after reranking
    top_n_final=TOP_N_RESULTS # Number of final players for Gemini to analyze
)
end_time = datetime.now()

search_result['processing_time_seconds'] = (end_time - start_time).total_seconds()

# Save to file if specified
if OUTPUT_FILE_PATH:
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as f:
        json.dump(search_result, f, indent=2, ensure_ascii=False)
    logger.info(f"Results saved to: {OUTPUT_FILE_PATH}")


if search_result['success']:
    print(f"\n--- Search Completed Successfully! ---")
    print(f"Query: {search_result['query']}")
    print(f"AI Response: \n{search_result['ai_response']}")
    
    metadata = search_result.get('search_metadata', {})
    print(f"\nSearch Metadata:")
    print(f"  Total Candidates Found (after RAG steps): {search_result['total_candidates_found']}")
    print(f"  Top Players Recommended: {len(search_result['top_players'])}")
    print(f"  Original Dataset Size: {metadata.get('original_dataset_size', 'N/A')}")
    print(f"  Filtered Dataset Size: {metadata.get('filtered_dataset_size', 'N/A')}")
    print(f"  Filtering Efficiency: {metadata.get('filtering_efficiency', 'N/A')}")
    print(f"  Filters Applied: {metadata.get('filters_applied', 'None')}")
    print(f"  Processing Time: {search_result.get('processing_time_seconds', 0):.2f} seconds")
    
    if search_result['top_players']:
        print("\n--- Top Recommended Players Details (Summary) ---")
        for player in search_result['top_players']:
            basic = player['basic_info']
            print(f"  Rank {player['rank']}: {basic['name']} ({basic['age']}, {basic['position']}) from {basic['current_club']} (MV: €{basic['market_value']:,})")
else:
    print(f"\n--- Search Failed ---")
    print(f"Error: {search_result['error']}")
    print(f"Query: {search_result['query']}")

2025-06-26 15:01:00,142 - INFO - Starting FILTER-FIRST RAG pipeline for query: 'young brazilian strikers under 25 from all european leagues'
2025-06-26 15:01:00,146 - INFO - Parsing query: 'young brazilian strikers under 25 from all european leagues'
2025-06-26 15:01:00,147 - INFO - Detected position: centre-forward
2025-06-26 15:01:00,148 - INFO - Detected nationality (adjective): brazilian -> Brazil
2025-06-26 15:01:00,148 - INFO - Detected max age: 25
2025-06-26 15:01:00,149 - INFO - Final extracted filters: {'position': 'centre-forward', 'nationality': 'Brazil', 'max_age': 25}
2025-06-26 15:01:00,149 - INFO - Step 1: Pre-filtering dataset with filters: {'position': 'centre-forward', 'nationality': 'Brazil', 'max_age': 25}
2025-06-26 15:01:00,177 - INFO - Pre-filtering results:
2025-06-26 15:01:00,178 - INFO -   Total players processed: 6148
2025-06-26 15:01:00,178 - INFO -   Players passing all filters: 23
2025-06-26 15:01:00,179 - INFO -   position_rejected: 5402 players rejected


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-26 15:01:00,550 - INFO - Reranked to top 23 most relevant players
2025-06-26 15:01:00,551 - INFO - Rerank score range: nan to nan
2025-06-26 15:01:00,551 - INFO - Step 4: Preparing player data for 23 players...
2025-06-26 15:01:00,553 - INFO - Prepared data for 23 players
2025-06-26 15:01:00,553 - INFO - Step 5: Generating Gemini response...
2025-06-26 15:01:00,554 - INFO - FILTER-FIRST RAG pipeline completed successfully!
2025-06-26 15:01:00,554 - INFO - Dataset reduction: 6148 → 23 players (0.4% retained)
2025-06-26 15:01:00,555 - INFO - Results saved to: outputs/search_results.json



--- Search Completed Successfully! ---
Query: young brazilian strikers under 25 from all european leagues
AI Response: 
Based on your query 'young brazilian strikers under 25 from all european leagues', I've identified 10 potential candidates. My top recommendation is João Pedro, a 23-year-old Centre-Forward from Brazil. Currently at Brighton and Hove Albion Football Club, they have scored 32 goals in 102 appearances (0.31 per game). With a market value of €50,000,000, they represent excellent value for your scouting requirements. Other strong candidates include Daniel Silva and Carlos Eduardo, each bringing unique strengths to consider.

Search Metadata:
  Total Candidates Found (after RAG steps): 10
  Top Players Recommended: 10
  Original Dataset Size: 6148
  Filtered Dataset Size: 23
  Filtering Efficiency: 23/6148 players retained
  Filters Applied: {'position': 'centre-forward', 'nationality': 'Brazil', 'max_age': 25}
  Processing Time: 0.41 seconds

--- Top Recommended Players 