In [3]:
import os
import pandas as pd
import sqlite3 as sql

In [4]:
# Location of the SQLite database holding the understat lineup/game stats,
# relative to the notebook's working directory
path = (
    '../data/understat/understat_game_data/'
    'understat_lineup_game_stats.db'
)

In [5]:
# Load every lineup_stats row together with the match date (general_game_stats)
# and the club name (clubs).
# NOTE(review): the preview of this frame shows the same row repeated several
# times, so one of the joins probably fans out — confirm that
# general_game_stats.id and clubs.club_id are unique in the database.
con = sql.connect(path)
try:
    lineup = pd.read_sql("""SELECT player.*,
                                game.date,
                                clubs.club
                     FROM lineup_stats AS player
                    JOIN general_game_stats AS game ON game.id = player.match_id
                    JOIN clubs ON clubs.club_id = player.team_id""",con)
finally:
    # Release the database handle even when the query raises
    con.close()
lineup.head()

Unnamed: 0,match_id,goals,own_goals,shots,xG,time,player_id,team_id,position,player,...,roster_in,roster_out,key_passes,assists,xA,xGChain,xGBuildup,positionOrder,date,club
0,81,0,0,0,0.0,90,560,89,GK,Sergio Romero,...,0,0,0,0,0.0,0.0,0.0,1,2015-08-08 15:45:00,Manchester United
1,81,0,0,0,0.0,90,560,89,GK,Sergio Romero,...,0,0,0,0,0.0,0.0,0.0,1,2015-08-08 15:45:00,Manchester United
2,81,0,0,0,0.0,90,560,89,GK,Sergio Romero,...,0,0,0,0,0.0,0.0,0.0,1,2015-08-08 15:45:00,Manchester United
3,81,0,0,0,0.0,90,560,89,GK,Sergio Romero,...,0,0,0,0,0.0,0.0,0.0,1,2015-08-08 15:45:00,Manchester United
4,81,0,0,0,0.0,90,560,89,GK,Sergio Romero,...,0,0,0,0,0.0,0.0,0.0,1,2015-08-08 15:45:00,Manchester United


In [6]:
# Inspect the column names of the joined lineup frame
lineup.columns

Index(['match_id', 'goals', 'own_goals', 'shots', 'xG', 'time', 'player_id',
       'team_id', 'position', 'player', 'h_a', 'yellow_card', 'red_card',
       'roster_in', 'roster_out', 'key_passes', 'assists', 'xA', 'xGChain',
       'xGBuildup', 'positionOrder', 'date', 'club'],
      dtype='object')

In [7]:
# Keep only the columns used later on: match date, player, club, position,
# home/away flag and the per-match statistics
lineup = lineup.loc[:, [
    'date', 'player',
    'goals', 'own_goals', 'shots', 'xG', 'time',
    'club', 'position', 'h_a',
    'yellow_card', 'red_card',
    'key_passes', 'assists', 'xA', 'xGChain', 'xGBuildup',
]]

In [8]:
# Preview the first rows of lineup after the column selection
lineup.head()

Unnamed: 0,date,player,goals,own_goals,shots,xG,time,club,position,h_a,yellow_card,red_card,key_passes,assists,xA,xGChain,xGBuildup
0,2015-08-08 15:45:00,Sergio Romero,0,0,0,0.0,90,Manchester United,GK,h,0,0,0,0,0.0,0.0,0.0
1,2015-08-08 15:45:00,Sergio Romero,0,0,0,0.0,90,Manchester United,GK,h,0,0,0,0,0.0,0.0,0.0
2,2015-08-08 15:45:00,Sergio Romero,0,0,0,0.0,90,Manchester United,GK,h,0,0,0,0,0.0,0.0,0.0
3,2015-08-08 15:45:00,Sergio Romero,0,0,0,0.0,90,Manchester United,GK,h,0,0,0,0,0.0,0.0,0.0
4,2015-08-08 15:45:00,Sergio Romero,0,0,0,0.0,90,Manchester United,GK,h,0,0,0,0,0.0,0.0,0.0


In [9]:
# Read the combined transfer / market-value table from the working directory
df_transfer = pd.read_csv(filepath_or_buffer='combined_output.csv')

In [10]:
# Preview the first rows of the transfer / market-value table
df_transfer.head()

Unnamed: 0,player_id,name,last_season,current_club_id,date_of_birth,sub_position,position,foot,height_in_cm,date,market_value_in_eur_x,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur_y
0,10,Miroslav Klose,2015,398,1978-06-09 00:00:00,Centre-Forward,Attack,right,184.0,2004-10-04,7000000.0,,,,,,,,
1,10,Miroslav Klose,2015,398,1978-06-09 00:00:00,Centre-Forward,Attack,right,184.0,2005-01-07,9000000.0,,,,,,,,
2,10,Miroslav Klose,2015,398,1978-06-09 00:00:00,Centre-Forward,Attack,right,184.0,2005-05-05,12000000.0,,,,,,,,
3,10,Miroslav Klose,2015,398,1978-06-09 00:00:00,Centre-Forward,Attack,right,184.0,2005-09-30,15000000.0,,,,,,,,
4,10,Miroslav Klose,2015,398,1978-06-09 00:00:00,Centre-Forward,Attack,right,184.0,2006-01-09,20000000.0,,,,,,,,


In [11]:
# Inspect the column names of df_transfer
df_transfer.columns

Index(['player_id', 'name', 'last_season', 'current_club_id', 'date_of_birth',
       'sub_position', 'position', 'foot', 'height_in_cm', 'date',
       'market_value_in_eur_x', 'transfer_date', 'transfer_season',
       'from_club_id', 'to_club_id', 'from_club_name', 'to_club_name',
       'transfer_fee', 'market_value_in_eur_y'],
      dtype='object')

In [12]:
# Ensure date columns are in datetime format.
# .assign returns a new frame instead of writing into `lineup`, which was
# produced by a column selection; an in-place `lineup['date'] = ...` on such a
# frame raises SettingWithCopyWarning on pandas < 3.
lineup = lineup.assign(date=pd.to_datetime(lineup['date']))
df_transfer = df_transfer.assign(date=pd.to_datetime(df_transfer['date']))

In [13]:
# Discard every row that has no date, in both tables
df_transfer = df_transfer.loc[df_transfer['date'].notna()]
lineup = lineup.loc[lineup['date'].notna()]

In [14]:
# Call the player column 'name', matching the column name in df_transfer
lineup = lineup.rename({'player': 'name'}, axis='columns')

In [15]:
# Order the transfer rows by player name, then date, under a fresh 0..n-1 index
df_transfer_sorted = df_transfer.sort_values(by=['name', 'date'], ignore_index=True)


In [16]:
# Function to process chunks from lineup
def process_chunk(chunk, transfers=None):
    """Attach each player's most recent market value to every lineup row.

    For every row of ``chunk`` the latest ``market_value_in_eur_x`` recorded
    for the same ``name`` on or before the row's ``date`` is looked up
    (``pd.merge_asof`` with ``direction='backward'``). Rows whose ``name``
    never appears in the transfer data are dropped; rows dated before a
    player's first valuation are kept with NaN as the market value.

    Parameters
    ----------
    chunk : pd.DataFrame
        Lineup rows with at least ``name`` and ``date`` columns. ``date``
        must have the same datetime dtype as the transfer data's ``date``.
    transfers : pd.DataFrame, optional
        Valuation history with ``name``, ``date`` and
        ``market_value_in_eur_x`` columns. Defaults to the notebook-level
        ``df_transfer_sorted``.

    Returns
    -------
    pd.DataFrame
        The columns of ``chunk`` followed by ``market_value_in_eur_x``,
        ordered by ``name`` then ``date``, with a fresh 0..n-1 index. An
        empty frame with those columns is returned when no player in
        ``chunk`` is present in the transfer data.
    """
    if transfers is None:
        transfers = df_transfer_sorted

    # merge_asof needs both sides ordered by the `on` key; mergesort is stable,
    # so rows sharing a date keep their incoming order.
    values = transfers.loc[
        transfers['name'].notna(),
        ['name', 'date', 'market_value_in_eur_x'],
    ].sort_values('date', kind='mergesort')

    # Players without any valuation are excluded from the result.
    known = chunk['name'].isin(values['name'])
    left = chunk[known].sort_values('date', kind='mergesort')

    if left.empty:
        # Nothing to merge: hand back an empty frame with the expected columns
        # instead of failing on an empty concat/merge.
        empty_values = pd.Series(dtype=values['market_value_in_eur_x'].dtype)
        return left.reset_index(drop=True).assign(market_value_in_eur_x=empty_values)

    # One vectorised as-of join for all players at once; `by='name'` restricts
    # the backward search to valuations of the same player.
    merged = pd.merge_asof(
        left,
        values,
        on='date',
        by='name',
        direction='backward'
    )

    # Restore the player-then-date ordering.
    return merged.sort_values(['name', 'date']).reset_index(drop=True)

In [17]:
# Number of lineup rows handed to process_chunk per iteration; adjust to the
# machine's specs (the author reports ~12 minutes with this value)
chunk_size = 200000

# Collects the merged output of every chunk
chunk_list = []

# Report the total row count, then walk over lineup in consecutive row slices
# and print each slice's start offset once it has been processed
print(len(lineup))
for start in range(0, len(lineup), chunk_size):
    chunk = lineup.iloc[start:start + chunk_size]
    merged_chunk = process_chunk(chunk)
    chunk_list.append(merged_chunk)
    print(start)

# Stitch the processed chunks together under a fresh 0..n-1 index
result = pd.concat(chunk_list, ignore_index=True)

# Persist the merged table without writing the index
result.to_csv('result.csv', index=False)

result.loc[:, ['date', 'name', 'market_value_in_eur_x']].head()

4855643
0


In [43]:
# Player whose rows should be shown
player_name = "Erling Haaland"

# Keep only the rows of `result` whose name matches that player
player_data = result[result['name'].eq(player_name)]

# Show the filtered rows
player_data

Unnamed: 0,date,name,goals,own_goals,shots,xG,time,club,position,h_a,yellow_card,red_card,key_passes,assists,xA,xGChain,xGBuildup,market_value_in_eur_x
1163525,2020-01-18 14:30:00,Erling Haaland,3,0,3,1.322790,33,Borussia Dortmund,Sub,a,0,0,0,0,0.000000,1.346000,0.023214,45000000.0
1163526,2020-01-18 14:30:00,Erling Haaland,3,0,3,1.322790,33,Borussia Dortmund,Sub,a,0,0,0,0,0.000000,1.346000,0.023214,45000000.0
1163527,2020-01-18 14:30:00,Erling Haaland,3,0,3,1.322790,33,Borussia Dortmund,Sub,a,0,0,0,0,0.000000,1.346000,0.023214,45000000.0
1163528,2020-01-18 14:30:00,Erling Haaland,3,0,3,1.322790,33,Borussia Dortmund,Sub,a,0,0,0,0,0.000000,1.346000,0.023214,45000000.0
1163529,2020-01-18 14:30:00,Erling Haaland,3,0,3,1.322790,33,Borussia Dortmund,Sub,a,0,0,0,0,0.000000,1.346000,0.023214,45000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164982,2024-09-28 11:30:00,Erling Haaland,0,0,5,0.391001,90,Manchester City,FW,a,0,0,1,0,0.118835,0.673501,0.163665,180000000.0
1164983,2024-09-28 11:30:00,Erling Haaland,0,0,5,0.391001,90,Manchester City,FW,a,0,0,1,0,0.118835,0.673501,0.163665,180000000.0
1164984,2024-09-28 11:30:00,Erling Haaland,0,0,5,0.391001,90,Manchester City,FW,a,0,0,1,0,0.118835,0.673501,0.163665,180000000.0
1164985,2024-09-28 11:30:00,Erling Haaland,0,0,5,0.391001,90,Manchester City,FW,a,0,0,1,0,0.118835,0.673501,0.163665,180000000.0
