In [1]:
import pandas as pd
import pyarrow as pa
import schemas
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
def download_dataset(dataset_name="davidcariboo/player-scores", destination_dir="../dataset/raw/"):
    """Fetch a Kaggle dataset and unpack it into ``destination_dir``.

    Args:
        dataset_name: Dataset identifier handed to the Kaggle API
            (defaults to "davidcariboo/player-scores").
        destination_dir: Path passed to the API as the download location
            (defaults to "../dataset/raw/").

    Returns:
        None. The function is called for its side effect only.

    NOTE(review): none of the visible cells call this function; the loading
    cells read CSVs from "../dataset/raw/" directly, so presumably this has
    to be run once by hand first -- confirm.
    """
    client = KaggleApi()
    # Authenticate first, then request the files with unzip enabled.
    client.authenticate()
    client.dataset_download_files(dataset_name, path=destination_dir, unzip=True)

In [3]:
# Load appearances.csv with the dtype mapping and date columns declared on
# schemas.Appearances, then display the resulting column dtypes.
# NOTE(review): the id columns (game_id, player_id, ...) come out as strings
# here, while the other tables load game_id/player_id as Int64 -- confirm the
# schemas agree before joining on these keys.
df = pd.read_csv(
    "../dataset/raw/appearances.csv",
    dtype=schemas.Appearances.schema,
    parse_dates=schemas.Appearances.date_cols,
)
df.dtypes

appearance_id             string[python]
game_id                   string[python]
player_id                 string[python]
player_club_id            string[python]
player_current_club_id    string[python]
date                      datetime64[ns]
player_name               string[python]
competition_id            string[python]
yellow_cards                       Int64
red_cards                          Int64
goals                              Int64
assists                            Int64
minutes_played                   Float64
dtype: object

In [4]:
# Load club_games.csv with the dtype mapping from schemas.Club_Games and
# display the resulting column dtypes. No date columns are parsed here.
df = pd.read_csv(
    "../dataset/raw/club_games.csv",
    dtype=schemas.Club_Games.schema,
)
df.dtypes

game_id                           Int64
club_id                           Int64
own_goals                         Int64
own_position             string[python]
own_manager_name         string[python]
opponent_id                       Int64
opponent_goals                    Int64
opponent_position        string[python]
opponent_manager_name    string[python]
hosting                  string[python]
is_win                            Int64
dtype: object

In [5]:
# Load clubs.csv with the dtype mapping from schemas.Clubs and display the
# resulting column dtypes.
df = pd.read_csv(
    "../dataset/raw/clubs.csv",
    dtype=schemas.Clubs.schema,
)
df.dtypes

club_id                             Int64
club_code                  string[python]
name                       string[python]
domestic_competition_id    string[python]
total_market_value                Float64
squad_size                          Int64
average_age                       Float64
foreigners_number                   Int64
foreigners_percentage             Float64
national_team_players               Int64
stadium_name               string[python]
stadium_seats                       Int64
net_transfer_record        string[python]
coach_name                 string[python]
last_season                         Int64
filename                   string[python]
url                        string[python]
dtype: object

In [6]:
# Load competitions.csv with the dtype mapping from schemas.Competitions and
# display the resulting column dtypes.
df = pd.read_csv(
    "../dataset/raw/competitions.csv",
    dtype=schemas.Competitions.schema,
)
df.dtypes

competition_id              string[python]
competition_code            string[python]
name                        string[python]
sub_type                    string[python]
type                        string[python]
country_id                           Int64
country_name                string[python]
domestic_league_code        string[python]
confederation               string[python]
url                         string[python]
is_major_national_league           boolean
dtype: object

In [7]:
# Load game_events.csv with the dtype mapping from schemas.Game_Events and
# display the resulting column dtypes.
# NOTE(review): unlike the other dated tables, no parse_dates is passed here,
# so `date` stays a string column (see the dtypes output) -- confirm whether
# that is intentional or whether Game_Events should also declare date columns.
df = pd.read_csv(
    "../dataset/raw/game_events.csv",
    dtype=schemas.Game_Events.schema,
)
df.dtypes

game_event_id       string[python]
date                string[python]
game_id                      Int64
minute                       Int64
type                string[python]
club_id                      Int64
player_id                    Int64
description         string[python]
player_in_id               Float64
player_assist_id           Float64
dtype: object

In [8]:
# Load game_lineups.csv with the dtype mapping and date columns declared on
# schemas.Game_Lineups, then display the resulting column dtypes.
df = pd.read_csv(
    "../dataset/raw/game_lineups.csv",
    dtype=schemas.Game_Lineups.schema,
    parse_dates=schemas.Game_Lineups.date_cols,
)
df.dtypes

game_lineups_id    string[python]
date               datetime64[ns]
game_id                     Int64
player_id                   Int64
club_id                     Int64
player_name        string[python]
type               string[python]
position           string[python]
number             string[python]
team_captain                Int64
dtype: object

In [9]:
# Load games.csv with the dtype mapping and date columns declared on
# schemas.Games, then display the resulting column dtypes.
df = pd.read_csv(
    "../dataset/raw/games.csv",
    dtype=schemas.Games.schema,
    parse_dates=schemas.Games.date_cols,
)
df.dtypes

game_id                            Int64
competition_id            string[python]
season                             Int64
round                     string[python]
date                      datetime64[ns]
home_club_id                       Int64
away_club_id                       Int64
home_club_goals                    Int64
away_club_goals                    Int64
home_club_position               Float64
away_club_position               Float64
home_club_manager_name    string[python]
away_club_manager_name    string[python]
stadium                   string[python]
attendance                       Float64
referee                   string[python]
url                       string[python]
home_club_formation       string[python]
away_club_formation       string[python]
home_club_name            string[python]
away_club_name            string[python]
aggregate                 string[python]
competition_type          string[python]
dtype: object

In [10]:
# Load player_valuations.csv with the dtype mapping and date columns declared
# on schemas.Player_Valuations, then display the resulting column dtypes.
# `df` stays bound to this table; the following cell inspects its tail.
df = pd.read_csv(
    "../dataset/raw/player_valuations.csv",
    dtype=schemas.Player_Valuations.schema,
    parse_dates=schemas.Player_Valuations.date_cols,
)
df.dtypes

player_id                                       Int64
date                                   datetime64[ns]
market_value_in_eur                             Int64
current_club_id                                 Int64
player_club_domestic_competition_id    string[python]
dtype: object

In [11]:
# Show the last ten rows of whatever frame `df` is currently bound to
# (the player valuations table when the cells are run in order).
df.tail(n=10)

Unnamed: 0,player_id,date,market_value_in_eur,current_club_id,player_club_domestic_competition_id
466892,769063,2024-03-21,5000000,3302,ES1
466893,826224,2024-03-21,10000000,12321,ES1
466894,844637,2024-03-21,3000000,3302,ES1
466895,895480,2024-03-21,1000000,621,ES1
466896,937955,2024-03-21,7500000,131,ES1
466897,937958,2024-03-21,75000000,131,ES1
466898,938158,2024-03-21,7500000,131,ES1
466899,941869,2024-03-21,500000,3709,ES1
466900,962110,2024-03-21,25000000,131,ES1
466901,991268,2024-03-21,40000000,1108,ES1
