# Advanced Databases 2024/2025

## Project

**Group Number:** 15

**Group Elements:**
- Martim Emauz
- Miguel Martins
- Rúben Torres

### Imports

In [None]:
import os
import random

import mysql.connector
import pandas as pd
from pymongo import MongoClient

### Utils

In [None]:
# (team name, city, region) rows. Team names are spelled with "and" rather
# than "&" (e.g. "Brighton and Hove Albion").
_TEAM_LOCATIONS = (
    ("Manchester City", "Manchester", "North"),
    ("Arsenal", "London", "South"),
    ("Liverpool", "Liverpool", "North"),
    ("Aston Villa", "Birmingham", "Midlands"),
    ("Tottenham Hotspur", "London", "South"),
    ("Chelsea", "London", "South"),
    ("Newcastle United", "Newcastle", "North"),
    ("Manchester United", "Manchester", "North"),
    ("West Ham United", "London", "South"),
    ("Crystal Palace", "London", "South"),
    ("Brighton and Hove Albion", "Brighton", "South"),
    ("AFC Bournemouth", "Bournemouth", "South"),
    ("Fulham", "London", "South"),
    ("Wolverhampton Wanderers", "Wolverhampton", "Midlands"),
    ("Everton", "Liverpool", "North"),
    ("Brentford", "Brentford", "South"),
    ("Nottingham Forest", "Nottingham", "Midlands"),
    ("Luton Town", "Luton", "South"),
    ("Burnley", "Burnley", "North"),
    ("Sheffield United", "Sheffield", "North"),
)

# Lookup table: team name -> {"city": ..., "region": ...}.
teams_data = {
    team_name: {"city": city, "region": region}
    for team_name, city, region in _TEAM_LOCATIONS
}

### DataFrames - Core

In [None]:
# Core frames for the coach, player and team collections.
#
# The team name is the key that links coaches and players to teams, so all
# three frames get the same "&" -> "and" normalisation. Previously only the
# coaches and teams frames were normalised, which would leave a player's team
# spelled differently from the team's own name whenever the players CSV
# contains "&" (NOTE(review): confirm against the CSV; the replace is a no-op
# if it does not).
df_coaches = pd.read_csv("./dataset/coaches.csv").replace("&", "and", regex=True)
df_players = pd.read_csv("./dataset/player_player_ratings.csv").replace(
    "&", "and", regex=True
)
df_teams = pd.read_csv("./dataset/pl_table_2023_24.csv").replace("&", "and", regex=True)

### DataFrames - Team Stats

In [None]:
def _read_team_columns(csv_path, wanted_columns):
    """Load one team-level CSV and keep only *wanted_columns*, in that order."""
    return pd.read_csv(csv_path)[wanted_columns]


# One frame per team statistic file, each keyed by the "Team" column.
df_accurate_cross_team = _read_team_columns(
    "./dataset/accurate_cross_team.csv",
    ["Team", "Cross Success (%)", "Accurate Crosses per Match"],
)
df_corner_taken_team = _read_team_columns(
    "./dataset/corner_taken_team.csv",
    ["Team", "Corners Taken"],
)
df_team_ratings = _read_team_columns(
    "./dataset/team_ratings.csv",
    ["Team", "FotMob Team Rating"],
)
df_touches_in_opp_box_team = _read_team_columns(
    "./dataset/touches_in_opp_box_team.csv",
    ["Team", "Touches in Opposition Box"],
)
df_possession_percentage_team = _read_team_columns(
    "./dataset/possession_percentage_team.csv",
    ["Team", "Possession (%)"],
)

# The league table supplies the list of team names ("name" renamed to "Team"
# so it matches the statistic files).
_league_table = pd.read_csv("./dataset/pl_table_2023_24.csv")
df_team_stats = _league_table.rename(columns={"name": "Team"})[["Team"]]

# Left-join every statistic onto the team list; teams missing from a file keep
# their row and get NaN for that file's columns.
for _team_stat_frame in (
    df_accurate_cross_team,
    df_corner_taken_team,
    df_team_ratings,
    df_touches_in_opp_box_team,
    df_possession_percentage_team,
):
    df_team_stats = df_team_stats.merge(_team_stat_frame, on="Team", how="left")

# Normalise "&" to "and" only after merging, so the joins above run on the
# names exactly as they appear in the CSV files.
df_team_stats = df_team_stats.replace("&", "and", regex=True)

### DataFrames - Player Stats

In [None]:
def _read_player_columns(csv_path, wanted_columns):
    """Load one player-level CSV and keep only *wanted_columns*, in that order."""
    return pd.read_csv(csv_path)[wanted_columns]


# One frame per player statistic file, each keyed by the "Player" column.
df_player_player_ratings = _read_player_columns(
    "./dataset/player_player_ratings.csv",
    ["Player", "FotMob Rating", "Player of the Match Awards"],
)
df_player_top_scorers = _read_player_columns(
    "./dataset/player_top_scorers.csv",
    ["Player", "Goals", "Penalties"],
)
df_player_expected_goals = _read_player_columns(
    "./dataset/player_expected_goals.csv",
    ["Player", "Expected Goals (xG)"],
)
df_player_big_chances_missed = _read_player_columns(
    "./dataset/player_big_chances_missed.csv",
    ["Player", "Big Chances Missed", "Shot Conversion Rate (%)"],
)
df_player_top_assists = _read_player_columns(
    "./dataset/player_top_assists.csv",
    ["Player", "Assists", "Secondary Assists"],
)
df_player_expected_assists = _read_player_columns(
    "./dataset/player_expected_assists.csv",
    ["Player", "Expected Assists (xA)"],
)
df_player_big_chances_created = _read_player_columns(
    "./dataset/player_big_chances_created.csv",
    ["Player", "Big Chances Created"],
)
df_player_accurate_passes = _read_player_columns(
    "./dataset/player_accurate_passes.csv",
    ["Player", "Accurate Passes per 90", "Pass Success (%)"],
)
df_player_accurate_long_balls = _read_player_columns(
    "./dataset/player_accurate_long_balls.csv",
    ["Player", "Accurate Long Balls per 90", "Successful Long Balls (%)"],
)
df_player_clean_sheets = _read_player_columns(
    "./dataset/player_clean_sheets.csv",
    ["Player", "Clean Sheets", "Goals Conceded"],
)
df_player_effective_clearances = _read_player_columns(
    "./dataset/player_effective_clearances.csv",
    ["Player", "Total Clearances"],
)
df_player_interceptions = _read_player_columns(
    "./dataset/player_interceptions.csv",
    ["Player", "Total Interceptions"],
)
df_player_contests_won = _read_player_columns(
    "./dataset/player_contests_won.csv",
    ["Player", "Successful Dribbles per 90", "Dribble Success Rate (%)"],
)
df_player_yellow_cards = _read_player_columns(
    "./dataset/player_yellow_cards.csv",
    ["Player", "Yellow Cards"],
)
df_player_red_cards = _read_player_columns(
    "./dataset/player_red_cards.csv",
    ["Player", "Red Cards"],
)
df_player_fouls_committed = _read_player_columns(
    "./dataset/player_fouls_committed.csv",
    ["Player", "Fouls Committed per 90"],
)

# The ratings frame is the base; every other statistic is left-joined onto it
# by player name, in this order (which also fixes the resulting column order).
df_player_stats = df_player_player_ratings
for _player_stat_frame in (
    df_player_top_scorers,
    df_player_expected_goals,
    df_player_big_chances_missed,
    df_player_top_assists,
    df_player_expected_assists,
    df_player_big_chances_created,
    df_player_accurate_passes,
    df_player_accurate_long_balls,
    df_player_clean_sheets,
    df_player_effective_clearances,
    df_player_interceptions,
    df_player_contests_won,
    df_player_yellow_cards,
    df_player_red_cards,
    df_player_fouls_committed,
):
    df_player_stats = df_player_stats.merge(
        _player_stat_frame, on="Player", how="left"
    )

# A player absent from a statistic file ends up with NaN there; store 0 instead.
df_player_stats = df_player_stats.fillna(0)

### MongoDB - Core

In [None]:
# Connection string: taken from the MONGO_URI environment variable when it is
# set, so real credentials do not have to be written into the notebook. The
# previous hard-coded local URI is kept as the fallback, so an existing local
# setup keeps working unchanged.
MONGO_URI = os.environ.get(
    "MONGO_URI", "mongodb://root:R00T_P4SSW0RD@localhost:27017/"
)
client = MongoClient(MONGO_URI)
# All collections below are created in the "premier_league" database.
db = client["premier_league"]

# Coach documents carry: team (key), name, age, country, experience_years, titles.
# The CSV header is overwritten positionally, so the file's column order must
# line up with this list.
coach_fields = ["team", "name", "age", "country", "experience_years", "titles"]
df_coaches.columns = coach_fields

coach = db["coach"]  # MySQL == coach + team_coach
coach_documents = df_coaches.to_dict(orient="records")
coach_result = coach.insert_many(coach_documents)
print(f"Added {len(coach_result.inserted_ids)} coaches!")

# Player documents carry: team (key), name, country, strong_foot, matches, minutes.
player_field_names = {
    "Team": "team",
    "Player": "name",
    "Country": "country",
    "Matches": "matches",
    "Minutes": "minutes",
}
df_players = df_players.rename(columns=player_field_names)

# strong_foot is not read from the CSV: one random boolean is drawn per row.
df_players["strong_foot"] = [
    random.choice([True, False]) for _row_label in df_players.index
]

# Keep only the document fields, in the order they should be stored.
columns = ["team", "name", "country", "strong_foot", "matches", "minutes"]
df_players = df_players[columns]

player = db["player"]  # MySQL == player + team_player
player_documents = df_players.to_dict(orient="records")
player_result = player.insert_many(player_documents)
print(f"Added {len(player_result.inserted_ids)} players!")

def _team_location(field):
    """Return a lookup mapping a team name to teams_data[name][field].

    Names missing from teams_data (or entries without *field*) give "Unknown".
    """

    def lookup(team_name):
        return teams_data.get(team_name, {}).get(field, "Unknown")

    return lookup


# Team documents carry: name (key), position_league, country, city, region, matches.
df_teams = df_teams.rename(columns={"idx": "position_league", "played": "matches"})

# Every team gets the same constant country code.
df_teams["country"] = "ENG"

# City and region are looked up by team name in teams_data.
df_teams["city"] = df_teams["name"].map(_team_location("city"))
df_teams["region"] = df_teams["name"].map(_team_location("region"))

# Keep only the document fields, in the order they should be stored.
columns = ["name", "position_league", "country", "city", "region", "matches"]
df_teams = df_teams[columns]

team = db["team"]
team_documents = df_teams.to_dict(orient="records")
team_result = team.insert_many(team_documents)
print(f"Added {len(team_result.inserted_ids)} teams!")

### MongoDB - Stats

In [None]:
# Team-stats documents carry: team (key), cross_success_pct,
# accurate_crosses_per_90, corners_taken, team_rating, touches_in_opp_box,
# possession_pct.
# NOTE(review): "accurate_crosses_per_90" is filled from the CSV column
# "Accurate Crosses per Match" - confirm the field name is intended.
team_stat_field_names = {
    "Team": "team",
    "Cross Success (%)": "cross_success_pct",
    "Accurate Crosses per Match": "accurate_crosses_per_90",
    "Corners Taken": "corners_taken",
    "FotMob Team Rating": "team_rating",
    "Touches in Opposition Box": "touches_in_opp_box",
    "Possession (%)": "possession_pct",
}
df_team_stats = df_team_stats.rename(columns=team_stat_field_names)

team_stats = db["team_stats"]  # MySQL == team_stats
team_stats_documents = df_team_stats.to_dict(orient="records")
team_stats_result = team_stats.insert_many(team_stats_documents)
print(f"Added {len(team_stats_result.inserted_ids)} teams' stats!")

# Player-stats documents carry, keyed by player:
#   rating:      player_rating, player_of_match
#   scoring:     goals, expected_goals, shot_conversion_rate, penalties_scored
#   creation:    assists, expected_assists, secondary_assists,
#                big_chances_created, big_chances_missed
#   passing:     pass_success_pct, accurate_passes_90, accurate_long_balls_90,
#                successful_long_balls_pct
#   defending:   clean_sheets, goals_conceded, total_clearances,
#                total_interceptions
#   dribbling:   successful_dribbles_90, dribble_success_rate
#   discipline:  yellow_cards, red_cards, fouls_per_90
player_stat_field_names = {
    "Player": "player",
    "FotMob Rating": "player_rating",
    "Player of the Match Awards": "player_of_match",
    "Goals": "goals",
    "Penalties": "penalties_scored",
    "Expected Goals (xG)": "expected_goals",
    "Big Chances Missed": "big_chances_missed",
    "Shot Conversion Rate (%)": "shot_conversion_rate",
    "Assists": "assists",
    "Secondary Assists": "secondary_assists",
    "Expected Assists (xA)": "expected_assists",
    "Big Chances Created": "big_chances_created",
    "Accurate Passes per 90": "accurate_passes_90",
    "Pass Success (%)": "pass_success_pct",
    "Accurate Long Balls per 90": "accurate_long_balls_90",
    "Successful Long Balls (%)": "successful_long_balls_pct",
    "Clean Sheets": "clean_sheets",
    "Goals Conceded": "goals_conceded",
    "Total Clearances": "total_clearances",
    "Total Interceptions": "total_interceptions",
    "Successful Dribbles per 90": "successful_dribbles_90",
    "Dribble Success Rate (%)": "dribble_success_rate",
    "Yellow Cards": "yellow_cards",
    "Red Cards": "red_cards",
    "Fouls Committed per 90": "fouls_per_90",
}
df_player_stats = df_player_stats.rename(columns=player_stat_field_names)

player_stats = db["player_stats"]  # MySQL == player_stats
player_stats_documents = df_player_stats.to_dict(orient="records")
player_stats_result = player_stats.insert_many(player_stats_documents)
print(f"Added {len(player_stats_result.inserted_ids)} players' stats!")

### MongoDB - Simple Queries

### MongoDB - Complex Queries

### MySQL - Core

### MySQL - Stats

### MySQL - Simple Queries

### MySQL - Complex Queries