In [3]:
import pandas as pd 
import re
from collections import Counter

# **LA LIGA**


In [None]:
file = pd.read_csv("Datasets/el_clasico_matches_la_liga.csv")
df = pd.DataFrame(file)

In [135]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1,17 February 1929,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)"
1,2,9 May 1929,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83)
2,3,26 January 1930,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)"
3,4,30 March 1930,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84)
4,5,1 February 1931,9,Real Madrid,Barcelona,0–0,,


### **Change to Date Format**


In [136]:
df["Date"] = pd.to_datetime(df["Date"])

In [137]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)"
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83)
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)"
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84)
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,


In [138]:
df = df.dropna(subset=["Score (FT/HT)"])

In [139]:
df.shape

(189, 8)

In [140]:
df.tail()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
184,185,2022-10-16,9,Real Madrid,Barcelona,3–1 (2–0),"Benzema (12), Valverde (35), Rodrygo (90+1 p.)",Torres (83)
185,186,2023-03-19,26,Barcelona,Real Madrid,2–1 (1–1),"Roberto (45), Kessié (90+1)",Araújo (9 o.g.)
186,187,2023-10-28,11,Barcelona,Real Madrid,1–2 (1–0),Gündoğan (6),"Bellingham (68, 90+2)"
187,188,2024-04-21,32,Real Madrid,Barcelona,3–2 (1–1),"Vinícius (18 p.), Vázquez (73), Bellingham (90+1)","Christensen (6), López (69)"
188,189,2024-10-26,11,Real Madrid,Barcelona,0–4 (0–0),,"Lewandowski (54, 56), Yamal (77), Raphinha (84)"


### **Goals**


In [141]:
df["Home Goals"] = df["Score (FT/HT)"].str.extract(r"(\d+)–(\d+)")[0].astype(int)
df["Away Goals"] = df["Score (FT/HT)"].str.extract(r"(\d+)–(\d+)")[1].astype(int)

In [143]:
df["Barcelona Goals"] = df.apply(lambda row: row["Home Goals"] if row["Home team"] == "Barcelona" else row["Away Goals"], axis=1)
df["Real Madrid Goals"] = df.apply(lambda row: row["Away Goals"] if row["Home team"] == "Barcelona" else row["Home Goals"], axis=1)

### **Winner Column**


In [145]:
def determine_winner(row):
    if row["Barcelona Goals"] > row["Real Madrid Goals"]:
        return "Barcelona"
    elif row["Barcelona Goals"] < row["Real Madrid Goals"]:
        return "Real Madrid"
    else:
        return "Draw"

df["Winner"] = df.apply(determine_winner, axis=1)

### **Venue Mapping**


In [146]:
venue_map = {
    "Barcelona": "Camp Nou",
    "Real Madrid": "Santiago Bernabéu Stadium"
}
df["Venue"] = df["Home team"].map(venue_map)

In [147]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Home Goals,Away Goals,Barcelona Goals,Real Madrid Goals,Winner,Venue
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)",1,2,1,2,Real Madrid,Camp Nou
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83),0,1,1,0,Barcelona,Santiago Bernabéu Stadium
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)",1,4,1,4,Real Madrid,Camp Nou
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84),5,1,1,5,Real Madrid,Santiago Bernabéu Stadium
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,,0,0,0,0,Draw,Santiago Bernabéu Stadium


### **Scorers**


In [148]:
df["Barcelona Scorers"] = df.apply(lambda row: row["Goals (home)"] if row["Home team"] == "Barcelona" else row["Goals (away)"], axis=1)
df["Real Madrid Scorers"] = df.apply(lambda row: row["Goals (away)"] if row["Home team"] == "Barcelona" else row["Goals (home)"], axis=1)

In [188]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Home Goals,Away Goals,Barcelona Goals,Real Madrid Goals,Winner,Venue,Barcelona Scorers,Real Madrid Scorers,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)",1,2,1,2,Real Madrid,Camp Nou,Parera (70),"Morera (10), Morera (55)",,,Parera,"Morera, Morera"
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83),0,1,1,0,Barcelona,Santiago Bernabéu Stadium,Sastre (83),,,,Sastre,
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)",1,4,1,4,Real Madrid,Camp Nou,Bestit (63),"Rubio (10), Rubio (37), López (17), Lazcano (71)",,,Bestit,"Rubio, Rubio, López, Lazcano"
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84),5,1,1,5,Real Madrid,Santiago Bernabéu Stadium,Goiburu (84),"Rubio (5), Rubio (23), Lazcano (42), Lazcano (...",,,Goiburu,"Rubio, Rubio, Lazcano, Lazcano, Lazcano"
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,,0,0,0,0,Draw,Santiago Bernabéu Stadium,,,,,,


In [None]:
def extract_own_goals(row, team_col, scorers_col):
        own_goals = []
        if pd.notna(row[scorers_col]):
            scorers = str(row[scorers_col]).split(', ')
            own_goals = [scorer.split(' ')[0] for scorer in scorers if 'o.g.' in scorer]
        return ', '.join(own_goals) if own_goals else None

def split_multiple_goals(text):
    """
    Splits multiple goals by same player into separate entries.
    Example: "Lewandowski (54, 56)" -> ["Lewandowski (54)", "Lewandowski (56)"]
    """
    if pd.isna(text):
        return None
    
    results = []
    # Find patterns like "Player (min1, min2)" or "Player (min1, min2+extra)"
    matches = re.finditer(r'(\w+)\s*\(((?:\d+(?:\+\d+)?(?:,\s*)?)+)\)', text)
    
    for match in matches:
        player = match.group(1)
        minutes = match.group(2).split(',')
        for minute in minutes:
            minute = minute.strip()
            if minute:  # Skip empty strings
                results.append(f"{player} ({minute})")
    
    return results

def remove_minutes(text):
    """
    Removes minutes from player names.
    Example: "Christensen (6)" -> "Christensen"
    """
    if pd.isna(text):
        return None
    
    # Replace patterns like "(number)" or "(number+number)"
    return re.sub(r'\s*\(\d+(?:\+\d+)?\s*(?:o\.g\.)?\)', '', text)

# Example usage with a pandas DataFrame:
def transform_match_data(df):
    """
    Applies all transformations to the DataFrame
    """
    # Create new columns for own goals
    df['Real_Madrid_Own_Goals'] = df.apply(lambda x: extract_own_goals(x, 'Home team', 'Barcelona Scorers'), axis=1) 
    df['Barcelona_Own_Goals'] = df.apply(lambda x: extract_own_goals(x, 'Away team', 'Real Madrid Scorers'), axis=1)
    
    # Split multiple goals
    df['Barcelona Scorers'] = df['Barcelona Scorers'].apply(lambda x: ', '.join(split_multiple_goals(x)) if not pd.isna(x) else None)
    df['Real Madrid Scorers'] = df['Real Madrid Scorers'].apply(lambda x: ', '.join(split_multiple_goals(x)) if not pd.isna(x) else None)
    
    # Remove minutes
    df['Barcelona Scorers Clean'] = df['Barcelona Scorers'].apply(remove_minutes)
    df['Real Madrid Scorers Clean'] = df['Real Madrid Scorers'].apply(remove_minutes)
    
    return df

In [189]:
df = transform_match_data(df)

### **New Dataframe**


In [192]:
columns={
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Date", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns)

In [193]:
new_df.head()

Unnamed: 0,Date,Venue,Barcelona Goals,Real Madrid Goals,Winner,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1929-02-17,Camp Nou,1,2,Real Madrid,,,Parera,"Morera, Morera"
1,1929-05-09,Santiago Bernabéu Stadium,1,0,Barcelona,,,Sastre,
2,1930-01-26,Camp Nou,1,4,Real Madrid,,,Bestit,"Rubio, Rubio, López, Lazcano"
3,1930-03-30,Santiago Bernabéu Stadium,1,5,Real Madrid,,,Goiburu,"Rubio, Rubio, Lazcano, Lazcano, Lazcano"
4,1931-02-01,Santiago Bernabéu Stadium,0,0,Draw,,,,


In [195]:
new_df.to_csv("Datasets/Cleaned_ds/la_liga.csv", index=False)

# **UEFA**


In [4]:
file = pd.read_csv("Datasets/el_clasico_matches_uefa.csv")
df = pd.DataFrame(file)

In [5]:
df.head()

Unnamed: 0,Season,Round,Round.1,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1959–60,Semi-finals,First leg,Real Madrid,Barcelona,3–1 (2–1),"Di Stéfano (17, 84), Puskás (28)",Martínez (37)
1,1959–60,Semi-finals,Second leg,Barcelona,Real Madrid,1–3 (1–1),Kocsis (89),"Puskás (25, 75), Gento (68)"
2,1960–61,First round,First leg,Real Madrid,Barcelona,2–2 (2–1),"Mateos (3), Gento (33)","Luis Suárez (27, 87 p.)"
3,1960–61,First round,Second leg,Barcelona,Real Madrid,2–1 (1–0),"Vergés (33), Evaristo (82)",Canário (87)
4,2001–02,Semi-finals,First leg,Barcelona,Real Madrid,0–2 (0–0),,"Zidane (55), McManaman (90+2)"
