# **IMPORTS**


In [3]:
import pandas as pd 
import re
from collections import Counter

# **LA LIGA**


### **Import**


In [None]:
file = pd.read_csv("Datasets/el_clasico_matches_la_liga.csv")
df = pd.DataFrame(file)

In [135]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1,17 February 1929,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)"
1,2,9 May 1929,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83)
2,3,26 January 1930,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)"
3,4,30 March 1930,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84)
4,5,1 February 1931,9,Real Madrid,Barcelona,0–0,,


### **Change to Date Format**


In [136]:
df["Date"] = pd.to_datetime(df["Date"])

In [137]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)"
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83)
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)"
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84)
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,


In [138]:
df = df.dropna(subset=["Score (FT/HT)"])

In [139]:
df.shape

(189, 8)

In [140]:
df.tail()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
184,185,2022-10-16,9,Real Madrid,Barcelona,3–1 (2–0),"Benzema (12), Valverde (35), Rodrygo (90+1 p.)",Torres (83)
185,186,2023-03-19,26,Barcelona,Real Madrid,2–1 (1–1),"Roberto (45), Kessié (90+1)",Araújo (9 o.g.)
186,187,2023-10-28,11,Barcelona,Real Madrid,1–2 (1–0),Gündoğan (6),"Bellingham (68, 90+2)"
187,188,2024-04-21,32,Real Madrid,Barcelona,3–2 (1–1),"Vinícius (18 p.), Vázquez (73), Bellingham (90+1)","Christensen (6), López (69)"
188,189,2024-10-26,11,Real Madrid,Barcelona,0–4 (0–0),,"Lewandowski (54, 56), Yamal (77), Raphinha (84)"


### **Goals**


In [141]:
df["Home Goals"] = df["Score (FT/HT)"].str.extract(r"(\d+)–(\d+)")[0].astype(int)
df["Away Goals"] = df["Score (FT/HT)"].str.extract(r"(\d+)–(\d+)")[1].astype(int)

In [143]:
df["Barcelona Goals"] = df.apply(lambda row: row["Home Goals"] if row["Home team"] == "Barcelona" else row["Away Goals"], axis=1)
df["Real Madrid Goals"] = df.apply(lambda row: row["Away Goals"] if row["Home team"] == "Barcelona" else row["Home Goals"], axis=1)

### **Winner Column**


In [42]:
def determine_winner(row):
    if row["Barcelona Goals"] > row["Real Madrid Goals"]:
        return "Barcelona"
    elif row["Barcelona Goals"] < row["Real Madrid Goals"]:
        return "Real Madrid"
    else:
        return "Draw"



In [None]:
df["Winner"] = df.apply(determine_winner, axis=1)

### **Venue Mapping**


In [45]:
venue_map = {
    "Barcelona": "Camp Nou",
    "Real Madrid": "Santiago Bernabéu Stadium"
}


In [None]:
df["Venue"] = df["Home team"].map(venue_map)

In [147]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Home Goals,Away Goals,Barcelona Goals,Real Madrid Goals,Winner,Venue
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)",1,2,1,2,Real Madrid,Camp Nou
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83),0,1,1,0,Barcelona,Santiago Bernabéu Stadium
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)",1,4,1,4,Real Madrid,Camp Nou
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84),5,1,1,5,Real Madrid,Santiago Bernabéu Stadium
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,,0,0,0,0,Draw,Santiago Bernabéu Stadium


### **Scorers**


In [20]:
def scorers(df):
    df["Barcelona Scorers"] = df.apply(lambda row: row["Goals (home)"] if row["Home team"] == "Barcelona" else row["Goals (away)"], axis=1)
    df["Real Madrid Scorers"] = df.apply(lambda row: row["Goals (away)"] if row["Home team"] == "Barcelona" else row["Goals (home)"], axis=1)
    return df

In [None]:
df = scorers(df)

In [188]:
df.head()

Unnamed: 0,No.,Date,Matchweek,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Home Goals,Away Goals,Barcelona Goals,Real Madrid Goals,Winner,Venue,Barcelona Scorers,Real Madrid Scorers,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1,1929-02-17,2,Barcelona,Real Madrid,1–2 (0–1),Parera (70),"Morera (10, 55)",1,2,1,2,Real Madrid,Camp Nou,Parera (70),"Morera (10), Morera (55)",,,Parera,"Morera, Morera"
1,2,1929-05-09,11,Real Madrid,Barcelona,0–1 (0–0),,Sastre (83),0,1,1,0,Barcelona,Santiago Bernabéu Stadium,Sastre (83),,,,Sastre,
2,3,1930-01-26,9,Barcelona,Real Madrid,1–4 (0–3),Bestit (63),"Rubio (10, 37), F. López (17), Lazcano (71)",1,4,1,4,Real Madrid,Camp Nou,Bestit (63),"Rubio (10), Rubio (37), López (17), Lazcano (71)",,,Bestit,"Rubio, Rubio, López, Lazcano"
3,4,1930-03-30,18,Real Madrid,Barcelona,5–1 (3–0),"Rubio (5, 23), Lazcano (42, 68, 72)",Goiburu (84),5,1,1,5,Real Madrid,Santiago Bernabéu Stadium,Goiburu (84),"Rubio (5), Rubio (23), Lazcano (42), Lazcano (...",,,Goiburu,"Rubio, Rubio, Lazcano, Lazcano, Lazcano"
4,5,1931-02-01,9,Real Madrid,Barcelona,0–0,,,0,0,0,0,Draw,Santiago Bernabéu Stadium,,,,,,


In [8]:
def extract_own_goals(row, team_col, scorers_col):
        own_goals = []
        if pd.notna(row[scorers_col]):
            scorers = str(row[scorers_col]).split(', ')
            own_goals = [scorer.split(' ')[0] for scorer in scorers if 'o.g.' in scorer]
        return ', '.join(own_goals) if own_goals else None

def split_multiple_goals(text):
    """
    Splits multiple goals by same player into separate entries.
    Example: "Lewandowski (54, 56)" -> ["Lewandowski (54)", "Lewandowski (56)"]
    """
    if pd.isna(text):
        return None
    
    results = []
    # Find patterns like "Player (min1, min2)" or "Player (min1, min2+extra)"
    matches = re.finditer(r'(\w+)\s*\(((?:\d+(?:\+\d+)?(?:,\s*)?)+)\)', text)
    
    for match in matches:
        player = match.group(1)
        minutes = match.group(2).split(',')
        for minute in minutes:
            minute = minute.strip()
            if minute:  # Skip empty strings
                results.append(f"{player} ({minute})")
    
    return results

def remove_minutes(text):
    """
    Removes minutes from player names.
    Example: "Christensen (6)" -> "Christensen"
    """
    if pd.isna(text):
        return None
    
    # Replace patterns like "(number)" or "(number+number)"
    return re.sub(r'\s*\(\d+(?:\+\d+)?\s*(?:o\.g\.)?\)', '', text)

# Example usage with a pandas DataFrame:
def transform_match_data(df):
    """
    Applies all transformations to the DataFrame
    """
    # Create new columns for own goals
    df['Real_Madrid_Own_Goals'] = df.apply(lambda x: extract_own_goals(x, 'Home team', 'Barcelona Scorers'), axis=1) 
    df['Barcelona_Own_Goals'] = df.apply(lambda x: extract_own_goals(x, 'Away team', 'Real Madrid Scorers'), axis=1)
    
    # Split multiple goals
    df['Barcelona Scorers'] = df['Barcelona Scorers'].apply(lambda x: ', '.join(split_multiple_goals(x)) if not pd.isna(x) else None)
    df['Real Madrid Scorers'] = df['Real Madrid Scorers'].apply(lambda x: ', '.join(split_multiple_goals(x)) if not pd.isna(x) else None)
    
    # Remove minutes
    df['Barcelona Scorers Clean'] = df['Barcelona Scorers'].apply(remove_minutes)
    df['Real Madrid Scorers Clean'] = df['Real Madrid Scorers'].apply(remove_minutes)
    
    return df

In [189]:
df = transform_match_data(df)

### **New Dataframe**


In [192]:
columns={
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Date", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns)

In [193]:
new_df.head()

Unnamed: 0,Date,Venue,Barcelona Goals,Real Madrid Goals,Winner,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1929-02-17,Camp Nou,1,2,Real Madrid,,,Parera,"Morera, Morera"
1,1929-05-09,Santiago Bernabéu Stadium,1,0,Barcelona,,,Sastre,
2,1930-01-26,Camp Nou,1,4,Real Madrid,,,Bestit,"Rubio, Rubio, López, Lazcano"
3,1930-03-30,Santiago Bernabéu Stadium,1,5,Real Madrid,,,Goiburu,"Rubio, Rubio, Lazcano, Lazcano, Lazcano"
4,1931-02-01,Santiago Bernabéu Stadium,0,0,Draw,,,,


In [195]:
new_df.to_csv("Datasets/Cleaned_ds/la_liga.csv", index=False)

In [None]:
df = pd.read_csv("Datasets/Cleaned_ds/la_liga.csv")
df['League'] = 'La Liga'
df.to_csv("Datasets/Cleaned_ds/la_liga.csv", index=False)

# **UEFA**


#### **Import**


In [21]:
file = pd.read_csv("Datasets/el_clasico_matches_uefa.csv")
df = pd.DataFrame(file)

In [25]:
df.head()

Unnamed: 0,Season,Round,Round.1,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Barcelona Scorers,Real Madrid Scorers,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1959–60,Semi-finals,First leg,Real Madrid,Barcelona,3–1 (2–1),"Di Stéfano (17, 84), Puskás (28)",Martínez (37),Martínez (37),"Stéfano (17), Stéfano (84), Puskás (28)",,,Martínez,"Stéfano, Stéfano, Puskás"
1,1959–60,Semi-finals,Second leg,Barcelona,Real Madrid,1–3 (1–1),Kocsis (89),"Puskás (25, 75), Gento (68)",Kocsis (89),"Puskás (25), Puskás (75), Gento (68)",,,Kocsis,"Puskás, Puskás, Gento"
2,1960–61,First round,First leg,Real Madrid,Barcelona,2–2 (2–1),"Mateos (3), Gento (33)","Luis Suárez (27, 87 p.)",,"Mateos (3), Gento (33)",,,,"Mateos, Gento"
3,1960–61,First round,Second leg,Barcelona,Real Madrid,2–1 (1–0),"Vergés (33), Evaristo (82)",Canário (87),"Vergés (33), Evaristo (82)",Canário (87),,,"Vergés, Evaristo",Canário
4,2001–02,Semi-finals,First leg,Barcelona,Real Madrid,0–2 (0–0),,"Zidane (55), McManaman (90+2)",,"Zidane (55), McManaman (90+2)",,,,"Zidane, McManaman"


#### **Scorers**


In [23]:
df = scorers(df)
df = transform_match_data(df)

In [29]:
df.head()

Unnamed: 0,Season,Round,Round.1,Home team,Away team,Score (FT/HT),Goals (home),Goals (away),Barcelona Scorers,Real Madrid Scorers,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona Scorers Clean,Real Madrid Scorers Clean
0,1959–60,Semi-finals,First leg,Real Madrid,Barcelona,3–1 (2–1),"Di Stéfano (17, 84), Puskás (28)",Martínez (37),Martínez (37),"Stéfano (17), Stéfano (84), Puskás (28)",,,Martínez,"Stéfano, Stéfano, Puskás"
1,1959–60,Semi-finals,Second leg,Barcelona,Real Madrid,1–3 (1–1),Kocsis (89),"Puskás (25, 75), Gento (68)",Kocsis (89),"Puskás (25), Puskás (75), Gento (68)",,,Kocsis,"Puskás, Puskás, Gento"
2,1960–61,First round,First leg,Real Madrid,Barcelona,2–2 (2–1),"Mateos (3), Gento (33)","Luis Suárez (27, 87 p.)",,"Mateos (3), Gento (33)",,,,"Mateos, Gento"
3,1960–61,First round,Second leg,Barcelona,Real Madrid,2–1 (1–0),"Vergés (33), Evaristo (82)",Canário (87),"Vergés (33), Evaristo (82)",Canário (87),,,"Vergés, Evaristo",Canário
4,2001–02,Semi-finals,First leg,Barcelona,Real Madrid,0–2 (0–0),,"Zidane (55), McManaman (90+2)",,"Zidane (55), McManaman (90+2)",,,,"Zidane, McManaman"


#### **Goals**


In [40]:
print(df.columns.tolist())

['Season', 'Round', 'Round.1', 'Home team', 'Away team', 'Score\xa0(FT/HT)', 'Goals (home)', 'Goals (away)', 'Barcelona Scorers', 'Real Madrid Scorers', 'Real_Madrid_Own_Goals', 'Barcelona_Own_Goals', 'Barcelona Scorers Clean', 'Real Madrid Scorers Clean', 'Home Goals', 'Away Goals', 'Barcelona Goals', 'Real Madrid Goals']


In [103]:
def goals(df):
    df["Home Goals"] = df["Score\xa0(FT/HT)"].str.extract(r"(\d+)–(\d+)")[0].astype(int)
    df["Away Goals"] = df["Score\xa0(FT/HT)"].str.extract(r"(\d+)–(\d+)")[1].astype(int)
    df["Barcelona Goals"] = df.apply(lambda row: row["Home Goals"] if row["Home team"] == "Barcelona" else row["Away Goals"], axis=1)
    df["Real Madrid Goals"] = df.apply(lambda row: row["Away Goals"] if row["Home team"] == "Barcelona" else row["Home Goals"], axis=1)
    return df

In [38]:
df = goals(df)

#### **Winner**


In [43]:
df["Winner"] = df.apply(determine_winner, axis=1)

#### **Venue Mapping**


In [None]:
df["Venue"] = df["Home team"].map(venue_map)

#### **New DF**


In [52]:
columns={
            "Season": "Date",
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Season", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns = columns)

In [53]:
new_df.head()

Unnamed: 0,Date,Venue,Barcelona_Goals,Real_Madrid_Goals,Winner,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona_Scorers,Real_Madrid_Scorers
0,1959–60,Santiago Bernabéu Stadium,1,3,Real Madrid,,,Martínez,"Stéfano, Stéfano, Puskás"
1,1959–60,Camp Nou,1,3,Real Madrid,,,Kocsis,"Puskás, Puskás, Gento"
2,1960–61,Santiago Bernabéu Stadium,2,2,Draw,,,,"Mateos, Gento"
3,1960–61,Camp Nou,2,1,Barcelona,,,"Vergés, Evaristo",Canário
4,2001–02,Camp Nou,0,2,Real Madrid,,,,"Zidane, McManaman"


In [54]:
new_df.to_csv("Datasets/Cleaned_ds/uefa.csv", index=False)

In [96]:
df = pd.read_csv("Datasets/Cleaned_ds/uefa.csv")
df['League'] = 'UEFA Championship'
df.to_csv("Datasets/Cleaned_ds/uefa.csv", index=False)

# **COPA DEL REY**


In [55]:
file = pd.read_csv("Datasets/el_clasico_matches_copa_del_rey.csv")
df = pd.DataFrame(file)

In [56]:
df.head()

Unnamed: 0,Season,Round,Round.1,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1916,Semi-finals,First leg,Barcelona,Real Madrid,2–1 (1–1),"Alcántara (39), Martínez (85)",Petit (17)
1,1916,Semi-finals,Second leg,Real Madrid,Barcelona,4–1 (2–1),"Bernabéu (35 p., 40, 60), Petit (80)",Martínez (20)
2,1916,Semi-finals,1st (R),Real Madrid,Barcelona,6–6 (a.e.t),"Belaunde (2, 55, 87), Bernabéu (23, 98, 118 p.)","Alcántara (15, 30, 102), Mallorquí (67), Bau (..."
3,1916,Semi-finals,2nd (R),Real Madrid,Barcelona,4–2 (a.e.t),"Bernabéu (25), Zabalo (85), Aranguren (100, 108)","Martínez (11, 38)"
4,1926,Quarter-finals,First leg,Real Madrid,Barcelona,1–5 (0–3),Monjardín (47),"Samitier (19, 26, 43, 64), Piera (79)"


In [57]:
df = scorers(df)
df = transform_match_data(df)

In [59]:
df = goals(df)

In [60]:
df["Winner"] = df.apply(determine_winner, axis=1)

In [61]:
df["Venue"] = df["Home team"].map(venue_map)

In [62]:
columns={
            "Season": "Date",
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Season", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns = columns)

In [64]:
new_df.head()

Unnamed: 0,Date,Venue,Barcelona_Goals,Real_Madrid_Goals,Winner,Real_Madrid_Own_Goals,Barcelona_Own_Goals,Barcelona_Scorers,Real_Madrid_Scorers
0,1916,Camp Nou,2,1,Barcelona,,,"Alcántara, Martínez",Petit
1,1916,Santiago Bernabéu Stadium,1,4,Real Madrid,,,Martínez,Petit
2,1916,Santiago Bernabéu Stadium,6,6,Draw,,,"Alcántara, Alcántara, Alcántara, Mallorquí, Ba...","Belaunde, Belaunde, Belaunde"
3,1916,Santiago Bernabéu Stadium,2,4,Real Madrid,,,"Martínez, Martínez","Bernabéu, Zabalo, Aranguren, Aranguren"
4,1926,Santiago Bernabéu Stadium,5,1,Barcelona,,,"Samitier, Samitier, Samitier, Samitier, Piera",Monjardín


In [65]:
new_df.to_csv("Datasets/Cleaned_ds/copa_del_rey.csv", index=False)

In [97]:
df = pd.read_csv("Datasets/Cleaned_ds/copa_del_rey.csv")
df['League'] = 'Copa Del Rey'
df.to_csv("Datasets/Cleaned_ds/copa_del_rey.csv", index=False)

# **COPA DE LA LIGA**


In [108]:
file = pd.read_csv("Datasets/el_clasico_matches_copa_de_la_liga.csv")
df = pd.DataFrame(file)

In [109]:
df.head()

Unnamed: 0,Season,Round,Round.1,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1982–83,Final,First leg,Real Madrid,Barcelona,2–2 (0–0),"Del Bosque (62), Juanito (68 p.)","Carrasco (53), Maradona (57)"
1,1982–83,Final,Second leg,Barcelona,Real Madrid,2–1 (2–0),"Maradona (19 p.), Alexanko (25)",Santillana (80)
2,1984–85,Quarter-finals,First leg,Barcelona,Real Madrid,2–2 (2–0),"Clos (40), Marcos (44)","Valdano (67), Juanito (75)"
3,1984–85,Quarter-finals,Second leg,Real Madrid,Barcelona,1–1 (0–0) (4–1 p.),Valdano (83),Moratalla (57)
4,1985–86,Second round,First leg,Barcelona,Real Madrid,2–2 (1–1),"Clos (24), Archibald (50)","Pardeza (36), Cholo (52)"


In [110]:
df = scorers(df)
df = transform_match_data(df)
df = goals(df)
df["Winner"] = df.apply(determine_winner, axis=1)
df["Venue"] = df["Home team"].map(venue_map)

In [111]:
columns={
            "Season": "Date",
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Season", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns = columns)

In [72]:
new_df.to_csv("Datasets/Cleaned_ds/copa_de_la_liga.csv", index=False)

In [114]:
new_df['League'] = 'Copa De La Liga'
new_df.to_csv("Datasets/Cleaned_ds/copa_de_la_liga.csv", index=False)

# **SUPERCOPA**


In [73]:
file = pd.read_csv("Datasets/el_clasico_matches_supercopa.csv")
df = pd.DataFrame(file)

In [74]:
df.head()

Unnamed: 0,Season,Round,Home team,Away team,Score (FT/HT),Goals (home),Goals (away)
0,1988–89,First leg,Real Madrid,Barcelona,2–0 (0–0),"Míchel (51), Hugo Sánchez (78)",
1,1988–89,Second leg,Barcelona,Real Madrid,2–1 (1–1),"Bakero (37, 78)",Butragueño (15)
2,1990–91,First leg,Barcelona,Real Madrid,0–1 (0–0),,Míchel (55)
3,1990–91,Second leg,Real Madrid,Barcelona,4–1 (2–1),"Butragueño (21, 44), Hugo Sánchez (56), Aragón...",Goikoetxea (20)
4,1993–94,First leg,Real Madrid,Barcelona,3–1 (1–1),"Alfonso (33, 85), Zamorano (55)",Stoichkov (15)


In [75]:
df = scorers(df)
df = transform_match_data(df)
df = goals(df)
df["Winner"] = df.apply(determine_winner, axis=1)
df["Venue"] = df["Home team"].map(venue_map)

In [77]:
columns={
            "Season": "Date",
            "Barcelona Goals": "Barcelona_Goals", 
            "Real Madrid Goals": "Real_Madrid_Goals",
            "Barcelona Scorers Clean" : "Barcelona_Scorers",
            "Real Madrid Scorers Clean": "Real_Madrid_Scorers"
         }
new_df = df.filter(items=["Season", "Venue", "Barcelona Goals", "Real Madrid Goals", "Winner", "Real_Madrid_Own_Goals", "Barcelona_Own_Goals","Barcelona Scorers Clean", "Real Madrid Scorers Clean"]).rename(columns = columns)

In [79]:
new_df.to_csv("Datasets/Cleaned_ds/supercopa.csv", index=False)

In [99]:
df = pd.read_csv("Datasets/Cleaned_ds/supercopa.csv")
df['League'] = 'Supercopa'
df.to_csv("Datasets/Cleaned_ds/supercopa.csv", index=False)

# **ALL GAMES**


In [None]:
import pandas as pd
import os

def combine_csv_files(directory_path, output_filename):
    # List to store individual dataframes
    dfs = []
    # Get all CSV files in the directory
    csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]
    # Read each CSV file and append to list
    for csv_file in csv_files:
        file_path = os.path.join(directory_path, csv_file)
        # Read CSV and add a column to identify source file
        df = pd.read_csv(file_path)
        df['source_file'] = csv_file.replace('.csv', '')
        dfs.append(df)
        print(f"Processed: {csv_file}")
    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)
    # Save combined dataframe
    combined_df.to_csv(output_filename, index=False)
    print(f"\nCombined {len(csv_files)} files into {output_filename}")
    print(f"Total rows: {len(combined_df)}")

if __name__ == "__main__":
    directory = "Datasets/Cleaned_ds"
    output_file = "Datasets/combined_soccer_data.csv"
    combine_csv_files(directory, output_file)

Processed: copa_del_rey.csv
Processed: copa_de_la_liga.csv
Processed: coronacion.csv
Processed: la_liga.csv
Processed: supercopa.csv
Processed: uefa.csv

Combined 6 files into Datasets/combined_soccer_data.csv
Total rows: 259
