## 1. Imports y logging

In [2]:
import sqlite3
import pandas as pd
import logging
import os

# Create the log directory if it does not exist yet.
os.makedirs("logs", exist_ok=True)

# Configure logging to append to logs/proyecto.log.
# The handler is built explicitly so the file is always written as UTF-8:
# the pipeline logs accented text and emoji, which the platform default
# encoding (e.g. cp1252 on Windows) cannot represent.
logging.basicConfig(
    handlers=[logging.FileHandler('logs/proyecto.log', encoding='utf-8')],
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s')

## 2. Extract

In [4]:
def extract_match_data(db_path="database.sqlite"):
    """Read every match, with both team names, from the SQLite database.

    Each row of the ``Match`` table is joined twice against ``Team`` (once for
    the home side, once for the away side). LEFT JOINs are used, so a match
    whose team id has no entry in ``Team`` is kept with a null team name.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``"database.sqlite"`` in the current working directory.

    Returns:
        A DataFrame with the columns ``match_api_id``, ``date``,
        ``home_team_goal``, ``away_team_goal``, ``home_team_name`` and
        ``away_team_name``. If anything fails, the error is logged and an
        empty DataFrame is returned instead of raising.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        logging.info("Conexión establecida con la base de datos.")

        query = """
        SELECT 
            m.match_api_id, 
            m.date, 
            m.home_team_goal, 
            m.away_team_goal,
            th.team_long_name AS home_team_name,
            ta.team_long_name AS away_team_name
        FROM Match m
        LEFT JOIN Team th ON m.home_team_api_id = th.team_api_id
        LEFT JOIN Team ta ON m.away_team_api_id = ta.team_api_id
        """

        df = pd.read_sql_query(query, conn)
        logging.info(f"Extracción exitosa. Registros obtenidos: {len (df)}")
        return df

    except Exception as e:
        logging.error(f"Error en la extracción de datos {e}")
        return pd.DataFrame()

    finally:
        # Release the connection on every path, including a failed query.
        if conn is not None:
            conn.close()

# Smoke-test the extraction step: run the query and print the first rows.
if __name__ == "__main__":
    df = extract_match_data ()
    print (df.head())


   match_api_id                 date  home_team_goal  away_team_goal  \
0        492473  2008-08-17 00:00:00               1               1   
1        492474  2008-08-16 00:00:00               0               0   
2        492475  2008-08-16 00:00:00               0               3   
3        492476  2008-08-17 00:00:00               5               0   
4        492477  2008-08-16 00:00:00               1               3   

      home_team_name     away_team_name  
0           KRC Genk       Beerschot AC  
1   SV Zulte-Waregem   Sporting Lokeren  
2  KSV Cercle Brugge     RSC Anderlecht  
3           KAA Gent          RAEC Mons  
4      FCV Dender EH  Standard de Liège  


## 3. Transformación

In [6]:
def transform_data(df):
    """Keep only complete Barcelona matches played in the 2008-2009 season.

    Steps, in order:
      1. Parse the ``date`` column into datetimes.
      2. Keep rows where either team name contains "Barcelona".
      3. Keep rows dated from 2008-07-01 to 2009-06-30 (inclusive).
      4. Drop rows that contain any null value.

    The caller's DataFrame is not modified; a new frame is returned.

    Args:
        df: DataFrame with at least the columns ``date``, ``home_team_name``
            and ``away_team_name``.

    Returns:
        The filtered DataFrame. If a step raises (for example a required
        column is missing), the error is logged and the frame as it stood
        before the failing step is returned.
    """
    try:
        # Build a new frame instead of assigning into the caller's object.
        df = df.assign(date=pd.to_datetime(df['date']))

        # Matches where Barcelona played either at home or away.
        involves_barcelona = (
            df['home_team_name'].str.contains("Barcelona", na=False)
            | df['away_team_name'].str.contains("Barcelona", na=False)
        )
        df = df[involves_barcelona]

        # 2008-2009 season only.
        df = df[(df['date'] >= '2008-07-01') & (df['date'] <= '2009-06-30')]

        # Discard incomplete records.
        df = df.dropna()

        # Quality check: an empty result is suspicious, so log it as a warning.
        if df.empty:
            logging.warning("⚠️ El dataframe resultante esta vacio luego de la validación.")
        else:
            logging.info(f"✅ Transformación completada. Registros finales: {len(df)}")

        return df

    except Exception as e:
        logging.error(f"Error durante la transformación: {e}")
        return df

In [7]:
# Run extraction followed by transformation and preview the filtered rows.
df = extract_match_data ()
df_transformado = transform_data(df)
print (df_transformado.head())

       match_api_id       date  home_team_goal  away_team_goal home_team_name  \
21520        530086 2008-08-31               1               0    CD Numancia   
21534        530240 2008-11-08               6               0   FC Barcelona   
21546        530252 2008-11-16               0               2  RC Recreativo   
21549        530255 2008-11-23               1               1   FC Barcelona   
21563        530329 2008-11-29               0               3     Sevilla FC   

        away_team_name  
21520     FC Barcelona  
21534  Real Valladolid  
21546     FC Barcelona  
21549        Getafe CF  
21563     FC Barcelona  


## 4. Load

In [9]:
def load_data(df, output_path="data/barcelona_2008_2009.csv"):
    """Write a DataFrame to a CSV file, creating its directory if needed.

    Args:
        df: DataFrame to save. The index is not written.
        output_path: Destination CSV path. Its parent directory is created
            when it does not exist. Defaults to
            ``"data/barcelona_2008_2009.csv"``.

    Returns:
        None. Success or failure is reported through the log and a printed
        message; errors are not re-raised.
    """
    try:
        # Create the directory that will actually hold the file, not a fixed
        # "data" folder, so custom output paths work too.
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Save the DataFrame as CSV.
        df.to_csv(output_path, index=False)
        logging.info(f"Archivo CSV guardado exitosamente en: {output_path}")
        print(f"✅ Archivo guardado en: {output_path}")

    except Exception as e:
        logging.error(f"Error al guardar archivo CSV: {e}")
        print("❌ Ocurrió un error al guardar el archivo.")

In [10]:
# Run the whole pipeline: extract from SQLite, filter, then write the CSV
# to load_data's default output path.
if __name__ == "__main__":
    df_raw = extract_match_data()
    df_clean = transform_data(df_raw)
    load_data(df_clean)

✅ Archivo guardado en: data/barcelona_2008_2009.csv


In [1]:
import pandas as pd

# Sanity check: read the exported CSV back and show its shape and first rows.
df = pd.read_csv("data/barcelona_2008_2009.csv")
print(df.shape)
print(df.head())

(38, 6)
   match_api_id        date  home_team_goal  away_team_goal home_team_name  \
0        530086  2008-08-31               1               0    CD Numancia   
1        530240  2008-11-08               6               0   FC Barcelona   
2        530252  2008-11-16               0               2  RC Recreativo   
3        530255  2008-11-23               1               1   FC Barcelona   
4        530329  2008-11-29               0               3     Sevilla FC   

    away_team_name  
0     FC Barcelona  
1  Real Valladolid  
2     FC Barcelona  
3        Getafe CF  
4     FC Barcelona  
