# SIPSA Project
##### Developed by Sébastien Lozano-Forero






In [33]:
import sys
from pathlib import Path

# Add the 'utils' directory to the system path so we can import from it.
# Path() is the current working directory, so this resolves to <parent of cwd>/utils
# and therefore depends on where the kernel was started.
# NOTE(review): presumably the notebook sits in a folder next to 'utils' — confirm.
sys.path.append(str(Path().resolve().parent / 'utils'))


import pandas as pd
from aws_utils import DataPipelineManager

## Exploratory Data Analysis

In [16]:
# Create the pipeline manager used for the database queries in the cells below.
# NOTE(review): per the commented reference copy later in this notebook, the constructor
# reads credentials from environment variables and builds the S3 / RDS handles —
# confirm against the imported aws_utils implementation.
run = DataPipelineManager()

In [63]:
# Select every column of `product_prices`, restricted to the 'corabastos' market.
query = """
    SELECT * FROM product_prices
    WHERE mercado = 'corabastos'
"""

# Run the query through the pipeline manager; later cells treat `df` as a pandas DataFrame.
# NOTE(review): the return type is taken from the reference copy of DataPipelineManager
# further down in this notebook — confirm against aws_utils.
df = run.queries_on_rds(query = query)

In [65]:
# Spot-check 10 distinct (week number, year) pairs present in the data.
# A fixed random_state keeps the displayed sample identical across re-runs
# (Restart Kernel -> Run All), which an unseeded .sample() does not guarantee.
df[['semana_no','anho']].drop_duplicates().sample(10, random_state=42)

Unnamed: 0,producto,ciudad,precio_minimo,precio_maximo,precio_medio,tendencia,categoria,mercado,semana_no,anho


In [43]:
# Approximate each record's date as January 1st of its year plus (week - 1) * 7 days.
# Week numbers above 52 therefore spill into the following calendar year
# (e.g. week 54 of 2024 lands on 2025-01-06 in the output below).
year_start = pd.to_datetime(df['anho'].astype(str) + '-1-1')
week_offset = pd.to_timedelta((df['semana_no'] - 1) * 7, unit='D')
df['date'] = year_start + week_offset

# Order the records chronologically using the derived date
df_sorted = df.sort_values(by='date')

# Display the chronologically ordered frame
df_sorted


Unnamed: 0,producto,ciudad,precio_minimo,precio_maximo,precio_medio,tendencia,categoria,mercado,semana_no,anho,date
22189,acelga,bogota,300,333,303,--,verduras_hortalizas,corabastos,1,2012,2012-01-01
23085,papa_unica,bogota,413,500,463,-,tuberculos_raices_platanos,corabastos,1,2012,2012-01-01
23084,papa_suprema,bogota,520,700,587,--,tuberculos_raices_platanos,corabastos,1,2012,2012-01-01
23083,papa_parda_pastusa,bogota,813,910,876,+,tuberculos_raices_platanos,corabastos,1,2012,2012-01-01
23082,papa_criolla_sucia,bogota,700,867,779,--,tuberculos_raices_platanos,corabastos,1,2012,2012-01-01
...,...,...,...,...,...,...,...,...,...,...,...
14972,azucar_morena,bogota,4320,4500,4400,-,productos_procesados,corabastos,54,2024,2025-01-06
14971,avena_molida,bogota,9203,9877,9407,-,productos_procesados,corabastos,54,2024,2025-01-06
14970,avena_en_hojuelas,bogota,9203,9877,9407,-,productos_procesados,corabastos,54,2024,2025-01-06
14981,gelatina,bogota,37405,38571,38024,-,productos_procesados,corabastos,54,2024,2025-01-06


In [62]:
# Distinct week numbers recorded for 2024, in ascending order
# (handy for spotting missing or out-of-range weeks).
here = sorted(df.loc[df['anho'] == 2024, 'semana_no'].unique())
here

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 35,
 36,
 37,
 38,
 39,
 40,
 53,
 54]

## Util functions

### DataPipelineManager

In [1]:
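# NOTE(review): everything in this cell is commented out. It appears to be a reference copy
# of DataPipelineManager, which this notebook actually imports from `aws_utils` in the first
# cell — confirm it still matches that module, or replace it with a link to the source file.
# import boto3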
# import os
# import pandas as pd
# from dotenv import load_dotenv
# from io import BytesIO
# from sqlalchemy import create_engine
# from botocore.exceptions import ClientError

# class DataPipelineManager:
#     def __init__(self):
#         """
#         Initializes the DataPipelineManager class, loading environment variables and setting up S3 and RDS connections.
#         """
#         load_dotenv()

#         # Initialize AWS credentials
#         self.aws_access_key_id = os.environ['aws_access_key_id']
#         self.aws_secret_access_key = os.environ['aws_secret_access_key']
#         self.bucket_name = os.environ['BUCKET_NAME']
        
#         # Initialize S3 resource
#         self.s3 = boto3.resource('s3',
#                                 aws_access_key_id=self.aws_access_key_id, 
#                                 aws_secret_access_key=self.aws_secret_access_key)

#         # Initialize RDS database credentials
#         self.db_user = os.environ['db_user']
#         self.db_pass = os.environ['db_pass']
#         self.db_host = os.environ['db_host']
#         self.db_port = os.environ['db_port']
#         self.db_name = os.environ['db_name']
#         self.engine = create_engine(f'postgresql://{self.db_user}:{self.db_pass}@{self.db_host}:{self.db_port}/{self.db_name}')
    
#     def queries_on_rds(self, query: str) -> pd.DataFrame:
#         """
#         Executes a SQL query on the RDS PostgreSQL database and returns the result as a pandas DataFrame.

#         Args:
#             query (str): The SQL query to execute.

#         Returns:
#             pd.DataFrame: The result of the SQL query.
#         """
#         with self.engine.begin() as conn:
#             dataframe = pd.read_sql(sql=query, con=conn)
#         return dataframe

#     def get_files_tracker_from_s3(self, file_name: str = 'files_tracker.csv') -> pd.DataFrame:
#         """
#         Retrieves the 'files_tracker.csv' from the S3 bucket and returns its content as a pandas DataFrame.

#         Args:
#             file_name (str): The name of the file to retrieve from S3.

#         Returns:
#             pd.DataFrame: The content of the file as a pandas DataFrame.
#         """
#         try:
#             obj = self.s3.Object(self.bucket_name, file_name)
#             csv_content = obj.get()['Body'].read()
#             df = pd.read_csv(BytesIO(csv_content))
#             return df
#         except ClientError as e:
#             print(f"Error retrieving file from S3: {e}")
#             return pd.DataFrame()

#     def log_files_manager(self, log_file: str = None, log_prefix: str = 'logs/') -> list:
#         """
#         Lists or retrieves log files from the S3 bucket. If a log file is provided, its content is loaded into a pandas DataFrame.

#         Args:
#             log_file (str, optional): The specific log file to retrieve. Defaults to None, in which case all log files are listed.
#             log_prefix (str, optional): The prefix where log files are stored. Defaults to 'logs/'.

#         Returns:
#             list: A list of log file names if log_file is None.
#             pd.DataFrame: The content of the log file as a pandas DataFrame if log_file is provided.
#         """
#         if log_file is None:
#             try:
#                 bucket = self.s3.Bucket(self.bucket_name)
#                 log_files = [obj.key for obj in bucket.objects.filter(Prefix=log_prefix) if obj.key.endswith('.log')]
#                 return log_files
#             except ClientError as e:
#                 print(f"Error listing log files from S3: {e}")
#                 return []
#         else:
#             try:
#                 obj = self.s3.Object(self.bucket_name, log_file)
#                 log_content = obj.get()['Body'].read().decode('utf-8')
#                 log_lines = log_content.splitlines()
#                 log_data = [line.split(' - ', maxsplit=2) for line in log_lines if len(line.split(' - ', maxsplit=2)) == 3]
#                 df = pd.DataFrame(log_data, columns=['timestamp', 'level', 'message'])
#                 return df
#             except ClientError as e:
#                 print(f"Error reading log file {log_file} from S3: {e}")
#                 return pd.DataFrame()

In [6]:
# Example usage of DataPipelineManager's S3 helpers.
# NOTE(review): this builds a second manager instance in addition to `run`, which was
# created earlier in the notebook; the cells below could presumably reuse `run` — confirm.
manager = DataPipelineManager()

In [9]:
# Fetch the files tracker with the method's default arguments and display it.
# The output below shows one row per file with columns file, link, date_added and rds_load.
manager.get_files_tracker_from_s3()

Unnamed: 0,file,link,date_added,rds_load
0,week_39_anex-SIPSASemanal-21sep-27sep-2024.xlsx,https://www.dane.gov.co/files/operaciones/SIPS...,2024-10-02,yes
1,week_38_anex-SIPSASemanal-14sep-20sep-2024.xlsx,https://www.dane.gov.co/files/operaciones/SIPS...,2024-10-02,yes
2,week_37_anex-SIPSASemanal-7sep-13sep-2024.xlsx,https://www.dane.gov.co/files/operaciones/SIPS...,2024-10-02,yes
3,week_36_anex-SIPSASemanal-31ago-6sep-2024.xlsx,https://www.dane.gov.co/files/operaciones/SIPS...,2024-10-02,yes
4,week_35_anex-SIPSASemanal-24ago-30ago-2024.xlsx,https://www.dane.gov.co/files/operaciones/SIPS...,2024-10-02,yes
...,...,...,...,...
614,week_4_Anexo_Bol_Semanal_SIPSA_Diciembre_1_201...,https://www.dane.gov.co/files/investigaciones/...,2024-10-02,yes
615,week_3_Anexo_Bol_Semanal_SIPSA_Noviembre_24_20...,https://www.dane.gov.co/files/investigaciones/...,2024-10-02,yes
616,week_2_Anexo_Bol_Semanal_SIPSA_Noviembre_23_20...,https://www.dane.gov.co/files/investigaciones/...,2024-10-02,yes
617,week_1_Anexo_Bol_Semanal_SIPSA_Noviembre_16_20...,https://www.dane.gov.co/files/investigaciones/...,2024-10-02,yes


In [11]:
# Called with no arguments, this returns the list of available log file keys
# (see the output below); the commented reference copy earlier in this notebook shows
# that passing `log_file` loads that file's contents instead — confirm against aws_utils.
manager.log_files_manager()

['logs/sipsa_process_10_02_2024.log',
 'logs/sipsa_process_10_04_2024.log',
 'logs/sipsa_process_10_05_2024.log',
 'logs/sipsa_process_10_06_2024.log']