In [8]:
import pandas as pd
import numpy as np
import dateparser
import re
from datetime import datetime, timedelta
import os
import math

In [9]:
def findBetween(datestr, start, end):
    """Return the text sitting between ``start`` and ``end`` inside ``datestr``.

    Both delimiters are matched literally. The capture is non-greedy, so it
    stops at the first ``end`` that follows ``start``; only the first such
    occurrence in the string is returned.

    Raises:
        IndexError: if ``start ... end`` does not occur in ``datestr``.
    """
    delimited = re.compile(f"{re.escape(start)}(.*?){re.escape(end)}")
    captured = delimited.findall(datestr)
    return captured[0]
def extractDates(datestr):
    """Parse a 'Semana del ... al ...' header into its start and end dates.

    Two layouts are handled:
      * 'Semana del 09 al 13 de marzo de 2015' - the start is only a day number,
        so it reuses the month and year of the end date.
      * 'Semana del 27 de junio al 1 de julio 2022' - the start carries its own
        month; if that places it after the end date it is moved back one year.

    Returns:
        A two-item list ``[first_date, second_date]``. If anything goes wrong
        the error is printed together with ``datestr`` and ``None`` is returned.
    """
    try:
        # The text following 'al ' holds the complete end date
        week_end = dateparser.parse(datestr.split('al ')[1])
        # The start is either a bare day number or 'day de month'
        start_text = findBetween(datestr, 'Semana del ', ' al')

        if start_text.isnumeric():
            week_start = week_end.replace(day=int(start_text))
        else:
            day_text, month_text = start_text.split(' de ')
            # Let dateparser translate the month name into its number
            parsed_start = dateparser.parse(f"{day_text} de {month_text} de {week_end.year}")
            week_start = week_end.replace(day=int(day_text), month=parsed_start.month)
            # A start later than the end means the week began in the previous year
            if week_start > week_end:
                week_start = week_start.replace(year=week_start.year - 1)

        return [week_start, week_end]
    except Exception as e:
        print('Extract Dates: ', datestr, e)
        return None
def formatDateStr(datestr):
    """Convert the digit run taken from a file name into a ``datetime``.

    File names carry the date either as YYYYMMDD or as DDMMYYYY. The formats
    are tried in that order with strict ``strptime`` parsing, so a string is
    only accepted when it fits one of the two layouts.

    Args:
        datestr: the digits extracted from the file name, e.g. '20220722'.

    Returns:
        A naive ``datetime`` at midnight, or ``None`` when ``datestr`` matches
        neither layout or is not a string.
    """
    for date_format in ('%Y%m%d', '%d%m%Y'):
        try:
            return datetime.strptime(datestr, date_format)
        except ValueError:
            # Not this layout; try the next one
            continue
        except TypeError as e:
            # Non-string input: report it and give up
            print(e, datestr)
            return None
    return None

In [10]:
# Collect every Excel file whose name carries a date later than the cut-off
path_data = './data/'
excel_files = os.listdir(path_data)
selected_excels = []
target_date = datetime(2018,9,21)

# The date is the run of digits right before the '.xlsx' extension
pattern = r'(\d+)\.xlsx'
for excel_file in excel_files:
    match = re.search(pattern, excel_file)
    if not match:
        continue
    parsed_date = formatDateStr(match.group(1))
    # formatDateStr returns None when the digits are not a readable date;
    # comparing None with a datetime raises TypeError, so skip those files
    if parsed_date is not None and parsed_date > target_date:
        selected_excels.append({'date': parsed_date, 'file': excel_file})

# Sort list by date, oldest first
sorted_data = sorted(selected_excels, key=lambda x: x['date'])

In [11]:
# Position of each weekday column relative to the first day of the week
_WEEKDAY_OFFSETS = {
    'Lunes': 0,
    'Martes': 1,
    'Miércoles': 2,
    'Jueves': 3,
    'Viernes': 4,
}


def _tidyProductSheet(sheet, start_date, end_date, product):
    """Turn one raw product sheet into a long table of daily price and volume.

    Args:
        sheet: the sheet as returned by ``pd.read_excel``.
        start_date: first day of the reported week.
        end_date: last day of the reported week.
        product: sheet name, stored in the 'Producto' column.

    Returns:
        DataFrame with columns Variedad, Mercado, Unidad, Dia, Precio, Fecha,
        Volumen and Producto.
    """
    # Keep only the rows with a value in the second column; the first row
    # that remains is used as the column header
    table = sheet[sheet.iloc[:, 1].notna()].copy()
    table.columns = table.iloc[0].to_numpy()
    table = table.rename(columns={'Unidad de\ncomercialización ': 'Unidad'})
    table = table.reset_index(drop=True)

    # The header row is repeated for each section. Price and volume sections
    # are mirrored, so the middle header marks where the volume half begins.
    header_rows = table.index[table['Mercado'] == 'Mercado']
    split_index = int(header_rows[len(header_rows) // 2])
    price_df = table.iloc[:split_index]
    volume_df = table.iloc[split_index:]

    # The repeated header rows are not data: drop them so they do not end up
    # as rows whose price/volume are weekday names
    price_df = price_df[price_df['Mercado'] != 'Mercado'].copy()
    volume_df = volume_df[volume_df['Mercado'] != 'Mercado'].copy()

    # Short weeks may come without a Friday column; add it filled with zeros
    # so every week has the same set of days
    if (end_date - start_date) < timedelta(days=4) and 'Viernes' not in price_df.columns:
        price_df['Viernes'] = 0.0
        volume_df['Viernes'] = 0.0

    # Melt the weekday columns into a 'Dia' column plus one value column
    id_columns = ['Variedad', 'Mercado', 'Unidad']
    price_df = pd.melt(price_df, id_vars=id_columns, var_name='Dia', value_name='Precio')
    volume_df = pd.melt(volume_df, id_vars=id_columns, var_name='Dia', value_name='Volumen')

    # Calendar date of each weekday, counted from the start of the week.
    # An unexpected column name raises KeyError here.
    day_to_date = {
        day: start_date + pd.DateOffset(days=offset)
        for day, offset in _WEEKDAY_OFFSETS.items()
    }
    price_df['Fecha'] = [day_to_date[day] for day in price_df['Dia']]
    volume_df['Fecha'] = [day_to_date[day] for day in volume_df['Dia']]

    # Copy the volume section's unit labels onto the price rows (aligned by
    # row position) so that 'Unidad' can be used as a merge key
    # NOTE(review): presumably the two sections label units differently - confirm
    price_df['Unidad'] = volume_df['Unidad']

    merged_df = pd.merge(price_df, volume_df, on=['Variedad', 'Mercado', 'Dia', 'Fecha', 'Unidad'])
    merged_df['Producto'] = product
    return merged_df


def readAllFiles(files_paths, path_data='./data/'):
    """Read every listed workbook and stack all product sheets into one table.

    Args:
        files_paths: iterable of dicts with a 'file' key holding the workbook
            file name; they are processed in the given order.
        path_data: directory that contains the workbooks.

    Returns:
        One DataFrame with the rows of every product sheet of every workbook
        and a fresh 0..n-1 index. It is empty when there is nothing to read.
        Sheets whose week header cannot be parsed are reported and skipped.
    """
    frames = []

    for data in files_paths:
        # sheet_name=None loads every sheet at once as {sheet name: DataFrame}
        workbook = pd.read_excel(os.path.join(path_data, data['file']), sheet_name=None)
        sheet_names = list(workbook.keys())[2:] #Remove 'Portada...' and 'Presentacion'

        for product in sheet_names:
            sheet = workbook[product]
            # The week range is the text in the first column of the second row
            dates = extractDates(str(sheet.iloc[1, 0]))
            if dates is None:
                print('Skipping sheet without a readable week range:', data['file'], product)
                continue
            start_date, end_date = dates
            frames.append(_tidyProductSheet(sheet, start_date, end_date, product))

    if not frames:
        return pd.DataFrame()
    # Single concat at the end instead of growing the frame inside the loop
    return pd.concat(frames, ignore_index=True)

In [7]:
# Build the combined price/volume table from the selected workbooks
df = readAllFiles(sorted_data)

Boletin_Semanal_Precios_Mayoristas_20220722.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230113.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230120.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230127.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230310.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230317.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230331.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230421.xlsx Lechuga
Boletin_Semanal_Precios_Mayoristas_20230623.xlsx Pera
Boletin_Semanal_Precios_Mayoristas_20230630.xlsx Pera


In [25]:
# Replace the current index with a fresh 0..n-1 range
df = df.reset_index(drop=True)

In [23]:
# Row-wise product of price and volume
df['Total'] = df['Precio'] * df['Volumen']

In [35]:
# A question arises: why do some rows report a volume even though their price is 0?
df[(df['Precio'] == 0) & (df['Volumen'] > 0)]

TypeError: 'NoneType' object is not subscriptable