# Analyze Price Data

In [58]:
from datetime import datetime
import os
import pandas as pd
import numpy as np

# Get Dates

In [6]:
def get_dates(root_folder):
    """Collect the dates encoded in the CSV file names of a folder.

    File names are expected to look like ``<prefix>_<dd-mm-YYYY>.csv``: the
    date is the text between the first underscore and the following dot.

    Parameters
    ----------
    root_folder : str
        Directory whose entries are listed.

    Returns
    -------
    list of datetime
        The parsed dates in ascending order (empty if no ``.csv`` file exists).

    Raises
    ------
    IndexError
        If a ``.csv`` file name contains no underscore.
    ValueError
        If the extracted text is not a ``%d-%m-%Y`` date.
    """
    dates = []
    for file_name in os.listdir(root_folder):
        # Check the extension itself: a substring test would also accept
        # names such as "data_01-01-2001.csv.bak".
        if not file_name.endswith(".csv"):
            continue
        date_text = file_name.split("_")[1].split(".")[0]
        dates.append(datetime.strptime(date_text, '%d-%m-%Y'))
    return sorted(dates)

In [7]:
# Folder that is scanned for CSV files; get_dates parses a date out of each file name.
root_folder = "raw_data"
dates = get_dates(root_folder)

# Get Data

In [8]:
def get_data(dates, root_folder="raw_data"):
    """Load one CSV per date and stack the results into a single DataFrame.

    Parameters
    ----------
    dates : iterable of datetime
        Each date selects the file ``data_<dd-mm-YYYY>.csv``.
    root_folder : str, optional
        Directory containing the files. Defaults to ``"raw_data"``, the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in the order of ``dates``; each
        file's own row index is kept, so index values repeat across files.

    Raises
    ------
    ValueError
        If ``dates`` is empty (``pd.concat`` has nothing to concatenate).
    """
    frames = []
    for date in dates:
        file_name = "data_{0}.csv".format(date.strftime('%d-%m-%Y'))
        frames.append(pd.read_csv(os.path.join(root_folder, file_name)))
    return pd.concat(frames)

In [65]:
# Read the CSV for every collected date and stack them into one frame.
final_data = get_data(dates)

In [66]:
# Preview the first rows of the combined data.
final_data.head()

Unnamed: 0,Producto,Categoría,Anterior,Último,Diferencia,Medida,Tendencia,Fecha
0,Cereales,Trigo panificable,"0,00 / 0,00",14700,14700,Eur / Tm.,,02-01-2006
1,Cereales,Cebada,"0,00 / 0,00",14500,14500,Eur / Tm.,,02-01-2006
2,Cereales,Avena,"0,00 / 0,00",14500,14500,Eur / Tm.,,02-01-2006
3,Cereales,Centeno,"0,00 / 0,00",13900,13900,Eur / Tm.,,02-01-2006
4,Cereales,Maíz,"0,00 / 0,00",14200,14200,Eur / Tm.,,02-01-2006


In [67]:
# First rows of the "Cebada" category, still in raw text form.
final_data.loc[final_data["Categoría"] == "Cebada"].head()

Unnamed: 0,Producto,Categoría,Anterior,Último,Diferencia,Medida,Tendencia,Fecha
1,Cereales,Cebada,"0,00 / 0,00",14500,14500,Eur / Tm.,,02-01-2006
1,Cereales,Cebada,14500,14500,0,Eur / Tm.,,09-01-2006
1,Cereales,Cebada,14500,14500,0,Eur / Tm.,,16-01-2006
1,Cereales,Cebada,14500,14500,0,Eur / Tm.,,23-01-2006
1,Cereales,Cebada,14500,14400,-100,Eur / Tm.,,30-01-2006


In [68]:
# Last rows of the "Cebada" category, still in raw text form.
final_data.loc[final_data["Categoría"] == "Cebada"].tail()

Unnamed: 0,Producto,Categoría,Anterior,Último,Diferencia,Medida,Tendencia,Fecha
1,Cereales,Cebada,"154,00 / 0,00","155,00 / 0,00",100,Eur / Tm.,,23-07-2018
1,Cereales,Cebada,"155,00 / 0,00","159,00 / 0,00",400,Eur / Tm.,,30-07-2018
1,Cereales,Cebada,"159,00 / 0,00","165,00 / 0,00",600,Eur / Tm.,,06-08-2018
1,Cereales,Cebada,"165,00 / 0,00","175,00 / 0,00",1000,Eur / Tm.,,13-08-2018
1,Cereales,Cebada,"165,00 / 0,00","175,00 / 0,00",1000,Eur / Tm.,,13-08-2018


# Clean Data

In [69]:
def get_price(row, row_name):
    """Parse the first number of a ``"<number> / <number>"`` cell into a float.

    Parameters
    ----------
    row : mapping
        Object indexable by ``row_name`` (for example a DataFrame row).
    row_name : str
        Key of the cell to parse.

    Returns
    -------
    float
        The number before the first ``/`` with its decimal comma converted to
        a dot, or ``np.nan`` when the value is zero, blank or missing.

    Raises
    ------
    ValueError
        If a non-blank string cannot be parsed as a number.
    """
    raw = row[row_name]
    if isinstance(raw, str):
        text = raw.split("/")[0].strip().replace(",", ".")
        if not text:
            return np.nan
        value = float(text)
    else:
        # Numeric or missing cells (NaN/None) have no ``split``. Accepting them
        # also allows the cleaning to be re-run on an already converted column.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return np.nan
    # A zero is reported as missing, as in the original implementation.
    if value == 0.0:
        return np.nan
    return value

In [70]:
# Replace the raw "Anterior" text with the value get_price extracts from each row.
final_data["Anterior"] = final_data.apply(get_price, axis=1, args=("Anterior",))

In [71]:
# Replace the raw "Último" text with the value get_price extracts from each row.
final_data["Último"] = final_data.apply(get_price, axis=1, args=("Último",))

In [72]:
# Replace the raw "Diferencia" text with the value get_price extracts from each row.
# NOTE(review): get_price returns NaN for 0, so a zero difference becomes a
# missing value here — confirm that this is intended.
final_data["Diferencia"] = final_data.apply(get_price, axis=1, args=("Diferencia",))

In [73]:
# Last rows of the "Cebada" category after the price columns were converted.
final_data.loc[final_data["Categoría"] == "Cebada"].tail()

Unnamed: 0,Producto,Categoría,Anterior,Último,Diferencia,Medida,Tendencia,Fecha
1,Cereales,Cebada,154.0,155.0,1.0,Eur / Tm.,,23-07-2018
1,Cereales,Cebada,155.0,159.0,4.0,Eur / Tm.,,30-07-2018
1,Cereales,Cebada,159.0,165.0,6.0,Eur / Tm.,,06-08-2018
1,Cereales,Cebada,165.0,175.0,10.0,Eur / Tm.,,13-08-2018
1,Cereales,Cebada,165.0,175.0,10.0,Eur / Tm.,,13-08-2018


# Get Grouped Data

In [74]:
# Map each distinct "Categoría" value to the sub-frame holding its rows.
dict_df = dict(iter(final_data.groupby("Categoría")))

# Impute Missing Data

In [86]:
# Fill the gaps in each category's price columns with Series.interpolate.
# ``assign`` builds a new frame per category instead of writing into the
# sub-frame produced by groupby, which can raise SettingWithCopyWarning on
# some pandas versions.
for key, category_df in list(dict_df.items()):
    dict_df[key] = category_df.assign(
        **{
            "Anterior": category_df["Anterior"].interpolate(axis=0),
            "Último": category_df["Último"].interpolate(axis=0),
        }
    )

# Export Weekly Data

In [89]:
# Create the Excel writer for 'weekly_data.xlsx', using the xlsxwriter engine.
writer_weekly = pd.ExcelWriter('weekly_data.xlsx', engine='xlsxwriter')

In [None]:
# Write one worksheet per category.
# Characters removed from sheet names. The backslash is written as '\\':
# the previous '\"' was an escaped double quote, so backslashes were never
# stripped. The double quote is still removed for backward compatibility.
chars = '[]:*?/\\"'
used_sheet_names = set()
for key, category_df in dict_df.items():
    sheet_name = key[:30]
    for c in chars:
        sheet_name = sheet_name.replace(c, "")
    # Truncation and stripping can make two categories share a name; append a
    # numeric suffix (compared case-insensitively, within 31 characters) so the
    # export does not abort on a duplicate sheet.
    base_name = sheet_name
    suffix = 1
    while sheet_name.lower() in used_sheet_names:
        suffix += 1
        tag = "_{0}".format(suffix)
        sheet_name = base_name[:31 - len(tag)] + tag
    used_sheet_names.add(sheet_name.lower())
    print(sheet_name)
    category_df.to_excel(writer_weekly, sheet_name=sheet_name)

In [100]:
# Write the workbook to disk and release the file handle. ``close()`` replaces
# ``ExcelWriter.save()``, which was deprecated in pandas 1.5 and removed in 2.0.
writer_weekly.close()