In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from math import sqrt
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from multiprocessing import Pool, cpu_count
warnings.filterwarnings("ignore")


In [2]:
def format_data(df_ventas):
    """Parse, sort and de-duplicate the sales frame in place.

    The ``periodo`` column is parsed with the ``%Y%m`` format, the rows are
    ordered by ``periodo`` and exact duplicate rows are dropped. The frame
    shape is printed once before and once after the de-duplication.

    Parameters
    ----------
    df_ventas : pandas.DataFrame
        Frame holding a ``periodo`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same ``df_ventas`` object that was passed in.
    """
    periodo_parsed = pd.to_datetime(df_ventas['periodo'], format='%Y%m')
    df_ventas['periodo'] = periodo_parsed
    df_ventas.sort_values('periodo', inplace=True)

    # Shape before removing duplicates.
    shape_before = df_ventas.shape
    print(shape_before)

    df_ventas.drop_duplicates(inplace=True)

    # Shape after removing duplicates.
    shape_after = df_ventas.shape
    print(shape_after)
    return df_ventas

# Load data

In [3]:
# Every input is a tab-separated text file located under ``path_data``.
path_data = '../labo3/data/'
df_ventas = pd.read_csv(path_data + 'sell-in.txt', sep='\t', decimal='.')
df_detail = pd.read_csv(path_data + 'tb_productos_descripcion.txt', sep='\t')
df_product_to_predict = pd.read_csv(path_data + 'productos_a_predecir.txt', sep='\t')
df_stocks = pd.read_csv(path_data + 'tb_stocks.txt', sep='\t')

## Format data

In [4]:
# Clean the sales frame, then keep only the rows whose product_id is listed
# in df_product_to_predict.
df = (
    format_data(df_ventas)
    .loc[lambda sales: sales['product_id'].isin(df_product_to_predict['product_id'].to_list())]
)

(2945818, 7)
(2945818, 7)


In [5]:
# Collapse the frame to one row per (periodo, product_id) by summing ``tn``.
df = (
    df[['periodo', 'customer_id', 'product_id', 'tn']]
    .groupby(['periodo', 'product_id'])['tn']
    .sum()
    .reset_index()
)

## Adjusting the dataset

In [6]:
# Keep only the rows dated 2018-01-01 or later.
df = df.loc[df['periodo'] >= '2018-01-01']

# Train loop

In [7]:
# Ignore every warning from here on; this repeats the filter that the imports
# cell already installs.
warnings.filterwarnings("ignore")

# multiprocessing

In [8]:
def walk_forward_validation(data, order):
    """Score an ARIMA order with one-step-ahead walk-forward validation.

    The first 80% of the observations form the initial training history and
    the remaining ones the hold-out set. For every hold-out observation an
    ARIMA model is fitted on the current history, a one-step forecast is
    recorded, and the true observation is then appended to the history.

    Parameters
    ----------
    data : sequence of numbers (list, numpy array or pandas Series)
        Observations in chronological order. Only the values are used; any
        pandas index is ignored.
    order : tuple
        The ``(p, d, q)`` order passed to ``ARIMA``.

    Returns
    -------
    float
        Root mean squared error of the one-step forecasts on the hold-out set.

    Raises
    ------
    ValueError
        If the 80/20 split leaves the training or the hold-out part empty.
    """
    # Work on a plain array so that every access below is positional. Using
    # ``series[t]`` on a pandas Series is label-based and either raises
    # KeyError or relies on a deprecated positional fallback.
    values = np.asarray(data, dtype=float)
    split = int(len(values) * 0.8)
    train, test = values[:split], values[split:]
    if len(train) == 0 or len(test) == 0:
        raise ValueError(
            "walk_forward_validation needs a non-empty train and test split; "
            "got %d observation(s)" % len(values)
        )

    history = list(train)
    predictions = []
    for actual in test:
        model_fit = ARIMA(history, order=order).fit()
        predictions.append(model_fit.forecast()[0])
        # Reveal the true value only after it has been forecast.
        history.append(actual)
    return sqrt(mean_squared_error(test, predictions))


def optimize_arima(series, p_values, d_values, q_values):
    """Grid-search ARIMA orders and return the one with the lowest RMSE.

    Every ``(p, d, q)`` combination is scored with
    ``walk_forward_validation``; combinations whose evaluation raises are
    skipped.

    Parameters
    ----------
    series : sequence of numbers
        Observations forwarded unchanged to ``walk_forward_validation``.
    p_values, d_values, q_values : iterable of int
        Candidate values for each component of the ARIMA order.

    Returns
    -------
    tuple or None
        The best ``(p, d, q)`` found, or ``None`` when no combination could
        be evaluated.
    """
    # Materialise the inner grids so one-shot iterables (generators) are not
    # exhausted after the first value of ``p``.
    d_values = list(d_values)
    q_values = list(q_values)

    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    rmse = walk_forward_validation(series, order)
                except Exception:
                    # A single order failing to fit must not abort the search.
                    # ``Exception`` (not a bare ``except``) lets
                    # KeyboardInterrupt and SystemExit stop the run.
                    continue
                if rmse < best_score:
                    best_score, best_cfg = rmse, order
    return best_cfg

def predict_arima(series, p_values=range(0, 3), d_values=range(0, 3),
                  q_values=range(0, 3), steps=2):
    """Forecast ``steps`` periods ahead with the best ARIMA order found.

    The order is chosen by ``optimize_arima`` over the given grids. The model
    is then refitted on the whole series and the last value of the
    ``steps``-long forecast is returned.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    p_values, d_values, q_values : iterable of int, optional
        Candidate values for each component of the ARIMA order. The defaults
        reproduce the original hard-coded ``range(0, 3)`` grids.
    steps : int, optional
        Forecast horizon; the value for the last step is returned.
        Defaults to 2.

    Returns
    -------
    float
        The forecast for the last requested step, or ``series.mean()`` when
        no ARIMA order could be evaluated.
    """
    best_params = optimize_arima(series, p_values, d_values, q_values)
    if best_params is None:
        # Fallback: no candidate order could be scored.
        return series.mean()

    model_fit = ARIMA(series, order=best_params).fit()
    forecast = model_fit.forecast(steps=steps)
    # The forecast may come back as a pandas Series or a numpy array;
    # np.asarray makes the positional access valid for both.
    return np.asarray(forecast)[-1]

def process_products(product_ids, df):
    """Produce one forecast per product id.

    For each id the ``tn`` values of the matching rows in ``df`` are
    selected and the forecast is chosen by how many there are:

    * no rows: ``0``
    * fewer than 6 rows: the mean of the values
    * otherwise: the result of ``predict_arima`` on the values

    Parameters
    ----------
    product_ids : iterable
        Product identifiers to process.
    df : pandas.DataFrame
        Frame with ``product_id`` and ``tn`` columns.

    Returns
    -------
    dict
        Maps every product id to its forecast.
    """
    forecasts = {}
    for product in product_ids:
        tn_series = df.loc[df['product_id'] == product, 'tn']
        n_obs = len(tn_series)
        if n_obs == 0:
            forecasts[product] = 0
        elif n_obs < 6:
            forecasts[product] = tn_series.mean()
        else:
            forecasts[product] = predict_arima(tn_series)
    return forecasts

def chunkify(lst, n):
    """Generate consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
if __name__ == '__main__':
    # Index the frame by period. The guard and the non in-place calls make
    # the cell safe to re-run: once ``periodo`` is the index it is no longer
    # a column, and the original in-place version raised KeyError.
    if 'periodo' in df.columns:
        df = df.assign(periodo=pd.to_datetime(df['periodo'])).set_index('periodo')

    product_ids = df['product_id'].unique()

    # Cap the pool at the number of CPUs actually available (at most 96).
    num_cores = min(96, cpu_count())

    # Floor division yields 0 when there are fewer products than workers,
    # which would make chunkify() fail on a zero step; clamp to 1.
    chunk_size = max(1, len(product_ids) // num_cores)
    chunks = list(chunkify(product_ids, chunk_size))

    # Each worker call receives one chunk of product ids plus the full frame.
    with Pool(num_cores) as pool:
        results = pool.starmap(process_products, [(chunk, df) for chunk in chunks])

    # Merge the per-chunk dictionaries into a single product_id -> forecast map.
    final_dict = {}
    for result in results:
        final_dict.update(result)

In [None]:
# Turn the product_id -> forecast mapping into a two-column table and write
# it to CSV without the row index.
(
    pd.DataFrame.from_dict(final_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'product_id', 0: 'tn'})
    .to_csv('arima_optimal_12_months.csv', index=False)
)