In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
def format_merge_data(df_ventas, df_detail):
    """Clean the sales and product tables and left-join them on ``product_id``.

    Neither input frame is modified: every step works on a derived frame, so
    the notebook cell calling this function can be re-run against the same
    raw inputs and produce the same result.

    The shape of each table is printed before and after exact duplicate rows
    are dropped (sales first, then products).

    Args:
        df_ventas: Sales rows with a ``periodo`` column in ``%Y%m`` form and a
            ``product_id`` column.
        df_detail: Product rows with ``product_id``, ``cat1``, ``cat2`` and
            ``cat3`` columns.

    Returns:
        A new DataFrame with one row per de-duplicated sales row, ordered by
        ``periodo`` (converted to datetime), with the product columns attached
        and ``cat1``/``cat2``/``cat3`` converted to upper-case strings.

    Raises:
        AssertionError: If the merge changes the number of sales rows, i.e.
            ``product_id`` is not unique in the cleaned product table.
    """
    # ``assign`` returns a new frame, so the caller's ``df_ventas`` keeps its
    # original ``periodo`` values and row order.
    ventas = df_ventas.assign(
        periodo=pd.to_datetime(df_ventas['periodo'], format='%Y%m')
    ).sort_values(by='periodo')
    print(ventas.shape)
    ventas = ventas.drop_duplicates()
    print(ventas.shape)

    print(df_detail.shape)
    detail = df_detail.drop_duplicates()
    print(detail.shape)

    # Hard-coded exclusions of two product ids.
    # NOTE(review): presumably these ids still have conflicting rows after
    # de-duplication and would multiply sales rows in the merge - confirm.
    detail = detail[~detail['product_id'].isin([20230, 20623])]

    df = pd.merge(ventas, detail, how='left', on='product_id')
    assert df.shape[0] - ventas.shape[0] == 0, 'hay duplicados'

    # ``astype(str)`` also turns missing categories (e.g. products without a
    # match in ``detail``) into the literal string 'NAN' after upper-casing.
    columns_to_uppercase = ['cat1', 'cat2', 'cat3']
    df[columns_to_uppercase] = df[columns_to_uppercase].apply(lambda x: x.astype(str).str.upper())
    return df

def add_time_features(df):
    """Return a copy of ``df`` with calendar features derived from ``periodo``.

    ``periodo`` is converted to datetime, then the following columns are
    added in this order: ``month``, ``year``, ``day_of_week``,
    ``day_of_month``, ``week_of_year``, ``quarter``, and sine/cosine encodings
    of the month (period 12) and of the day of week (period 7).

    Args:
        df: Frame with a ``periodo`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    out = df.copy()
    out['periodo'] = pd.to_datetime(out['periodo'])
    stamp = out['periodo'].dt

    # Plain calendar components, inserted in a fixed column order.
    calendar_parts = {
        'month': stamp.month,
        'year': stamp.year,
        'day_of_week': stamp.dayofweek,
        'day_of_month': stamp.day,
        'week_of_year': stamp.isocalendar().week.astype(int),
        'quarter': stamp.quarter,
    }
    for feature_name, values in calendar_parts.items():
        out[feature_name] = values

    # Cyclical encodings so that the last and first value of a cycle are close.
    month_angle = 2 * np.pi * out['month'] / 12
    weekday_angle = 2 * np.pi * out['day_of_week'] / 7
    out['sin_month'] = np.sin(month_angle)
    out['cos_month'] = np.cos(month_angle)
    out['sin_day_of_week'] = np.sin(weekday_angle)
    out['cos_day_of_week'] = np.cos(weekday_angle)
    return out

def create_lag_features(df, num_lags, column_name):
    """Return a copy of ``df`` with lagged values and lagged differences.

    For every lag ``k`` from 1 to ``num_lags`` two columns are added, in this
    order: ``{column_name}_lag{k}`` (the value ``k`` rows earlier) and
    ``{column_name}_diff_lag{k}`` (the current value minus that earlier
    value). The first ``k`` rows of each pair are NaN.

    Args:
        df: Input frame; rows are used in their existing order.
        num_lags: Highest lag to generate; values below 1 add no columns.
        column_name: Column to derive the features from.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    lagged = df.copy()
    source = lagged[column_name]
    lag = 1
    while lag <= num_lags:
        lagged[f'{column_name}_lag{lag}'] = source.shift(lag)
        lagged[f'{column_name}_diff_lag{lag}'] = source.diff(lag)
        lag += 1
    return lagged

def create_rolling_features(df, windows, column_name):
    """Return a copy of ``df`` with rolling and expanding statistics.

    For every ``window`` in ``windows`` the columns
    ``{column_name}_roll_mean_{window}`` and ``{column_name}_roll_std_{window}``
    are added. ``{column_name}_expanding_mean`` and
    ``{column_name}_expanding_std`` are added once, right after the columns of
    the first window; if ``windows`` is empty no column is added at all.

    The expanding statistics do not depend on the window, so they are computed
    a single time instead of once per window.

    Args:
        df: Input frame; rows are used in their existing order.
        windows: Iterable of rolling window sizes.
        column_name: Column to derive the features from.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    df = df.copy()
    series = df[column_name]

    # Loop-invariant: identical for every window size.
    expanding = series.expanding()
    expanding_mean = expanding.mean()
    expanding_std = expanding.std()

    for position, window in enumerate(windows):
        rolling = series.rolling(window=window)
        df[f'{column_name}_roll_mean_{window}'] = rolling.mean()
        df[f'{column_name}_roll_std_{window}'] = rolling.std()
        if position == 0:
            # Written inside the loop so the columns keep their historical
            # position (after the first window's columns) and are skipped
            # entirely when no window is requested.
            df[f'{column_name}_expanding_mean'] = expanding_mean
            df[f'{column_name}_expanding_std'] = expanding_std
    return df

def scale_data(df_x):
    """Return a copy of ``df_x`` with its numeric columns standardised.

    Every numeric column except ``tn`` is replaced by the output of a
    ``StandardScaler`` fitted on all rows of the frame passed in. ``tn`` and
    non-numeric columns are left as they are.

    Args:
        df_x: Frame to scale.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    scaled = df_x.copy()
    numeric_columns = scaled.select_dtypes(include=[np.number]).columns
    # 'tn' is kept on its original scale.
    columns_to_scale = numeric_columns.difference(['tn'])
    scaled[columns_to_scale] = StandardScaler().fit_transform(scaled[columns_to_scale])
    return scaled

def split_shift(df_x):
    """Turn ``tn`` into a two-rows-ahead target and split on its availability.

    ``tn`` is shifted up by two rows, so each row carries the ``tn`` value
    found two rows later. Rows whose shifted ``tn`` is present form the
    training set; rows where it is missing (at least the last two rows) form
    the prediction set.

    Args:
        df_x: Frame with a ``tn`` column; rows are used in their existing order.

    Returns:
        A ``(train_set, predict_set)`` tuple of DataFrames. The input frame is
        left untouched.
    """
    shifted = df_x.assign(tn=df_x['tn'].shift(-2))
    target_missing = shifted['tn'].isna()
    return shifted[~target_missing], shifted[target_missing]

def pipeline(df_x):
    """Build the scaled feature matrix for one sales history.

    Steps, in order: drop the ``cat1``/``cat2``/``cat3``/``brand`` columns,
    add 12 lags and rolling statistics over windows 3, 6 and 12 for ``tn``
    and then for ``cust_request_qty``, drop ``periodo``, replace missing
    values with 0 and standardise the numeric columns via ``scale_data``.

    Args:
        df_x: Frame containing at least the columns named above.

    Returns:
        A new DataFrame of features; the input frame is left untouched.
    """
    features = df_x.drop(['cat1', 'cat2', 'cat3', 'brand'], axis=1)

    # Same lag and rolling recipe for both series, 'tn' first.
    for source_column in ('tn', 'cust_request_qty'):
        features = create_lag_features(features, 12, source_column)
        features = create_rolling_features(features, [3, 6, 12], source_column)

    features = features.drop(columns=['periodo'])
    # TODO: revisit the imputation strategy here; zero-filling is a placeholder.
    features = features.fillna(0)
    return scale_data(features)

def predict_random_forest(df):
    """Forecast ``tn`` two rows ahead of the last row of ``df``.

    The frame is passed through ``pipeline`` and ``split_shift``; a
    500-tree ``RandomForestRegressor`` (``random_state=42``) is fitted on the
    rows whose shifted target is known and then asked to predict for the last
    row of the remaining rows.

    Args:
        df: Sales history accepted by ``pipeline``.

    Returns:
        The single predicted value.
    """
    features = pipeline(df.copy())
    train_df, predict_df = split_shift(features)

    forest = RandomForestRegressor(n_estimators=500, random_state=42)
    forest.fit(train_df.drop(columns=['tn']), train_df['tn'])

    # Only the most recent row is scored.
    latest_row = predict_df.drop(columns=['tn']).tail(1)
    return forest.predict(latest_row)[0]

# Load data

In [3]:
# All input tables are tab-separated text files stored under `path_data`.
path_data = '../labo3/data/'
df_ventas = pd.read_csv(f'{path_data}sell-in.txt', sep='\t', decimal='.')
df_detail = pd.read_csv(f'{path_data}tb_productos.txt', sep='\t')
df_product_to_predict = pd.read_csv(f'{path_data}productos_a_predecir.txt', sep='\t')
df_stocks = pd.read_csv(f'{path_data}tb_stocks.txt', sep='\t')

## Format data

In [4]:
# Merge sales with product details, then keep only the products listed in
# `df_product_to_predict`.
df = format_merge_data(df_ventas, df_detail)
df = df.loc[
    df['product_id'].isin(df_product_to_predict['product_id'].to_list())
]

(2945818, 7)
(2945818, 7)
(1262, 6)
(1253, 6)


# Train loop

In [None]:
# final_dict = {}

# for prod_id in df['product_id'].unique():
#     temp_list = []    
#     for cust_id in tqdm(df['customer_id'].unique()):        
#         fix_cust = df[(df['customer_id'] == cust_id) & (df['product_id'] == prod_id)]
#         fix_cust = fix_cust.drop(columns=['customer_id','product_id'])        
#         if fix_cust.shape[0] == 0:
#             temp_list.append(0)
#         elif fix_cust.shape[0] < 6:
#             temp_list.append(fix_cust['tn'].mean())
#         else:            
#             temp_list.append(predict_random_forest(fix_cust))            
#         final_dict[prod_id] = sum(temp_list)  

In [None]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

def process_product_ids(chunk, data=None):
    """Forecast the total ``tn`` of every product id in ``chunk``.

    For each product, every customer with rows for that product contributes
    either the mean of its ``tn`` values (fewer than 6 rows) or the value
    returned by ``predict_random_forest`` (6 rows or more). Customers without
    rows for the product contribute nothing. The product's result is the sum
    of those contributions, or ``0`` when no customer has rows.

    The frame is filtered once per product and then grouped by customer,
    instead of being rescanned for every (product, customer) pair.

    Args:
        chunk: Iterable of product ids to process.
        data: Frame with ``customer_id``, ``product_id`` and ``tn`` columns
            (plus whatever ``predict_random_forest`` needs). Defaults to the
            notebook-level ``df`` so existing one-argument calls keep working.

    Returns:
        Dict mapping each product id in ``chunk`` to its summed forecast.
    """
    if data is None:
        # Backward-compatible default: the frame built earlier in the notebook.
        data = df

    # Computed once: contributions are summed in this customer order for every
    # product, which keeps the floating-point result independent of how the
    # rows of a single product happen to be ordered.
    customer_order = data['customer_id'].unique()

    local_dict = {}
    for prod_id in chunk:
        product_rows = data[data['product_id'] == prod_id]
        history_by_customer = {
            cust_id: rows
            for cust_id, rows in product_rows.groupby('customer_id', sort=False)
        }

        temp_list = []
        for cust_id in customer_order:
            fix_cust = history_by_customer.get(cust_id)
            if fix_cust is None:
                # No sales for this customer/product pair: contributes 0.
                continue
            fix_cust = fix_cust.drop(columns=['customer_id', 'product_id'])
            if fix_cust.shape[0] < 6:
                # Too little history to train on; fall back to the mean.
                temp_list.append(fix_cust['tn'].mean())
            else:
                temp_list.append(predict_random_forest(fix_cust))
        local_dict[prod_id] = sum(temp_list)
    return local_dict

def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    The last chunk is shorter when ``len(lst)`` is not a multiple of ``n``.
    ``lst`` only needs to support ``len`` and slicing.
    """
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def main(df, chunk_size=10):
    """Forecast every product in ``df`` using a thread pool.

    The unique product ids are split into chunks of ``chunk_size`` and each
    chunk is handed to ``process_product_ids`` together with ``df`` itself, so
    the workers operate on the frame passed here rather than on whatever
    global ``df`` happens to exist.

    Args:
        df: Frame with ``customer_id``, ``product_id`` and ``tn`` columns.
        chunk_size: Number of product ids per submitted task.

    Returns:
        Dict mapping each product id to its summed forecast.
    """
    product_ids = df['product_id'].unique()
    product_chunks = list(chunks(product_ids, chunk_size))

    final_dict = {}
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_product_ids, chunk, df) for chunk in product_chunks]
        for future in as_completed(futures):
            # ``result()`` re-raises any exception raised inside the worker.
            final_dict.update(future.result())

    return final_dict

# Run the threaded forecast over `df`, the merged and filtered sales frame
# built in the earlier cells.
final_results = main(df=df)