In [None]:
%%capture
!pip install -U scikit-learn==1.0

In [None]:
import numpy as np 
import pandas as pd
import zipfile
import os
from tqdm import tqdm
import gc
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to the smallest dtype that fits.

    The frame is modified in place (columns are re-assigned) and also returned.

    * Columns that are not boolean/integer/float (strings, datetimes, ...) are
      left untouched.
    * Missing values in a numeric column are replaced by ``min - 1`` so the
      column can be stored as an integer; such columns are reported in the
      returned list.
    * A column is converted to an integer dtype only if *every* value is a
      whole number; otherwise it becomes ``float32``.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    tuple
        ``(props, NAlist)`` where ``NAlist`` is the list of column names whose
        missing values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    NAlist = []  # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        # Only bool / signed int / unsigned int / float columns are downcast.
        if props[col].dtype.kind not in "biuf":
            continue

        mn = props[col].min()
        mx = props[col].max()

        # Integer dtypes cannot hold NaN, therefore NaN needs to be filled.
        if props[col].isnull().any():
            NAlist.append(col)
            if pd.isnull(mn):
                # Entirely missing column: no minimum to derive a sentinel from.
                props[col] = props[col].astype(np.float32)
                continue
            # Keep `mn` in sync with the sentinel, otherwise a column whose
            # real minimum is 0 would be cast to an unsigned type and the
            # sentinel (-1) would wrap around.
            mn = mn - 1
            props[col] = props[col].fillna(mn)

        values = props[col].to_numpy()
        if values.dtype.kind == "f":
            if not np.isfinite(values).all():
                # +/-inf cannot be represented by an integer dtype.
                props[col] = props[col].astype(np.float32)
                continue
            # Element-wise round trip through int64: summing the differences
            # would let positive and negative fractions cancel each other out.
            with np.errstate(invalid="ignore"):
                is_int = bool((values == values.astype(np.int64)).all())
        else:
            is_int = True

        if not is_int:
            # Make float datatypes 32 bit
            props[col] = props[col].astype(np.float32)
            continue

        # Pick the narrowest (un)signed integer type whose inclusive range
        # covers [mn, mx]; leave the column unchanged if none does.
        candidates = unsigned_types if mn >= 0 else signed_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= mn and mx <= limits.max:
                props[col] = props[col].astype(candidate)
                break

    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of:", round(start_mem_usg, 1), " MB", " reduced to: ", round(mem_usg, 1)," MB")
    return props, NAlist


def chunk_read_csv(fpath, chunksize=10000000):
    """Read a CSV file in chunks of ``chunksize`` rows and return one DataFrame.

    The chunks are collected first and concatenated once, instead of growing
    the result with one ``pd.concat`` per chunk (which re-copies all rows read
    so far on every iteration).

    Parameters
    ----------
    fpath : str
        Path of the CSV file.
    chunksize : int, default 10000000
        Number of rows per chunk passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All rows of the file; an empty frame if the reader yields no chunk.
    """
    chunks = list(pd.read_csv(fpath, chunksize=chunksize))
    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks)


# Unpack every "<name>.csv.zip" archive found under /kaggle/input into the
# working directory and expose its table as a module-level variable "<name>".
# Assumes each archive contains a file called "<name>.csv" — TODO confirm.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in tqdm(filenames):
        if not filename.endswith(".csv.zip"):
            # The name slicing below only makes sense for "*.csv.zip" files.
            continue
        # Context manager so the archive handle is closed after extraction.
        with zipfile.ZipFile(os.path.join(dirname, filename)) as archive:
            archive.extractall()
        table_name = filename[:-len(".csv.zip")]
        csv_name = filename[:-len(".zip")]
        # globals() is the documented way to create module-level names;
        # writing to locals() is not guaranteed to have any effect.
        globals()[table_name] = reduce_mem_usage(chunk_read_csv(os.path.join(".", csv_name)))[0]
        gc.collect()

In [None]:
# Mapping from the original (Spanish) column names to English ones.
# Each key appears exactly once; the literal previously repeated three keys.
TO_ENG_DICT = {
    "Semana": "week_number", # From Thursday to Wednesday
    "Agencia_ID": "sales_depot_id", 
    "Canal_ID": "sales_channel_id", 
    "Ruta_SAK": "route_id", # Several routes = Sales Depot
    "Cliente_ID": "client_id", 
    "NombreCliente": "client_name", 
    "Producto_ID": "product_id", 
    "NombreProducto": "product_name", 
    "Venta_uni_hoy": "sales_unit_this_week", # integer
    "Venta_hoy": "sales_this_week", # unit: pesos
    "Dev_uni_proxima": "returns_unit_next_week", # integer
    "Dev_proxima": "returns_next_week", # unit: pesos
    "Demanda_uni_equil": "adjusted_demand", # integer, target
    "Town": "town",
    "State": "state",
}

# Original name of the target column (mapped to "adjusted_demand" above).
TARGET = "Demanda_uni_equil"

# Apply the English column names to every loaded table; names that are not
# present in a given table are simply ignored by `rename`.
cliente_tabla, producto_tabla, town_state, train, test = (
    table.rename(columns=TO_ENG_DICT, copy=False)
    for table in (cliente_tabla, producto_tabla, town_state, train, test)
)

In [None]:
# Preview the first two rows of each lookup table.
display(
    cliente_tabla.head(2),
    producto_tabla.head(2),
    town_state.head(2),
)

In [None]:
# Preview the first two rows of the sample submission.
display(sample_submission.iloc[:2])

In [None]:
# Preview the first two rows of the train and test tables.
display(train.head(2), test.head(2))

In [None]:
# (train has any missing value, test has any missing value)
tuple(frame.isnull().any().any() for frame in (train, test))

In [None]:
# def unique_in_test(train_vals, test_vals):
#     test_set = set(test_vals)
#     intersect_set = set(np.intersect1d(train_vals, test_vals))
#     diff_set = test_set - intersect_set
#     return len(list(diff_set)), round(len(list(diff_set)) / len(list(test_set)), 3)

# print(unique_in_test(train.sales_depot_id, test.sales_depot_id))
# print(unique_in_test(train.sales_channel_id, test.sales_channel_id))
# print(unique_in_test(train.route_id, test.route_id))
# print(unique_in_test(train.product_id, test.product_id))
# print(unique_in_test(train.client_id, test.client_id))

In [None]:
# Order the training rows by week number, then give the collector a moment
# to release the previous row order.
train.sort_values(by="week_number", ignore_index=True, inplace=True)
gc.collect()
time.sleep(2)
gc.collect()

# Order the test rows by id and remove the id column.
test.sort_values(by="id", ignore_index=True, inplace=True)
del test["id"]
gc.collect()
time.sleep(2)
gc.collect()

# Feature Generation

In [None]:
# train = train.head(len(train) // 100)
# gc.collect()
# time.sleep(2)
# test = test.head(len(test) // 100)
# gc.collect()
# time.sleep(2)
# train.shape, test.shape

In [None]:
# Replace the town / state strings by integer codes and shrink the lookup table.
town_state["town"] = LabelEncoder().fit_transform(town_state["town"].values)
town_state["state"] = LabelEncoder().fit_transform(town_state["state"].values)
town_state = reduce_mem_usage(town_state)[0]

# Attach town / state to every row via the depot id.
# how="left" keeps every train/test row in its existing order; the default
# inner join would drop rows whose depot is missing from `town_state`, which
# would break the positional alignment between `test` and the submission.
# Rows without a match get NaN in town/state.
n_test_rows = len(test)
train = train.merge(town_state, on="sales_depot_id", how="left", copy=False)
test = test.merge(town_state, on="sales_depot_id", how="left", copy=False)
# Duplicate depot ids in `town_state` would multiply rows; fail early if so.
assert len(test) == n_test_rows, "town_state contains duplicate sales_depot_id values"
gc.collect(); time.sleep(2); gc.collect()

In [None]:
def calc_features(df, col):
    """Aggregate sales statistics of ``df`` for each distinct value of ``col``.

    Computes the mean and max of ``sales_unit_this_week`` and the mean of
    ``sales_this_week`` per group.

    Returns a DataFrame indexed by ``col`` whose columns are named
    ``<source column>_<statistic>_<col>``.
    """
    aggregations = {
        "sales_unit_this_week": ["mean", "max"],
        "sales_this_week": "mean",
    }
    grouped = df.groupby(col).agg(aggregations)
    # Flatten the (source column, statistic) pairs into single names and tag
    # them with the grouping column.
    grouped.columns = [
        "{}_{}_{}".format(source, statistic, col)
        for source, statistic in grouped.columns
    ]
    return grouped

def add_feature(X_train, X_test, col):
    """Left-join the per-``col`` statistics of ``X_train`` onto both frames.

    The statistics come from ``calc_features(X_train, col)`` (downcast with
    ``reduce_mem_usage``), so the test frame only receives values computed on
    the training frame.

    Returns the tuple ``(merged X_train, merged X_test)``.
    """
    features_df, _ = reduce_mem_usage(calc_features(X_train, col))
    merge_options = dict(on=col, copy=False, how="left")
    train_with_features = X_train.merge(features_df, **merge_options)
    test_with_features = X_test.merge(features_df, **merge_options)
    return train_with_features, test_with_features

In [None]:
# train, test = add_feature(train, test, "sales_channel_id")
# gc.collect(); time.sleep(2); gc.collect()
# train, test = add_feature(train, test, "sales_depot_id")
# gc.collect(); time.sleep(2); gc.collect()
# train, test = add_feature(train, test, "town")
# gc.collect(); time.sleep(2); gc.collect()
# train, test = add_feature(train, test, "state")
# gc.collect(); time.sleep(2); gc.collect()
# train, test = add_feature(train, test, "route_id")
# gc.collect(); time.sleep(2); gc.collect()

In [None]:
# Replace every remaining missing value by 0, in place, in both tables.
train.fillna(value=0, inplace=True)
test.fillna(value=0, inplace=True)

## Cross-Validation Parameter Tuning

In [None]:
# Run a garbage-collection pass before the tuning data splits are created.
gc.collect()

In [None]:
# Fit on the first 20% of the rows of `train` and validate on the rows after
# the 90% mark.
X_train = train.iloc[:int(len(train) * 0.2)]
X_test = train.iloc[int(len(train) * 0.9):]

In [None]:
# Columns removed from the feature matrices: the week number, the target
# itself, and the sales/returns columns that are dropped alongside it.
NON_FEATURE_COLUMNS = ["week_number", "sales_unit_this_week", "sales_this_week",
                       "returns_unit_next_week", "returns_next_week", "adjusted_demand"]

# X_train / X_test are slices of `train`; a non-inplace drop builds new frames
# instead of mutating a slice (which triggers SettingWithCopyWarning).
y_train = X_train["adjusted_demand"]
X_train = X_train.drop(columns=NON_FEATURE_COLUMNS)

y_test = X_test["adjusted_demand"]
X_test = X_test.drop(columns=NON_FEATURE_COLUMNS)

In [None]:
# Number of rows in the fitting and validation parts.
X_train.shape[0], X_test.shape[0]

### Random Forest

In [None]:
model = RandomForestRegressor(n_estimators=1, 
                              max_depth=5, 
                              min_samples_leaf=100000, 
                              min_weight_fraction_leaf=0.3, 
                              random_state=42)

params_grid = {
    "n_estimators": [2, 3],
    "max_depth": [4, 5],
    "min_samples_leaf": [100000, 200000],
}

# Single predefined split over the concatenation [X_train; X_test]:
# fit on the first len(X_train) rows, validate on the rows that follow.
# The validation indices must be offset by len(X_train); indices starting at 0
# would score the model on rows it was fitted on.
n_fit_rows, n_val_rows = len(X_train), len(X_test)
holdout_split = [(np.arange(n_fit_rows), np.arange(n_fit_rows, n_fit_rows + n_val_rows))]

# random_state makes the sampled parameter combination reproducible.
search = RandomizedSearchCV(model, params_grid, n_iter=1, cv=holdout_split, 
                            scoring="neg_mean_squared_log_error", random_state=42)
search.fit(pd.concat([X_train, X_test], ignore_index=True), pd.concat([y_train, y_test], ignore_index=True))

In [None]:
# Parameters of the best estimator, restricted to the ones that were searched.
{
    name: value
    for name, value in search.best_estimator_.get_params().items()
    if name in params_grid
}

In [None]:
# The scorer is negated ("neg_..."), so flip the sign to show the error itself.
-search.best_score_

In [None]:
# NOTE(review): prints a literal 1 — presumably a leftover "cell finished" marker; confirm before removing.
print(1)

### XGBoost

In [None]:
%%time
boost = xgb.XGBRegressor(max_depth=4, 
                        learning_rate=0.1, 
                        n_estimators=10, 
                        silent=True, 
                        objective='reg:linear', 
                        random_state=42)
boost.fit(X_train.head(100000), y_train.head(100000), eval_metric='rmse', verbose=True, eval_set=[(X_test.head(10000), y_test.head(10000))],
              early_stopping_rounds=100)
mean_squared_log_error(y_test.head(10000), boost.predict(X_test.head(10000)), squared=False)

In [None]:
def rmsle(predictions, dmat):
    """Root mean squared logarithmic error, usable as a custom eval metric.

    Parameters
    ----------
    predictions : array-like
        Predicted values.
    dmat : object with a ``get_label()`` method
        Container holding the true labels.

    Returns
    -------
    tuple
        ``('RMSLE', value)``.
    """
    labels = dmat.get_label()
    # log(1 + p) is undefined for p <= -1, so negative predictions are clipped
    # to 0 first; otherwise one bad prediction turns the whole metric into NaN.
    clipped = np.maximum(predictions, 0)
    log_diffs = np.log1p(clipped) - np.log1p(labels)
    return ('RMSLE', np.sqrt(np.mean(np.square(log_diffs))))

# `verbosity=0` and 'reg:squarederror' replace the deprecated `silent=True`
# and 'reg:linear' spellings of the same settings.
model = xgb.XGBRegressor(max_depth=4, 
                        learning_rate=0.1, 
                        n_estimators=10, 
                        verbosity=0, 
                        objective='reg:squarederror', 
                        random_state=42)

params_grid = {
    "max_depth": [4, 5],
    "lambda": [0.0, 0.1],
}

# Single predefined split over the concatenation [X_train; X_test]:
# fit on the first len(X_train) rows, validate on the rows that follow.
# The validation indices must be offset by len(X_train); indices starting at 0
# would score the model on rows it was fitted on.
n_fit_rows, n_val_rows = len(X_train), len(X_test)
holdout_split = [(np.arange(n_fit_rows), np.arange(n_fit_rows, n_fit_rows + n_val_rows))]

# random_state makes the sampled parameter combination reproducible.
# NOTE(review): the "neg_mean_squared_log_error" scorer fails on negative
# predictions, which a squared-error objective can produce — confirm.
search = RandomizedSearchCV(model, params_grid, n_iter=1, cv=holdout_split, 
                            scoring="neg_mean_squared_log_error", random_state=42)
search.fit(pd.concat([X_train, X_test], ignore_index=True), pd.concat([y_train, y_test], ignore_index=True),
           eval_metric=rmsle, verbose=True, eval_set=[(X_test, y_test)], early_stopping_rounds=100)

In [None]:
# Parameters of the best estimator, restricted to the ones that were searched.
{
    name: value
    for name, value in search.best_estimator_.get_params().items()
    if name in params_grid
}

In [None]:
# The scorer is negated ("neg_..."), so flip the sign to show the error itself.
-search.best_score_

## Model Prediction

In [None]:
# Detach the target column from `train`, then remove the remaining columns
# that are not used as features.
train_target = train.pop("adjusted_demand")
unused_train_columns = ["week_number", "sales_unit_this_week", "sales_this_week",
                        "returns_unit_next_week", "returns_next_week"]
train.drop(columns=unused_train_columns, inplace=True)

# The only column removed from `test` is the week number.
test.drop(columns=["week_number"], inplace=True)

In [None]:
%%time
# Fit the random forest on the full training table.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=2,
    max_depth=5,
    min_samples_leaf=100000,
    min_weight_fraction_leaf=0.3,
)
model.fit(train, train_target)

In [None]:
# Predict with whichever estimator `model` currently refers to.
# NOTE(review): assumes `test` has the same feature columns, in the same order,
# as the frame `model` was fitted on — confirm.
preds = model.predict(test)

In [None]:
# Overwrite the target column of the sample submission with the predictions
# and write the result to disk without the index.
# NOTE(review): the assignment is positional, so it presumes `preds` is in the
# same row order as `sample_submission` — confirm.
sample_submission[TARGET] = preds
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Run a garbage-collection pass.
gc.collect()
# NOTE(review): prints a literal 1 — presumably a leftover "cell finished" marker; confirm before removing.
print(1)

In [None]:
%%time
# `verbosity=0` and 'reg:squarederror' replace the deprecated `silent=True`
# and 'reg:linear' spellings of the same settings.
boost = xgb.XGBRegressor(max_depth=4, 
                        learning_rate=0.1, 
                        n_estimators=10, 
                        verbosity=0, 
                        objective='reg:squarederror', 
                        random_state=42)

# Fit on the fitting part and monitor RMSE on the validation part.
boost.fit(X_train, y_train, eval_metric='rmse', verbose=True, eval_set=[(X_test, y_test)], early_stopping_rounds=100)
# A squared-error regressor can predict negative values; clip them to 0 so the
# predictions stay valid for a logarithmic error metric.
preds = np.clip(boost.predict(test), 0, None)

In [None]:
# NOTE(review): rebinds `model` to a fresh, unfitted default RandomForestRegressor —
# looks like scratch code; confirm it is still needed.
model = RandomForestRegressor()

In [None]:
# NOTE(review): rebinds `model` to a fresh, unfitted default XGBRegressor —
# looks like scratch code; confirm it is still needed.
model = xgb.XGBRegressor()

In [None]:
# NOTE(review): bare attribute access — this only displays the bound `fit` method,
# nothing is fitted here.
model.fit