<a href="https://colab.research.google.com/github/rieszpeti/SalesForecasting/blob/main/SalesForecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the two third-party packages imported by later cells.
# -U upgrades already-installed copies; -q keeps pip's output quiet.
# NOTE(review): versions are not pinned, so a fresh run may resolve to
# different releases than the ones that produced the saved outputs.
%pip install -Uq upgini catboost

[K     |████████████████████████████████| 89 kB 1.7 MB/s 
[K     |████████████████████████████████| 76.6 MB 28 kB/s 
[K     |████████████████████████████████| 1.6 MB 16.1 MB/s 
[K     |████████████████████████████████| 2.0 MB 26.5 MB/s 
[K     |████████████████████████████████| 12.2 MB 38.1 MB/s 
[?25h

In [4]:
from os.path import exists
import pandas as pd

# Prefer a local copy of the archive; otherwise read it straight from the
# upgini GitHub repository.
local_archive = "train.csv.zip"
if exists(local_archive):
    df_path = local_archive
else:
    df_path = "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"

# Build the working frame in a single chain:
#   1. draw a fixed-seed random sample of 19,000 rows,
#   2. turn the store / item identifiers into strings,
#   3. parse the date column into datetimes,
#   4. order rows chronologically and renumber the index from zero.
df = (
    pd.read_csv(df_path)
    .sample(n=19_000, random_state=0)
    .astype({"store": str, "item": str})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [5]:
# Chronological hold-out: rows dated before the cutoff form the training
# set, rows on or after it form the test set.
split_date = "2017-01-01"
in_train_period = df["date"] < split_date
in_test_period = df["date"] >= split_date
train = df.loc[in_train_period]
test = df.loc[in_test_period]

In [6]:
def _split_features_target(frame):
    """Return ``(features, target)``: the frame without "sales", and the "sales" column."""
    return frame.drop(columns=["sales"]), frame["sales"]


train_features, train_target = _split_features_target(train)
test_features, test_target = _split_features_target(test)

In [7]:
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# The "date" column is the only search key handed to the enricher, and
# cross-validation is configured as time-series.
date_search_keys = {"date": SearchKey.DATE}
enricher = FeaturesEnricher(
    search_keys=date_search_keys,
    cv=CVType.time_series,
)

# Fit on the training split, passing the test split as the single evaluation set.
holdout_eval_set = [(test_features, test_target)]
enricher.fit(train_features, train_target, eval_set=holdout_eval_set)

<IPython.core.display.Javascript object>

Detected task type: ModelTaskType.REGRESSION


Column name,Status,Description
date,All valid,All values in this column are good to go
target,All valid,All values in this column are good to go


Running search request with search_id=c1167555-1d1f-4378-9a5e-87878e2c7d23
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
25 relevant feature(s) found with the search keys: ['date'][0m


provider,source,feature name,shap value,coverage %,type,feature type
,,item,0.488656,100.0,categorical,
,,store,0.172407,100.0,categorical,
Upgini,Public/Comm. shared,f_weather_date_weather_pca_0_d7e0a1fc,0.056252,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_sin1_847b5db1,0.047397,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_cos1_f6a8c1fc,0.030201,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_48_b39cd0c4,0.025656,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_24_2e14c9a6,0.018649,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_33_89bb7578,0.015129,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_year_cos1_9014a856,0.013152,100.0,numerical,Free
Upgini,Public/Comm. shared,f_financial_date_silver_14e835ea,0.007449,100.0,numerical,Free


In [8]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Seeded CatBoost regressor with logging and on-disk artifacts turned off.
model = CatBoostRegressor(
    random_state=0,
    verbose=False,
    allow_writing_files=False,
)

# Ask the enricher for MAPE on the training split and on the test split
# (passed as the evaluation set), using the regressor above as the estimator.
metrics_options = {
    "eval_set": [(test_features, test_target)],
    "estimator": model,
    "scoring": "mean_absolute_percentage_error",
}
enricher.calculate_metrics(train_features, train_target, **metrics_options)

Calculating metrics...
Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.254322,0.166509,0.087813
eval 1,100.0,0.267351,0.185123,0.082227


In [9]:
# Enrich both splits with the same call, train first and then test;
# keep_input=True is passed so the original columns are requested as well.
enriched_train_features, enriched_test_features = (
    enricher.transform(features, keep_input=True)
    for features in (train_features, test_features)
)
enriched_train_features.head()



Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=cd2e0993-1379-41dd-9ac9-d2d598a674b4
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=48e9e0d0-1332-4dcf-bb9f-bf18df729858
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Collecting selected features...
Done


Unnamed: 0,date,store,item,f_weather_date_weather_pca_0_d7e0a1fc,f_events_date_week_sin1_847b5db1,f_events_date_week_cos1_f6a8c1fc,f_weather_date_weather_umap_48_b39cd0c4,f_weather_date_weather_umap_24_2e14c9a6,f_weather_date_weather_umap_33_89bb7578,f_events_date_year_cos1_9014a856,...,f_events_date_italy_game_cnt_99570b80,f_financial_date_nasdaq_c568533e,f_financial_date_dow_jones_7d_to_7d_1y_shift_61f71e90,f_economic_date_cbpol_pca_3_27450634,f_financial_date_finance_umap_3_516aa6cd,f_economic_date_cbpol_umap_6_aa0352de,f_economic_date_cbpol_umap_1_7eb7a343,f_weather_date_weather_umap_34_c3ef5b4f,f_weather_date_weather_umap_45_d474bf8d,f_economic_date_cpi_umap_4_970cc061
0,2013-01-01,7,5,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
1,2013-01-01,4,9,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
2,2013-01-01,1,33,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
3,2013-01-01,3,41,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
4,2013-01-01,5,24,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208


In [10]:
# Baseline: fit on the un-enriched training features, predict the test
# split, and score the predictions with SMAPE.
# NOTE(review): `model` is the same estimator object used in other cells,
# so this call refits it in place.
preds = model.fit(train_features, train_target).predict(test_features)
eval_metric(test_target.values, preds, "SMAPE")

[37.65141857448004]

In [11]:
# Enriched run: refit the same estimator on the enriched training
# features, predict the enriched test split, and score with SMAPE.
enriched_preds = model.fit(enriched_train_features, train_target).predict(
    enriched_test_features
)
eval_metric(test_target.values, enriched_preds, "SMAPE")

[14.300508728887994]