In [1]:
import xgboost as xgb
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from datetime import datetime

from model_utils import PreProcess
from constants import Y_LABEL, X_LABELS

## Get train and test data

In [2]:
# Read each split and flag its rows with `is_training`, then stack both into
# one frame for the preprocessing step that follows.
train = pd.read_csv("../data/train.csv")
train["is_training"] = True

test = pd.read_csv("../data/test.csv")
test["is_training"] = False

# Plain concat: each frame keeps its own index labels, so labels may repeat.
train_test = pd.concat([train, test])


In [3]:
# Run the project's PreProcess over the stacked frame; the three listed
# columns are passed as `label_encoder_cols`.
p = PreProcess(
    data=train_test,
    label_encoder_cols=["country", "store", "product"],
)

# `train_test` unpacks into two parts, which replace the raw `train` / `test`.
# NOTE(review): presumably split back using the `is_training` flag -- confirm in model_utils.
train, test = p.train_test


# XGBoost model

In [4]:
# Hold out 20% of the processed training rows for validation; the fixed
# random_state keeps the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train[X_LABELS],
    train[Y_LABEL],
    test_size=0.2,
    random_state=42,
)


In [5]:
# Gradient-boosted regressor with 1000 boosting rounds, trained on the
# training portion of the split only (the validation rows are not used here).
model = xgb.XGBRegressor(
    n_estimators=1000,
)
model.fit(X_train, y_train)


In [6]:
# Score the model on the held-out validation rows.
y_pred = model.predict(X_val)

r2 = metrics.r2_score(y_true=y_val, y_pred=y_pred)
mse = metrics.mean_squared_error(y_true=y_val, y_pred=y_pred)

# Both metrics are rounded to two decimals for display.
print("the r2 is: ", round(r2, 2))
print("the mse is: ", round(mse, 2))


the r2 is:  0.97
the mse is:  543.26


# Test predictions

In [7]:
# Predict on the processed test rows, selecting the same feature columns
# (X_LABELS) that the model was trained on.
X_test = test[X_LABELS]
test_preds = model.predict(X_test)


In [8]:
# Fill the sample submission's `num_sold` column with the test predictions
# and write the result to a timestamped CSV next to the input data.
sub = pd.read_csv("../data/sample_submission.csv")
sub["num_sold"] = test_preds

# Format the timestamp explicitly: str(datetime.now()) contains a space and
# colons ("2024-01-01 12:34:56.789012"), which are invalid in Windows file
# names and awkward to handle in shells.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
sub.to_csv(f"../data/submission_{timestamp}.csv", index=False)
