# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, datetime
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

# Data path

In [None]:
# Both competition files live in the same Kaggle input directory.
DATA_DIR = '../input/tabular-playground-series-jan-2022'
train_path = DATA_DIR + '/train.csv'
test_path = DATA_DIR + '/test.csv'

# Load data

In [None]:
# Read the train and test CSVs the same way, using `row_id` as the index.
df, test_df = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (train_path, test_path)
)

# Take a glance

In [None]:
# Preview the first five training rows.
df.head(n=5)

In [None]:
# Plotting copy of the raw frame with `date` parsed into real timestamps,
# so the x-axis is a proper time axis.
df_date = df.copy()
df_date['date'] = pd.to_datetime(df_date['date'])

# Boolean masks that together pick out one (country, store, product) series.
country_finland = df['country'].eq('Finland')
store_mart = df['store'].eq('KaggleMart')
product_mug = df['product'].eq('Kaggle Mug')
condition = country_finland & store_mart & product_mug

# Wide canvas: the selected series is drawn across the full date range.
plt.figure(figsize=(30, 8))
sns.lineplot(data=df_date.loc[condition], x='date', y='num_sold')
plt.show()

# Feature engineering
- From the diagram above, we can see that the number of products sold increases sharply around the beginning of each year.
- Furthermore, it is reasonable to expect that the number of customers is higher at the weekend than on weekdays.
- For these reasons, I convert the `date` column into `year`, `month`, `day`, `weekday` and `weekend` features.

In [None]:
def convertToTime(x):
    """Convert a ``YYYY-MM-DD`` string into a POSIX timestamp (float seconds).

    The string is parsed into a naive ``datetime``; ``timestamp()`` interprets
    naive datetimes as local time, so the result depends on the machine's
    timezone.
    """
    parsed = datetime.strptime(x, '%Y-%m-%d')
    return parsed.timestamp()

def getDay(x):
    """Return the day-of-month (third ``-``-separated field) of ``x`` as an int."""
    fields = str(x).split('-')
    return int(fields[2])

def getMonth(x):
    """Return the month (second ``-``-separated field) of ``x`` as an int."""
    fields = str(x).split('-')
    return int(fields[1])

def getYear(x):
    """Return the year (first ``-``-separated field) of ``x`` as an int."""
    fields = str(x).split('-')
    return int(fields[0])

def getWeekday(x):
    """Return the weekday of a ``YYYY-MM-DD`` string (Monday=0 ... Sunday=6)."""
    parsed = datetime.strptime(x, '%Y-%m-%d')
    return int(parsed.weekday())

def add_date_features(frame):
    """Return ``frame`` with its ``date`` column expanded into calendar features.

    The ``date`` column (``YYYY-MM-DD`` strings) is parsed once, in a
    vectorised way, and replaced by five integer columns appended in this
    order: ``year``, ``month``, ``day``, ``weekday`` (Monday=0 ... Sunday=6)
    and ``weekend`` (1 when ``weekday`` > 4, i.e. Saturday/Sunday, else 0).

    The input frame is not mutated; a new frame is returned. If ``date`` is
    no longer present (the cell was already run), the frame is returned
    unchanged so re-running the cell does not raise.
    """
    if 'date' not in frame.columns:
        return frame

    parsed = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    # The `.dt` accessors may yield int32; cast to int64 so the dtypes match
    # what the row-wise `int(...)` helpers produce.
    weekday = parsed.dt.weekday.astype('int64')
    return (
        frame
        .assign(
            year=parsed.dt.year.astype('int64'),
            month=parsed.dt.month.astype('int64'),
            day=parsed.dt.day.astype('int64'),
            weekday=weekday,
            weekend=(weekday > 4).astype(int),
        )
        .drop(columns='date')
    )


# Apply the identical transformation to train and test so their feature
# columns cannot drift apart.
df = add_date_features(df)
test_df = add_date_features(test_df)

df.head()

# Split dataset into train and validation

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (`num_sold`) from the feature columns.
X = df.drop(columns="num_sold")
y = df['num_sold']

# Hold out 20% of the rows for validation. The seed is fixed so the split,
# and every metric computed from it, is the same on each Restart & Run All.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column types

In [None]:
banner = "*" * 20

# Sort every training feature into a categorical or numeric bucket by dtype.
cat_col = []
num_col = []
for name, kind in X_train.dtypes.items():
    if kind in ['object']:
        cat_col.append(name)
    if kind in ['float64', 'int64']:
        num_col.append(name)

print(banner)
print("Columns:")
print("Category:", cat_col)
print("Numeric:", num_col)
print(banner)

# List the distinct levels of each categorical feature.
print("Unique values")
for col in cat_col:
    print(col, X_train[col].unique())

# Missing values

In [None]:
# Count the nulls in each training feature column.
separator = "*" * 20
missing_per_column = X_train.isnull().sum()

print(separator)
print("Missing value:")
print(missing_per_column)
print(separator)

# Standardize and encode

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

standarizer = StandardScaler()
scaler = MinMaxScaler()  # instantiated but not applied anywhere in this cell

# Date-derived features that get zero-mean / unit-variance scaling.
standard_col = ['year', 'month', 'day', 'weekday', 'weekend']

# Work on explicit copies so the column assignments below operate on
# independent frames (avoids pandas' SettingWithCopyWarning on split outputs).
X_train = X_train.copy()
X_valid = X_valid.copy()

if standard_col:
    # Fit the scaling statistics on the training rows only, then reuse them
    # for validation so no validation information leaks into preprocessing.
    X_train[standard_col] = standarizer.fit_transform(X_train[standard_col])
    X_valid[standard_col] = standarizer.transform(X_valid[standard_col])

X_train = pd.get_dummies(X_train)
# Encoding validation independently can produce a different column set if a
# category level is missing from (or only present in) the validation rows.
# Reindex onto the training layout: absent dummies become 0, extras are
# dropped, and the column order matches what the model is fitted on.
X_valid = pd.get_dummies(X_valid).reindex(columns=X_train.columns, fill_value=0)

X_train.head()

# Create machine learning model

In [None]:
from xgboost import XGBRegressor

# Baseline regressor built with the library's default hyperparameters.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

# Evaluate with MAE

In [None]:
from sklearn.metrics import mean_absolute_error

valid_predicts = model.predict(X_valid)
# MAE is symmetric in its two arguments; they are passed here in the
# documented (y_true, y_pred) order.
valid_mae = mean_absolute_error(y_valid, valid_predicts)
print(valid_mae)

# Fine-tune with grid search and cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

def create_pipeline(n_est):
    """Build an unfitted preprocessing + XGBoost regression pipeline.

    Columns listed in the notebook-level ``standard_col`` are median-imputed
    and standardised; columns listed in the notebook-level ``cat_col`` are
    imputed with the most frequent value and one-hot encoded (unknown levels
    are ignored at transform time).

    Parameters
    ----------
    n_est : int
        Passed to ``XGBRegressor`` as ``n_estimators``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'preprocess'`` (ColumnTransformer) and ``'model'``.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("standard", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown='ignore')),
    ]
    # Transformer names are kept exactly as written: they are runtime strings
    # that form part of the pipeline's parameter keys.
    column_routing = [
        ('numeric', Pipeline(steps=numeric_steps), standard_col),
        ('catergory', Pipeline(steps=categorical_steps), cat_col),
    ]
    return Pipeline(steps=[
        ('preprocess', ColumnTransformer(transformers=column_routing)),
        ('model', XGBRegressor(n_estimators=n_est)),
    ])

def test_score(n_est):
    """Return the mean 5-fold cross-validated MAE for ``n_est`` estimators.

    A fresh pipeline from ``create_pipeline(n_est)`` is scored on the
    notebook-level ``X`` and ``y``.
    """
    fold_scores = cross_val_score(
        create_pipeline(n_est),
        X,
        y,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    # The scorer reports negated MAE; flip the sign to get a positive error.
    return -fold_scores.mean()

# Candidate n_estimators values: 10, 20, ..., 200.
n_est = [10 * step for step in range(1, 21)]
scores = []

# Score each candidate, printing progress on a single line per candidate.
for est in n_est:
    print("Testing:", est, end=' - ')
    cv_mae = test_score(est)
    scores.append(cv_mae)
    print(cv_mae)

# Cross-validated MAE as a function of n_estimators.
plt.plot(n_est, scores)
plt.show()

# Create submission

In [None]:
# Refit on all training rows using the n_estimators value with the lowest
# cross-validated MAE from the sweep.
best_n_est = n_est[np.argmin(scores)]
final_model = create_pipeline(best_n_est)
final_model.fit(X, y)
test_predict = final_model.predict(test_df)

# Submission layout: one row per test `row_id` with its predicted `num_sold`.
submission_columns = {"row_id": test_df.index, 'num_sold': test_predict}
result = pd.DataFrame(submission_columns)
print(result.head())
result.to_csv('submission.csv', index=False)