In [None]:
# This notebook compares M-estimate encoding ("m-encoding") with simple integer encoding of the
# categorical features in the January 2022 Tabular Playground Series competition.
# Simple encoding gets a slightly better cross-validation score, but m-encoding gets a better competition score.
# This is only a first submission; the next step is more elaborate feature engineering.

In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from datetime import datetime

In [None]:
# Read the competition train and test files, using the row_id column as the index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
    )
)

train.head()

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Parse the year-month-day date strings of both frames into datetime values (in place).
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')

train

**Feature Engineering Based on the Date**

In [None]:
# Lookup tables that turn English day / month names into zero-based integer codes.
dayOfWeek = {
    day: code
    for code, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}
months = {
    month: code
    for code, month in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June', 'July',
         'August', 'September', 'October', 'November', 'December']
    )
}

# Day codes flagged by the is_wknd feature.
# NOTE(review): code 4 is Friday, so Fridays are flagged together with Saturday and
# Sunday -- confirm this is intentional.
weekend_codes = [4, 5, 6]

# Add the same three date-derived columns to both frames (in place).
for frame in (train, test):
    frame['day_of_week'] = frame['date'].dt.day_name().map(dayOfWeek)
    frame['month'] = frame['date'].dt.month_name().map(months)
    frame['is_wknd'] = frame['day_of_week'].isin(weekend_codes).astype('int64')

In [None]:
# The date has been expanded into numeric features above, so remove the raw column from both frames.
train, test = (frame.drop(columns=['date']) for frame in (train, test))

train.info()

In [None]:
# Display the training frame after the date column has been replaced by the derived features.
train

In [None]:
# Display the test frame after the date column has been replaced by the derived features.
test

**M-Encoding**

In [None]:
from category_encoders import MEstimateEncoder

In [None]:
# Reserve a reproducible 20% sample of the training rows for fitting the encoder only.
X_encode = train.sample(frac=0.2, random_state=0)

# Every row that was not sampled above is kept for training the model.
X_pretrain = train.drop(index=X_encode.index)

# Separate the target from each part; pop() also removes 'num_sold' from the feature frames.
y_encode = X_encode.pop('num_sold')
y_train = X_pretrain.pop('num_sold')

In [None]:
# String-valued categorical columns handled by the M-estimate encoder.
categorical_cols = ['country', 'store', 'product']

# Fit the encoder on the held-out encoding split only, so the rows used to train
# the model are never used to fit the encoder.
encoder = MEstimateEncoder(cols=categorical_cols, m=0.5)
encoder.fit(X_encode, y_encode)

# Apply the fitted encoding to the training split.
X_train = encoder.transform(X_pretrain)

In [None]:
m_model = XGBRegressor(n_estimators=250, learning_rate=0.02, random_state=0)

# The scorer returns negated RMSE values, so flip the sign to get one positive RMSE per fold.
m_neg_rmse = cross_val_score(
    m_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
)
m_scores = -m_neg_rmse

print(m_scores)
print('m encoding scores: ', m_scores.mean())

In [None]:
# Encode the test features with the encoder fitted on the encoding split.
m_test = encoder.transform(test)

# Fit on the full training split, then predict num_sold for the encoded test rows.
m_preds = m_model.fit(X_train, y_train).predict(m_test)

In [None]:
# Build the submission from the row ids carried by the test frame's index (the test
# file is read with index_col='row_id'), instead of a hard-coded id range that would
# silently mislabel rows or fail if the test file changed.
m_output = pd.DataFrame({'row_id': test.index,
                         'num_sold': m_preds})

m_output.to_csv('m_encode_submission.csv', index=False)

**Simple Encoding**

In [None]:
# Integer code assigned to every known category of each categorical column.
simple_codes = {
    "product": {"Kaggle Mug": 0, "Kaggle Hat": 1, "Kaggle Sticker": 2},
    "store": {"KaggleMart": 0, "KaggleRama": 1},
    "country": {"Finland": 0, "Norway": 1, "Sweden": 2},
}

# Apply the same codes to both frames (in place).
# - The lookup falls back to the value itself, so re-running this cell on already
#   encoded columns leaves them unchanged.
# - pd.to_numeric makes the numeric dtype explicit instead of relying on the silent
#   object -> int downcast of Series.replace (deprecated in pandas 2.2), and raises
#   if a category without a code is still present as a string.
for frame in (train, test):
    for column, codes in simple_codes.items():
        encoded = frame[column].map(lambda value: codes.get(value, value))
        frame[column] = pd.to_numeric(encoded)

In [None]:
# Features are every column except the target; the target is the number of items sold.
X = train.drop(columns=['num_sold'])
y = train['num_sold']

In [None]:
s_model = XGBRegressor(n_estimators=175, learning_rate=0.2, random_state=0)

# The scorer returns negated RMSE values, so flip the sign to get one positive RMSE per fold.
s_neg_rmse = cross_val_score(
    s_model, X, y, cv=5, scoring='neg_root_mean_squared_error'
)
s_scores = -s_neg_rmse

print(s_scores.mean())

In [None]:
# Fit on all training rows, then predict num_sold for the integer-encoded test rows.
s_preds = s_model.fit(X, y).predict(test)

In [None]:
# Build the submission from the row ids carried by the test frame's index (the test
# file is read with index_col='row_id'), instead of a hard-coded id range that would
# silently mislabel rows or fail if the test file changed.
s_output = pd.DataFrame({'row_id': test.index,
                         'num_sold': s_preds})

s_output.to_csv('simple_submission.csv', index=False)