In [1]:
import pandas as pd 
import numpy as np
import datetime
import holidays
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv("../input/tabular-playground-series-sep-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-sep-2022/test.csv")
sample_sub = pd.read_csv("../input/tabular-playground-series-sep-2022/sample_submission.csv")

See EDA in my other [notebook](https://www.kaggle.com/code/robertturro/plotly-visuals-tps-sep-2022)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70128 entries, 0 to 70127
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   row_id    70128 non-null  int64 
 1   date      70128 non-null  object
 2   country   70128 non-null  object
 3   store     70128 non-null  object
 4   product   70128 non-null  object
 5   num_sold  70128 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 3.2+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   row_id   17520 non-null  int64 
 1   date     17520 non-null  object
 2   country  17520 non-null  object
 3   store    17520 non-null  object
 4   product  17520 non-null  object
dtypes: int64(1), object(4)
memory usage: 684.5+ KB


In [5]:
def transform_data(Data):
    data = Data.copy()
    data['parsedDate'] = pd.to_datetime(data['date'],format="%Y-%m-%d")
    data['Day'] = data['parsedDate'].dt.day
    data['Month'] = data['parsedDate'].dt.month
    data['Year'] = data['parsedDate'].dt.year
    data['Weekday'] = data['parsedDate'].dt.weekday
    data['DayOfYear'] = data['parsedDate'].dt.dayofyear
    data["month_sin"] = np.sin(data['Month']*(2*np.pi/12))
    data["month_cos"] = np.cos(data['Month']*(2*np.pi/12))
    
    for d in range(0,7):
        data['day_'+str(d)] = 0
        data.loc[data['Weekday']==d,'day_'+str(d)] =1
    
    data["holiday"] = data["DayOfYear"].apply(lambda x: x if x in [1,2,3,4,5,6,7,8,125,126,360,361,362,363,364,365] else 0)
    data = pd.get_dummies(data,columns=['holiday'],drop_first=True)
    
    # Remove abnormal data during COVID, as suggested by Cabaxiom
    data = data.loc[~((data["date"] >= "2020-03-01") & (data["date"] < "2020-06-01"))]
    
    data = data.drop(columns=["parsedDate",'Month'])
    data = data.set_index("date")
    data.index = pd.to_datetime(data.index)
    
    encoder = OrdinalEncoder()
    for col in data.columns:
        if data[col].dtype == "object" and col != "type" and col != "date":
            data[col] = encoder.fit_transform(data[[col]])
    
    data = data.drop(columns=["row_id"])
    

    return data

In [6]:
train_data = transform_data(train)
test_data = transform_data(test)

In [7]:
train_data.head()

Unnamed: 0_level_0,country,store,product,num_sold,Day,Year,Weekday,DayOfYear,month_sin,month_cos,...,holiday_7,holiday_8,holiday_125,holiday_126,holiday_360,holiday_361,holiday_362,holiday_363,holiday_364,holiday_365
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,0.0,0.0,0.0,663,1,2017,6,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0
2017-01-01,0.0,0.0,1.0,615,1,2017,6,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0
2017-01-01,0.0,0.0,2.0,480,1,2017,6,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0
2017-01-01,0.0,0.0,3.0,710,1,2017,6,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0
2017-01-01,0.0,1.0,0.0,240,1,2017,6,1,0.5,0.866025,...,0,0,0,0,0,0,0,0,0,0


In [8]:
y_train = train_data['num_sold']
train_data = train_data.drop(['num_sold'],axis=1)

In [9]:
def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.nanmean(diff)

In [10]:
def get_params(df):
    mini = min(df['mean_test_score'])
    for row in range(len(df)):
        if df.iloc[row]['mean_test_score'] == mini:
            return df.iloc[row]['params']

In [11]:
def find_model(train,y_train):
    scores = {}
    output = []
    
    rf = RandomizedSearchCV(RandomForestRegressor(),{
    'n_estimators' : [500,1000]
    }, scoring = 'neg_mean_absolute_error',return_train_score=False)

    rf.fit(train,y_train)

    rf_df = pd.DataFrame(rf.cv_results_)
    rf_score = max(rf_df['mean_test_score'])
    scores['rf'] = rf_score
    
    ada = RandomizedSearchCV(AdaBoostRegressor(),{
    'n_estimators' : [500,1000],
    }, scoring = 'neg_mean_absolute_error',return_train_score=False)

    ada.fit(train,y_train)
    
    new_df_ada = pd.DataFrame(ada.cv_results_)
    ada_score = max(new_df_ada['mean_test_score'])
    scores['ada'] = ada_score
    
    cat = RandomizedSearchCV(CatBoostRegressor(verbose=0),{
    'n_estimators' : [500,1000]
    
    }, scoring = 'neg_mean_absolute_error',return_train_score=False)
    
    cat.fit(train,y_train,verbose=None)

    new_df_cat = pd.DataFrame(cat.cv_results_)
    cat_score = max(new_df_cat['mean_test_score'])
    scores['cat'] = cat_score
    
    gb = RandomizedSearchCV(lgb.LGBMRegressor(objective='regression',metric='L2',boosting_type='gbdt'),{
        'n_estimators' : [500,1000]
    
    },scoring = 'neg_mean_absolute_error',return_train_score=False)
        
    gb.fit(train,y_train)

    gb_df = pd.DataFrame(gb.cv_results_)
    gb_score = max(gb_df['mean_test_score'])
    scores['gb'] = gb_score
        
    xgb = RandomizedSearchCV(XGBRegressor(),{
    'n_estimators' : [500,1000]
    
    }, scoring = 'neg_mean_absolute_error',return_train_score=False)
    
    xgb.fit(train,y_train,verbose=None)

    new_df_xgb = pd.DataFrame(xgb.cv_results_)
    xgb_score = max(new_df_xgb['mean_test_score'])
    scores['xgb'] = xgb_score
        

    max_val = list(scores.values())
    max_ke = list(scores.keys())
    best_model = max_ke[max_val.index(max(max_val))]
    output.append(best_model)
    
    if best_model == 'cat':
        output.append(get_params(new_df_cat))
    if best_model == 'ada':
        output.append(get_params(new_df_ada))
    if best_model == 'rf':
        output.append(get_params(rf_df))
    if best_model == 'gb':
        output.append(get_params(gb_df))
    if best_model == 'xgb':
        output.append(get_params(new_df_xgb))
    
    
    return output

In [12]:
best_model = find_model(train_data,y_train)



In [13]:
model_name = best_model[0]
parameters = best_model[1]

In [14]:
print(model_name)
print(parameters)

gb
{'n_estimators': 500}


In [15]:
def make_model(model_name,parameters,train,y_train):
    if model_name == 'cat':
        model = CatBoostRegressor(**parameters)
        model.fit(train,y_train)
        
    if model_name == 'ada':
        model = AdaBoostRegressor(**parameters)
        model.fit(train,y_train)
        
    if model_name == 'rf':
        model = RandomForestRegressor(**parameters)
        model.fit(train,y_train)
        
    if model_name == 'gb':
        model = lgb.LGBMRegressor(**parameters)
        model.fit(train,y_train)
        
    if model_name == 'xgb':
        model = XGBRegressor(**parameters)
        model.fit(train,y_train)
        
    return model

In [16]:
model = make_model(model_name,parameters,train_data,y_train)
predictions = model.predict(test_data)

In [17]:
sample_sub['num_sold'] = predictions

In [18]:
predictions.shape

(17520,)

In [19]:
test_data.shape

(17520, 32)

In [20]:
sample_sub.shape

(17520, 2)

In [21]:
sample_sub.to_csv('submission.csv',index=False)