In [None]:
!pip install fitter ./input/fitter-1-71/fitter-1.7.1-py3-none-any.whl -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

from fitter import Fitter
from scipy import stats

# Shared experiment settings read by the cross-validation cells.
SEED = 42                    # random seed for the fold splitter and both models
n_splits = 5                 # number of K-fold partitions
n_estimators = 200           # boosting rounds / iterations given to each model
early_stopping_rounds = 100  # patience used for early stopping in both models

In [None]:
# Read the three competition CSVs; `data` holds the sample-submission file.
train_data = pd.read_csv(r"./input/playground-series-s5e1/train.csv")
test_data = pd.read_csv(r"./input/playground-series-s5e1/test.csv")
data = pd.read_csv(r"./input/playground-series-s5e1/sample_submission.csv")

# Report (rows, columns) for each frame that was just loaded.
for label, frame in (
    ("train_data shape :", train_data),
    ("test_data shape :", test_data),
    ("data shape :", data),
):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Summary statistics for the training frame's numeric columns.
train_data.describe()

In [None]:
# Count missing values per training column, largest count first.
train_data.isna().sum().sort_values(ascending=False)

In [None]:
# Number of training rows for each distinct value of `country`.
train_data['country'].value_counts()

In [None]:
# Remove fully duplicated rows first, then every row that still has a missing value.
train_data = train_data.drop_duplicates().dropna()
print("train_data shape :", train_data.shape)

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Count missing values per test column, largest count first.
test_data.isna().sum().sort_values(ascending=False)

In [None]:
# Expand the `date` column of both frames into calendar features, then drop it.
# The same steps are applied to train and test so their feature columns stay aligned.
for frame in (train_data, test_data):
    frame['date'] = pd.to_datetime(frame['date'])
    date_parts = frame['date'].dt

    frame['Year'] = date_parts.year
    frame['Month'] = date_parts.month
    frame['Day'] = date_parts.day
    frame['day_of_week'] = date_parts.day_name()  # weekday name as a string

    # The raw timestamp is no longer needed once the parts are extracted.
    frame.drop('date', axis=1, inplace=True)

In [None]:
# Fit several candidate distributions to the untransformed target and show
# the comparison produced by fitter.
f = Fitter(
    train_data['num_sold'],
    distributions=['norm', 'lognorm', 'laplace', 'expon', 'gamma'],
)
f.fit()
print('Distribution of original training data:')
f.summary()

# Keep the parameters fitted for the log-normal candidate.
lognorm_params = f.fitted_param['lognorm']

In [None]:
# Replace the target with its natural log; the ensemble cell applies np.exp to
# map predictions back to the original scale.
# Re-running this cell without reloading train_data would apply the log a second time.
train_data['num_sold'] = np.log(train_data['num_sold'])

In [None]:
# The row identifier is not a feature, so remove it from the training frame.
train_data = train_data.drop(columns=['id'])

# Split the training columns by dtype: non-object columns (minus the target) vs. object columns.
train_non_object = train_data.select_dtypes(exclude=['object']).columns
num_cols = train_non_object.difference(['num_sold']).tolist()
cat_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# Same split for the test frame, which still carries its `id` column.
test_non_object = test_data.select_dtypes(exclude=['object']).columns
num_cols_test = test_non_object.difference(['id']).tolist()
cat_cols_test = test_data.select_dtypes(include=['object']).columns.tolist()

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept in a dict keyed by column name.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    label_encoders[col] = encoder
    # Learn the mapping on the training values, then reuse it for the test values.
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])
    

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and target for training, plus the test features without `id`.
X = train_data.drop(columns=['num_sold'])
y = train_data['num_sold']
test = test_data.drop(columns=['id'])

# Split the training data into a training set and a 20% hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# MAPE metric helper
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments unchanged to
    ``mean_absolute_percentage_error``.
    """
    error = mean_absolute_percentage_error(y_true, y_pred)
    return error

In [None]:
# K-fold cross-validation of LightGBM on the log-transformed target.
# Each fold contributes one validation score and one set of test predictions.
kf = KFold(n_splits, shuffle=True, random_state=SEED)
kf_splits = kf.split(X)
scores1 = []      # per-fold validation MAPE, measured on the original num_sold scale
test_preds1 = []  # per-fold test predictions, still on the log scale

lgbm_params1 = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': n_estimators,
    'learning_rate': 0.08,
    'max_depth': 6,
    'reg_alpha': 1,
    'lambda_l2': 5,
    'subsample': 1.0,
    'seed': SEED,
    'verbose': -1,
    'device': 'cpu'
}

# NOTE(review): the columns in cat_cols are label-encoded integers and no
# categorical_feature argument is passed to fit(), so LightGBM presumably treats
# them as ordinary numeric features -- confirm whether that is intended. The
# lgb.Dataset objects that used to be built here were never consumed and have
# been removed.
for i, (train_idx, val_idx) in enumerate(kf_splits):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    callbacks = [lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)]
    model = lgb.LGBMRegressor(**lgbm_params1)
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        eval_metric='mape',
        callbacks=callbacks,
    )

    val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration_)
    # y holds log(num_sold), so undo the log on both sides before scoring;
    # otherwise the reported number is the MAPE of the logs, not of num_sold.
    score = mape(np.exp(y_val_fold), np.exp(val_pred))
    scores1.append(score)

    # Flooring the log-scale prediction at 0 corresponds to num_sold >= 1 after np.exp.
    test_pred = np.maximum(model.predict(test, num_iteration=model.best_iteration_), 0)
    test_preds1.append(test_pred)

    print(f'LightGBM Fold {i + 1} mape: {score}')

print(f'LightGBM mape: {np.mean(scores1):.4f};')

In [None]:
# K-fold cross-validation of CatBoost on the log-transformed target.
# Each fold contributes one validation score and one set of test predictions.
kf = KFold(n_splits, shuffle=True, random_state=SEED)
kf_splits = kf.split(X)
scores2 = []      # per-fold validation MAPE, measured on the original num_sold scale
test_preds2 = []  # per-fold test predictions, still on the log scale

catboost_params2 = {
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'learning_rate': 0.08,
    'iterations': n_estimators,
    'depth': 6,
    'l2_leaf_reg': 8,
    'min_data_in_leaf': 2,
    'random_seed': SEED,
    'verbose': False,
    'task_type': 'CPU'
}

# The test pool does not change between folds, so it is built once up front.
X_test_pool = Pool(test, cat_features=cat_cols)

for i, (train_idx, val_idx) in enumerate(kf_splits):
    model = CatBoostRegressor(**catboost_params2)
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_cols)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_cols)
    model.fit(
        X=X_train_pool,
        eval_set=X_valid_pool,
        verbose=False,
        early_stopping_rounds=early_stopping_rounds,
    )

    val_pred = model.predict(X_valid_pool)
    # y holds log(num_sold), so undo the log on both sides before scoring;
    # otherwise the reported number is the MAPE of the logs, not of num_sold.
    score = mape(np.exp(y_val_fold), np.exp(val_pred))
    scores2.append(score)

    # Flooring the log-scale prediction at 0 corresponds to num_sold >= 1 after np.exp.
    test_pred = np.maximum(model.predict(X_test_pool), 0)
    test_preds2.append(test_pred)
    print(f'CatBoost Fold {i + 1} mape: {score}')

print(f'CatBoost mape: {np.mean(scores2):.4f};')

In [None]:
# Average each model's per-fold test predictions, then average the two models.
lgbm_fold_mean = np.mean(test_preds1, axis=0)
catboost_fold_mean = np.mean(test_preds2, axis=0)
y_preds = (lgbm_fold_mean + catboost_fold_mean) / 2

# The models were trained on log(num_sold): exponentiate, then bound the result to [5, 5939].
y_preds = np.clip(np.exp(y_preds), 5, 5939)

print('predict mean:', y_preds.mean())
print('predict median', np.median(y_preds))

In [None]:
# Fit the same candidate distributions to the final predictions and show
# the comparison produced by fitter.
f = Fitter(
    y_preds,
    distributions=['norm', 'lognorm', 'laplace', 'expon', 'gamma'],
)
f.fit()
print('Distribution of predicting data:')
f.summary()

# Keep the parameters fitted for the normal candidate.
norm_params = f.fitted_param['norm']

In [None]:
# Save predictions for submission
submission = pd.DataFrame({'id': test_data['id'], 'num_sold': y_preds})
submission.to_csv('submission.csv', index=False)
print(submission.head())