In [None]:
import numpy as np 
import pandas as pd 
from category_encoders import MEstimateEncoder
from category_encoders import TargetEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from copy import deepcopy

from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer


from path import Path
import warnings 
warnings.filterwarnings('ignore') # supress warnings

In [None]:
def evaluate_model(model_name, model_pipeline, _X, _y, features, original_data=None, use_original=False, n_splits=5, random_state_list=[0, 5, 10], verbose=True):
    """Cross-validate ``model_pipeline`` over several seeds and collect out-of-fold predictions.

    For every seed in ``random_state_list`` the data is split with ``Splitter``
    into ``n_splits`` folds. The pipeline is refitted on each training fold, a
    deep copy of the fitted pipeline is stored, and the validation predictions
    are written into that seed's out-of-fold row.

    Parameters
    ----------
    model_name : str
        Label used only in the progress messages.
    model_pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in place on every fold.
    _X : pandas.DataFrame
        Feature matrix; rows are selected positionally by the splitter.
    _y : pandas.Series
        Target values, passed to the model and the metric unchanged.
    features : list
        Not used by this function; kept so existing call sites keep working.
    original_data : pandas.DataFrame, optional
        Extra rows holding the feature columns plus a raw ``'cost'`` column.
    use_original : bool
        When True, ``original_data`` is appended to every training fold (never to a
        validation fold) with ``log(cost)`` as its target.
    n_splits : int
        Number of folds per seed.
    random_state_list : sequence of int
        One full cross-validation run is performed per seed. The default list is never mutated.
    verbose : bool
        Print a banner per seed and the validation score of every fold.

    Returns
    -------
    tuple
        ``(fitted_pipelines, oof_mean, mean_train_score, oof_score)`` where
        ``oof_mean`` is the out-of-fold prediction averaged over seeds and
        ``oof_score`` is ``mean_squared_log_error(_y, oof_mean)``.

    Raises
    ------
    ValueError
        If ``use_original`` is True but ``original_data`` is None.
    """
    if use_original and original_data is None:
        raise ValueError("original_data must be provided when use_original=True")

    oof_preds = np.zeros((len(random_state_list), len(_y)))
    models_pipeline = []
    scores_train = []

    if use_original:
        # Loop-invariant: the extra rows are identical for every seed and fold.
        target = 'cost'
        extra_X = original_data.drop(target, axis=1)
        extra_y = np.log(original_data[target])

    for index, random_state in enumerate(random_state_list):
        if verbose:
            print("#"*25)
            print("#"*15, f"training model {model_name} with seed {random_state}")
            print("#"*25)
        splitter = Splitter(n_splits=n_splits)
        folds = splitter.split_data(_X, _y, random_state)
        for split_no, (X_train, X_val, y_train, y_val, train_idx, val_idx) in enumerate(folds, start=1):
            if use_original:
                # Original rows are used for training only, never for validation.
                X_train = pd.concat([X_train, extra_X])
                y_train = pd.concat([y_train, extra_y])

            model_pipeline.fit(X_train, y_train)
            # Predict the validation fold once and reuse it for both OOF storage and scoring.
            val_pred = np.ravel(model_pipeline.predict(X_val))
            oof_preds[index, val_idx] = val_pred
            # Snapshot the fitted state; the same pipeline object is refitted on the next fold.
            models_pipeline.append(deepcopy(model_pipeline))

            scores_train.append(mean_squared_log_error(y_train, model_pipeline.predict(X_train)))

            if verbose:
                score_valid_split = mean_squared_log_error(y_val, val_pred)
                print(f"seed {random_state} and split {split_no} score {score_valid_split}")

    oof_preds_mean = oof_preds.mean(axis=0)

    return models_pipeline, oof_preds_mean, np.mean(scores_train), mean_squared_log_error(_y, oof_preds_mean)


def predict_test(models_pipeline, X_test, n_splits=5, n_repeats=3):
    """Average the predictions of several fitted models on ``X_test``.

    Parameters
    ----------
    models_pipeline : iterable
        Fitted objects exposing ``predict(X)``.
    X_test : pandas.DataFrame
        Rows to predict; each model receives its own copy.
    n_splits, n_repeats : int
        Ignored. Kept only for backward compatibility: the buffer is sized from
        the number of models actually supplied, so a mismatch can neither dilute
        the mean with all-zero rows nor overflow the buffer.

    Returns
    -------
    numpy.ndarray
        One value per row of ``X_test``: the mean prediction across models.

    Raises
    ------
    ValueError
        If ``models_pipeline`` is empty.
    """
    models_pipeline = list(models_pipeline)
    if not models_pipeline:
        raise ValueError("models_pipeline must contain at least one fitted model")

    test_preds = np.zeros((len(models_pipeline), len(X_test)))

    for index, model_pipeline in enumerate(models_pipeline):
        # Flatten so that (n,) and (n, 1) outputs are both accepted.
        test_preds[index] = np.ravel(model_pipeline.predict(X_test.copy()))

    return test_preds.mean(axis=0)


def plot_importance(models, X_test, title=""):
    """Draw a bar chart of feature importances pooled over ``models`` and return the pooled table.

    Each model must expose ``feature_importances_``; the values are paired
    positionally with the column names of ``X_test``. The returned frame has
    one row per (model, feature) with columns ``importance`` and ``features``,
    sorted by importance in descending order.
    """
    # adapted from https://www.kaggle.com/code/shoabahamed/ps3e9-eda-and-gbdt-catboost-median-duplicatedata/edit
    column_names = pd.Series(X_test.columns.tolist())

    # Start from an empty frame so the pooled result is built exactly as a running concat would build it.
    per_model_frames = [pd.DataFrame()]
    for fitted in models:
        ranked = (
            pd.DataFrame({'importance': fitted.feature_importances_})
            .assign(features=column_names)
            .sort_values(by='importance', ascending=False)
        )
        per_model_frames.append(ranked)

    feature_importance = pd.concat(per_model_frames).sort_values('importance', ascending=False)

    plt.figure(figsize=(16, 10))
    ax = sns.barplot(data=feature_importance, x='importance', y='features', color='skyblue', errorbar='sd')

    # Annotate every bar with its value.
    for container in ax.containers:
        ax.bar_label(container)

    ax.set_xlabel('Importance', fontsize=14)
    ax.set_ylabel('Feature', fontsize=14)
    ax.set_title(f"{title} Feature Importances", fontsize=18)
    ax.grid(True, axis='x')
    plt.show()

    return feature_importance



def mean_squared_log_error(y_true_log, y_pred_log):
    """Return the root-mean-squared error between two sets of values.

    Despite the name, no logarithm is taken here: both arguments are expected to
    be log-transformed already, so for 1-D inputs the result is the RMSLE of the
    underlying raw values.
    """
    # ``squared=False`` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the root explicitly works on every version.
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))


class Splitter:
    """Split ``X`` and ``y`` into train/validation parts for a given random state.

    ``split_data`` is a generator. Every item it yields is the 6-tuple
    ``(X_train, X_val, y_train, y_val, train_idx, val_idx)`` where the index
    arrays are positional (``iloc``) indices into ``X`` / ``y``.

    Code from https://www.kaggle.com/code/tetsutani/ps3e9-eda-and-gbdt-catboost-median-duplicatedata
    with a few modifications.
    """

    def __init__(self, test_size=0.2, kfold=True, n_splits=5):
        self.test_size = test_size  # hold-out fraction, used only when kfold is False
        self.kfold = kfold  # True: shuffled K-fold; False: a single hold-out split
        self.n_splits = n_splits  # number of folds, used only when kfold is True

    def split_data(self, X, y, random_state):
        """Yield ``(X_train, X_val, y_train, y_val, train_idx, val_idx)`` tuples.

        With ``kfold=True`` one tuple is yielded per fold of a shuffled ``KFold``;
        otherwise a single hold-out split of size ``test_size`` is yielded.
        ``X`` and ``y`` must support ``.iloc``.
        """
        if self.kfold:
            kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
            index_pairs = kf.split(X, y)
        else:
            # Imported here because the notebook's import cell only brings in KFold.
            from sklearn.model_selection import train_test_split

            # Split positions rather than the data itself so the hold-out branch
            # yields the same 6-tuple shape as the K-fold branch.
            train_idx, val_idx = train_test_split(
                np.arange(len(X)), test_size=self.test_size, random_state=random_state
            )
            index_pairs = [(train_idx, val_idx)]

        for train_idx, val_idx in index_pairs:
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            yield X_train, X_val, y_train, y_val, train_idx, val_idx

            

def submission_csv(predictions, target='cost'):
    """Build a submission frame from log-scale predictions.

    The ``id`` column is taken from the module-level ``test`` frame; the
    ``target`` column holds ``exp(predictions)``.
    """
    submission = pd.DataFrame().assign(id=test['id'])
    # Predictions are on the log scale, so map them back with exp.
    submission[target] = np.exp(predictions)
    return submission

In [None]:
# Load the competition files.
path = Path("/kaggle/input/playground-series-s3e11")
train, test, sub = (
    pd.read_csv(path / file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Load the original dataset that ships as two separate files.
original_train = pd.read_csv("/kaggle/input/media-campaign-cost-prediction/train_dataset.csv")
original_test = pd.read_csv("/kaggle/input/media-campaign-cost-prediction/test_dataset.csv")

# Stack both files and keep the same columns as train (everything except 'id').
original = pd.concat([original_train, original_test])[train.drop('id', axis=1).columns]

# Drop rows without a target value.
original = original.dropna(subset=['cost'])
# Drop rows whose feature values repeat an earlier row (the target is ignored in the comparison).
original = original[~original.drop(columns="cost").duplicated()]

In [None]:
# Tag each row with its provenance: False for the original dataset, True for the competition train set.
org = original.assign(generated=False)
train_temp = train.assign(generated=True)

# Competition rows come first, so keep='first' prefers them when the feature values repeat.
data = pd.concat([train_temp.drop(columns='id'), org])
is_repeat = data.drop(columns=['cost', 'generated']).duplicated(keep='first')
data = data[~is_repeat]

data.head()

In [None]:
from functools import partial
# Factory for the regressor: calling xgb(**kwargs) builds a new XGBRegressor with
# tree_method='gpu_hist' and random_state=0 already bound.
# NOTE(review): 'gpu_hist' presumably needs a GPU runtime — confirm before running on CPU.
xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)

In [None]:
def feature_engineer(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is left untouched, so passing a slice of another frame is
    safe and calling the function never changes what earlier cells displayed.

    Added columns, in order:

    - ``extra_attraction``: sum of ``florist``, ``video_store``, ``prepared_food`` and ``coffee_bar``
    - ``florist*video``: product of ``florist`` and ``video_store``
    - ``children``: ``total_children * num_children_at_home``
    - ``children*avg_cars_at home(approx).1``: ``children`` times ``avg_cars_at home(approx).1``
    - ``stays_home``: ``total_children / num_children_at_home`` with infinities
      replaced by 10 and NaN (0/0) replaced by 0
    - ``store_sqft_encode``: copy of ``store_sqft``
    """
    # Work on a copy: the original implementation wrote the new columns into the caller's frame.
    df = df.copy()

    df['extra_attraction'] = df['florist'] + df['video_store'] + df['prepared_food'] + df['coffee_bar']
    df['florist*video'] = df['florist'] * df['video_store']
    df['children'] = df['total_children'] * df['num_children_at_home']
    df['children*avg_cars_at home(approx).1'] = df['children'] * df['avg_cars_at home(approx).1']
    # Division by zero gives +/-inf (x/0) or NaN (0/0); map these to 10 and 0 respectively.
    ratio = df['total_children'] / df['num_children_at_home']
    df['stays_home'] = ratio.replace([np.inf, -np.inf], 10).fillna(0)
    df['store_sqft_encode'] = df['store_sqft'].copy()

    return df

In [None]:
# Feature matrix: the competition train set without the id, the target and seven unused columns.
X_encode = train.drop(columns=['id', 'cost', 'salad_bar', 'gross_weight', 'low_fat', 'recyclable_package', 'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)'])
# The model is trained on log(cost), so an RMSE on this scale is an RMSLE on the raw cost.
y_default = np.log(train['cost'])
features_default = X_encode.columns.tolist()
# Rows from the original dataset (generated == False), restricted to the same features plus the raw target.
add_data = data.loc[data['generated'] == False]
add_data = add_data[features_default + ['cost']]

# Apply the same engineered features to both frames, then refresh the feature list to include them.
X_encode = feature_engineer(X_encode)
add_data = feature_engineer(add_data)
features_default = X_encode.columns.tolist()


# M-estimate encode the copied store_sqft column, then fit an XGBRegressor with the default settings of `xgb`.
pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)), 
                        ('model', xgb())])


# 10 folds x 3 seeds; the original rows are appended to every training fold only.
_models_default, _oof_preds_default, mean_train_score_default, mean_valid_score_default = evaluate_model("XGBRegressor_default", pipeline, 
                                                                X_encode, y_default , features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10]) 

In [None]:
# Score of the seed-averaged out-of-fold predictions (bare expression so the notebook displays it).
mean_valid_score_default

In [None]:
# Prepare the test set like the training features: drop the id and the unused columns, then add the engineered features.
test_prep = feature_engineer(test.drop(columns=['id', 'salad_bar',  'gross_weight','low_fat', 'recyclable_package', 
                                                'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)']))

# Average the test predictions of every fitted fold pipeline (10 folds x 3 seeds).
preds = predict_test(_models_default, test_prep, n_splits=10, n_repeats=3)

In [None]:
# Convert the log-scale predictions into a submission frame (id + cost).
test_preds_df = submission_csv(preds)

In [None]:
# Write the submission file without the DataFrame index.
test_preds_df.to_csv("xgb_encoded_improved1.csv", index=False)

**Hyperparameter tuning**
The search space is taken from this notebook: https://www.kaggle.com/code/aonzahaha/ps3e11-xgboost-tuning-permutation-importance

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: sample XGBoost hyper-parameters and return the out-of-fold score.

    One 5-fold run with seed 0 is evaluated on the module-level ``X_encode`` /
    ``y_default`` data, with ``add_data`` appended to the training folds. The
    value returned is the fourth item produced by ``evaluate_model``.
    """
    search_space = {
        'verbosity': 0,
        'n_estimators': trial.suggest_int("n_estimators", 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-10, 2.0),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 1),
        'booster': trial.suggest_categorical("booster", ["dart", "gbtree", 'gblinear']),
        'sampling_method': trial.suggest_categorical('sampling_method', ['uniform', 'gradient_based']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    }

    encoder = MEstimateEncoder(cols=['store_sqft_encode'], m=10)
    regressor = xgb(**search_space)
    candidate = Pipeline(steps=[('encode', encoder), ('model', regressor)])

    # Only the final element (score of the out-of-fold predictions) is needed here.
    *_, oof_score = evaluate_model(
        "XGBRegressor_default",
        candidate,
        X_encode,
        y_default,
        features_default,
        use_original=True,
        original_data=add_data,
        n_splits=5,
        random_state_list=[0],
        verbose=False,
    )
    return oof_score

In [None]:
# Minimise the objective's score; n_trials=1 makes this a single-trial run.
study = optuna.create_study(direction= "minimize")
study.optimize(objective, n_trials= 1)
# Report the best trial found by this study and the parameters it sampled.
trial = study.best_trial
print("Best Score: ", trial.value)
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

In [None]:
# Hyper-parameters recorded from a search run with the number of trials set to 50
# (hard-coded here so the notebook does not need to repeat that search).
params =  {'n_estimators': 283, 'learning_rate': 0.1378863496122908, 'max_depth': 8, 'lambda': 0.29752719929425836,
          'alpha': 0.5281928441650384, 'colsample_bytree': 0.8712750461457782, 
          'min_child_weight': 1, 'booster': 'gbtree', 'sampling_method': 'gradient_based', 'grow_policy': 'lossguide'}

In [None]:
# Refit with the tuned hyper-parameters. The previous version built the model with
# xgb() and never passed `params`, so this run silently repeated the default-parameter model.
pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)),
                        ('model', xgb(**params))])


# Same protocol as the default run (10 folds x 3 seeds, original rows added to the training folds only)
# so the two scores are directly comparable.
_models_default, _oof_preds_default, mean_train_score_default, mean_valid_score_default = evaluate_model("XGBRegressor_tuned", pipeline,
                                                                X_encode, y_default , features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10])

In [None]:
# Score of the seed-averaged out-of-fold predictions from the most recent evaluate_model run.
print("best score: ", mean_valid_score_default)
# print("best score: ", 0.2962262969063931)

In [None]:
# Average the test predictions of the pipelines returned by the most recent evaluate_model run.
preds_opt = predict_test(_models_default, test_prep, n_splits=10, n_repeats=3)

In [None]:
# Build the submission frame from the log-scale predictions and write it without the index.
test_preds_df = submission_csv(preds_opt)
test_preds_df.to_csv("xgb_encoded_improved_opt_improved.csv", index=False)

**Rounding values**

In [None]:
# Back-transform the out-of-fold predictions to the raw cost scale and round them to one decimal.
# np.exp and np.round both return new arrays, so no defensive copies are needed.
preds = np.exp(_oof_preds_default)
preds_round = np.round(preds, 1)

score_plain = mean_squared_log_error(y_default, np.log(preds))
print("Score without rounding: ", score_plain)
# "Rounded" here means rounded down only: the element-wise minimum of the raw and rounded values.
score_floored = mean_squared_log_error(y_default, np.log(np.minimum(preds_round, preds)))
print("Score after rounding: ", score_floored)

In [None]:
# Round the test predictions to one decimal, but never upward: keep the smaller of the raw and rounded value.
test_preds_round_opt = np.minimum(test_preds_df['cost'], np.round(test_preds_df['cost'], 1))

In [None]:
# Overwrite the cost column with the rounded-down predictions and write a separate submission file.
test_preds_df['cost'] = test_preds_round_opt
test_preds_df.to_csv("xgb_encoded_improved_opt_improved_rounded.csv", index=False)

In [None]:
# preds_opt_round = np.round(_oof_preds_default, 2)

In [None]:
# mean_squared_log_error(y_default, _oof_preds_default)

In [None]:
# mean_squared_log_error(y_default, np.minimum(_oof_preds_default, preds_opt_round))

In [None]:
# mean_squared_log_error(y_default, preds_opt_round)

In [None]:
# sns.distplot(np.exp(_oof_preds_default))
# sns.distplot(np.exp(preds_opt_round))

# plt.legend(['pred', 'pred_round'])

In [None]:
# sns.histplot(test_preds_df['cost'])