In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# from reg_funcs import *

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, StratifiedKFold
from xgboost import XGBRegressor
from copy import deepcopy
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from functools import partial
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from category_encoders import MEstimateEncoder, TargetEncoder

import eli5
from eli5.sklearn import PermutationImportance
from sklearn.metrics import make_scorer
from sklearn.inspection import permutation_importance
from sklearn.metrics import RocCurveDisplay, auc
from functools import partial

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer




from path import Path
import warnings 
warnings.filterwarnings('ignore') # supress warnings

In [2]:
# need to change evaluation method a little
def plot_importance(models, X_test, title=""):
#     taken from https://www.kaggle.com/code/shoabahamed/ps3e9-eda-and-gbdt-catboost-median-duplicatedata/edit
    """Plots features importance given models and train set"""
    features = X_test.columns.tolist()
    feature_importance = pd.DataFrame()
    for model in models:
        _df = pd.DataFrame()
        _df['importance'] = model.feature_importances_
        _df["features"] = pd.Series(features)
        _df = _df.sort_values(by='importance', ascending=False)
        feature_importance = pd.concat([feature_importance, _df])
        
                
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    plt.figure(figsize=(16, 10))
    ax = sns.barplot(x='importance', y='features', data=feature_importance, color='skyblue', errorbar='sd')
    
    for i in ax.containers:
        ax.bar_label(i,)
    
   
    plt.xlabel('Importance', fontsize=14)
    plt.ylabel('Feature', fontsize=14)
    plt.title(f"{title} Feature Importances", fontsize=18)
    plt.grid(True, axis='x')
    plt.show()
    
    return feature_importance



def mean_squared_log_error(y_true_log, y_pred_log):
    return mean_squared_error(y_true_log, y_pred_log, squared=False)


class Splitter:
    """A splitter class which splits the X, y using the split_data function with a random state provided. It yeilds \
    X_train, X_val, y_train, y_val, train_idx, val_idx in the end.\
    code from  https://www.kaggle.com/code/tetsutani/ps3e9-eda-and-gbdt-catboost-median-duplicatedata wit little bit of modification """

    def __init__(self, test_size=0.2, kfold=True, n_splits=5):
        self.test_size = test_size # set test size
        self.kfold = kfold  # wheter to just split the data in two or use kfold
        self.n_splits= n_splits # set 
        
    def split_data(self, X, y, random_state):
        if self.kfold:
            kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
            for train_idx, val_idx in kf.split(X, y):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                yield X_train, X_val, y_train, y_val, train_idx, val_idx
        else:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size, random_state=random_state)
            yield X_train, X_val, y_train, y_val

            

def submission_csv(predictions, target='cost'):
    df = pd.DataFrame()
    df['id'] = test['id']
    df[target] = np.exp(predictions) # exp only for this rmsle
    
    return df

In [42]:
# loading datasets
path = Path("/kaggle/input/playground-series-s3e11")

train = pd.read_csv(path / "train.csv")
test = pd.read_csv(path / "test.csv")
sub = pd.read_csv(path / "sample_submission.csv")


original_train = pd.read_csv("/kaggle/input/media-campaign-cost-prediction/train_dataset.csv")
original_test = pd.read_csv("/kaggle/input/media-campaign-cost-prediction/test_dataset.csv")
original = pd.concat([original_train, original_test])
original = original[train.drop('id', axis=1).columns]

original = original[~original['cost'].isnull()] # removing nulls
# removing duplicates in original
original = original[~original.drop("cost", axis=1).duplicated()]

In [4]:
org = original.copy()
train_temp = train.copy()

org['generated'] = False
train_temp['generated'] = True

data = pd.concat([train_temp.drop ('id', axis=1), org])
data = data[~data.drop(['cost', 'generated'], axis=1).duplicated(keep='first')]

original = data.loc[data['generated'] == False]

In [5]:
train_cat = train.copy()
original_cat = original.copy()
test_cat = test.copy()

In [6]:
def feature_engineer(df):
    df['extra_attraction'] = df['florist'] + df['video_store'] + df['prepared_food'] + df['coffee_bar']
    df['florist*video'] = df['florist'] * df['video_store'] 
    df['children'] = df['total_children'] * df['num_children_at_home']
    df['children*avg_cars_at home(approx).1'] = df['children'] * df['avg_cars_at home(approx).1']
    df['stays_home'] = (df['total_children'] / df['num_children_at_home']).replace([np.inf, -np.inf], 10).fillna(0)
    df['store_sqft_encode'] = df['store_sqft'].copy()
    
    return df

In [7]:
def evaluate_model(model_name, model_pipeline, _X, _y, features, original_data=None, use_original=False, n_splits=5, random_state_list=[0, 5, 10], verbose=True):
    len_y = len(_y)
    len_states = len(random_state_list)

    oof_preds = np.zeros(len_y * len_states).reshape(len_states, len_y)
    models_pipeline = []
    scores_train = []

    for index, random_state in enumerate(random_state_list):
        if verbose:
            print("#"*25)
            print("#"*15, f"traininng model {model_name} with seed {random_state}")
            print("#"*25)
        splitter = Splitter(n_splits=n_splits)
        splits = 0
        for X_train, X_val, y_train, y_val, train_idx, val_idx in splitter.split_data(_X, _y, random_state):
    
            
            if use_original: # we will only use original data for training not testing
                target = 'cost'
                X_train = pd.concat([X_train, original_data.drop(target, axis=1)]) 
                y_train = pd.concat([y_train, np.log(original_data[target])]) # only for 

#             model_pipeline.fit(X_train, y_train, model__eval_set=[(X_val, y_val)], model__verbose=False)
            model_pipeline.fit(X_train, y_train)
            oof_preds[index, val_idx] = model_pipeline.predict(X_val).squeeze()
            models_pipeline.append(deepcopy(model_pipeline))

            score_train = mean_squared_log_error(y_train, model_pipeline.predict(X_train))
            scores_train.append(score_train)

            score_valid_split = mean_squared_log_error(y_val, model_pipeline.predict(X_val).squeeze())
            splits += 1
            if verbose:
                print(f"seed {random_state} and split {splits} score {score_valid_split}")

            
    oof_preds_mean = oof_preds.mean(axis=0)

    return models_pipeline, oof_preds_mean, np.mean(scores_train), mean_squared_log_error(_y, oof_preds.mean(axis=0))


def predict_test(models_pipeline, X_test, n_splits=5, n_repeats=3):
    test_preds = np.zeros(n_splits * n_repeats * len(X_test)).reshape(n_splits * n_repeats, len(X_test))
    
    for index, model_pipeline in enumerate(models_pipeline):
        X_test_ = X_test.copy()
        preds = model_pipeline .predict(X_test_)                
        test_preds[index, range(len(preds))] = preds
        
    return test_preds.mean(axis=0)


<h3> Preivously Discovered </h3>

In [34]:
X_encode = train.drop(columns=['id', 'cost', 'salad_bar', 'gross_weight', 'low_fat', 'recyclable_package', 'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)'])
y_default = np.log(train['cost'])
features_default = X_encode.columns.tolist()
add_data = original.copy()
add_data = add_data[features_default + ['cost']]

X_encode = feature_engineer(X_encode)
add_data = feature_engineer(add_data)
features_default = X_encode.columns.tolist()

xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)
# xgb = partial(XGBRegressor, random_state=0)
pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)), ('scaler', StandardScaler()), 
                        ('model', xgb())])

_models_pipeline, _oof_preds, mean_train_score, mean_valid_score = evaluate_model("XGBRegressor_default", pipeline, X_encode, y_default, 
                                features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10]) 

#########################
############### traininng model XGBRegressor_default with seed 0
#########################
seed 0 and split 1 score 0.2982257503745061
seed 0 and split 2 score 0.2965535960517836
seed 0 and split 3 score 0.2976026107718961
seed 0 and split 4 score 0.2972252132496084
seed 0 and split 5 score 0.2975305914113704
seed 0 and split 6 score 0.29610541439266663
seed 0 and split 7 score 0.29712386312189587
seed 0 and split 8 score 0.29611013343017284
seed 0 and split 9 score 0.2990231092913298
seed 0 and split 10 score 0.2973258561858942
#########################
############### traininng model XGBRegressor_default with seed 5
#########################
seed 5 and split 1 score 0.29785116135656076
seed 5 and split 2 score 0.2974146549765205
seed 5 and split 3 score 0.2957588193691142
seed 5 and split 4 score 0.29522606432503146
seed 5 and split 5 score 0.2970132197746259
seed 5 and split 6 score 0.2964645237215187
seed 5 and split 7 score 0.2979372815390011
seed 5 and s

In [12]:
mean_valid_score, mean_train_score

(0.29712906359792274, 0.29671401670447534)

test---------------------

In [29]:
# train_sample = train.sample(frac=0.25, random_state=0)

(90084, 17)

In [8]:
params_xgb1 =  {'n_estimators': 283, 'learning_rate': 0.1378863496122908, 'max_depth': 8, 'lambda': 0.29752719929425836,
          'alpha': 0.5281928441650384, 'colsample_bytree': 0.8712750461457782, 
          'min_child_weight': 1, 'booster': 'gbtree', 'sampling_method': 'gradient_based', 'grow_policy': 'lossguide'}

In [9]:
X_encode = train.drop(columns=['id', 'cost', 'salad_bar', 'gross_weight', 'low_fat', 'recyclable_package', 'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)'])
y_default = np.log(train['cost'])
features_default = X_encode.columns.tolist()
add_data = original.copy()
add_data = add_data[features_default + ['cost']]

X_encode = feature_engineer(X_encode)
add_data = feature_engineer(add_data)
features_default = X_encode.columns.tolist()

xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)
pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)), ('scaler', StandardScaler()), 
                        ('model', xgb(**params_xgb1))])

_models_pipeline_xgb1, _oof_preds_xgb1, mean_train_score, mean_valid_score = evaluate_model("XGBRegressor_default", pipeline, X_encode, y_default, 
                                features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10]) 

#########################
############### traininng model XGBRegressor_default with seed 0
#########################
seed 0 and split 1 score 0.29735882853038187
seed 0 and split 2 score 0.2954728798463515
seed 0 and split 3 score 0.2966736547273064
seed 0 and split 4 score 0.2965370284982709
seed 0 and split 5 score 0.29642877398971884
seed 0 and split 6 score 0.295125188567925
seed 0 and split 7 score 0.29594196642155934
seed 0 and split 8 score 0.29498551746642027
seed 0 and split 9 score 0.2980745197783254
seed 0 and split 10 score 0.29625174484667927
#########################
############### traininng model XGBRegressor_default with seed 5
#########################
seed 5 and split 1 score 0.29689808364429615
seed 5 and split 2 score 0.2963381269464857
seed 5 and split 3 score 0.2951042780959754
seed 5 and split 4 score 0.29437962152129404
seed 5 and split 5 score 0.2958099801869722
seed 5 and split 6 score 0.29572089833377274
seed 5 and split 7 score 0.2965636748148115
seed 5 and

In [10]:
print("Base score: ", mean_valid_score)
# 0.2990453317642123

Base score:  0.2962262969063931


In [11]:
test_prep = feature_engineer(test.drop(columns=['id', 'salad_bar',  'gross_weight','low_fat', 'recyclable_package', 
                                                'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)']))

preds1 = predict_test(_models_pipeline_xgb1, test_prep, n_splits=10, n_repeats=3)

In [13]:
preds1_df = submission_csv(preds1)

In [14]:
preds2_df = pd.read_csv("/kaggle/input/submission-file/submission_jumpa.csv")


In [20]:
test_preds = preds1_df.copy()
test_preds['cost'] = (preds1_df['cost'] + preds2_df['cost'])/2

In [26]:
test_preds.to_csv("final_submission.csv", index=False)

<h3>Feature from this notebook</h3>


https://www.kaggle.com/code/janmpia/feature-eng-xgb-cat-ensemble-0-29265/notebook

In [45]:
avg_df = pd.DataFrame(index = train.store_sqft.unique())
avg_df['store_sqft'] = avg_df.index


concat_train_hold_test = pd.concat([train, test, original],ignore_index=True)
for feature in ['units_per_case','store_sales(in millions)','total_children']:
    avg_df[f'avg_{feature}'] = concat_train_hold_test.groupby('store_sqft')[feature].mean()
    

train = pd.merge(train, avg_df, on='store_sqft', how='left')
original = pd.merge(original, avg_df, on='store_sqft', how='left')
test = pd.merge(test, avg_df, on='store_sqft', how='left')
avg_df.head()

Unnamed: 0,store_sqft,avg_units_per_case,avg_store_sales(in millions),avg_total_children
36509.0,36509.0,18.914932,6.462912,2.457264
28206.0,28206.0,18.921846,3.660089,2.352837
21215.0,21215.0,18.948922,6.445384,2.38832
27694.0,27694.0,19.024974,6.492038,2.378048
33858.0,33858.0,18.945019,6.514354,2.542395


In [40]:
X_encode = train.drop(columns=['id', 'cost', 'salad_bar', 'gross_weight', 'low_fat', 'recyclable_package', 'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)'])
y_default = np.log(train['cost'])
features_default = X_encode.columns.tolist()
add_data = original.copy()
add_data = add_data[features_default + ['cost']]

X_encode = feature_engineer(X_encode)
add_data = feature_engineer(add_data)

features_default = X_encode.columns.tolist()

xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)

pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)), ('scaler', StandardScaler()), 
                        ('model', xgb())])

_models_pipeline, _oof_preds, mean_train_score, mean_valid_score = evaluate_model("XGBRegressor_default", pipeline, X_encode, y_default, 
                                features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10]) 

#########################
############### traininng model XGBRegressor_default with seed 0
#########################
seed 0 and split 1 score 0.298118891943223
seed 0 and split 2 score 0.29618748027069797
seed 0 and split 3 score 0.297376689106019
seed 0 and split 4 score 0.2970804105478922
seed 0 and split 5 score 0.2972563318949389
seed 0 and split 6 score 0.29581433174383975
seed 0 and split 7 score 0.2967742022960868
seed 0 and split 8 score 0.29610211912830653
seed 0 and split 9 score 0.29887643924160107
seed 0 and split 10 score 0.2971726044327491
#########################
############### traininng model XGBRegressor_default with seed 5
#########################
seed 5 and split 1 score 0.29767738192595405
seed 5 and split 2 score 0.2971352455136717
seed 5 and split 3 score 0.2955714553228859
seed 5 and split 4 score 0.2952990576175415
seed 5 and split 5 score 0.29671983691488
seed 5 and split 6 score 0.2962683864979506
seed 5 and split 7 score 0.29774215807506754
seed 5 and spli

In [41]:
mean_valid_score

0.2969533681718434

In [None]:
test_prep = feature_engineer(test.drop(columns=['id', 'salad_bar',  'gross_weight','low_fat', 'recyclable_package', 
                                                'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)']))

preds = predict_test(_models_default, test_prep, n_splits=10, n_repeats=3)

****optimizing the new features****

In [50]:
import optuna

In [43]:
xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)
def objective(trial):
    params = {
        'verbosity':0,
        'n_estimators': trial.suggest_int("n_estimators", 50, 300),
        'learning_rate': trial.suggest_float('learning_rate',1e-10,2.0),
        'max_depth':trial.suggest_int('max_depth', 2, 12),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0),
        'colsample_bytree':trial.suggest_float('colsample_bytree',1e-5,1.0),
        'min_child_weight':trial.suggest_int('min_child_weight',0,1),
        'booster':trial.suggest_categorical("booster", ["dart", "gbtree",'gblinear']),
        'sampling_method': trial.suggest_categorical('sampling_method',['uniform','gradient_based']),
        'grow_policy': trial.suggest_categorical('grow_policy',['depthwise','lossguide']),
    }
    
    pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)),('scaler', StandardScaler()),
                        ('model', xgb(**params))])


    _models_default, _oof_preds, mean_train_score, mean_valid_score = evaluate_model("XGBRegressor_default", pipeline, 
                                                                X_encode, y_default , features_default, use_original=True, original_data=add_data, n_splits=5, random_state_list=[0], verbose=False)  
    
        
    score = mean_valid_score
    
    return score

In [44]:
study = optuna.create_study(direction= "minimize")
study.optimize(objective, n_trials= 2)
trial = study.best_trial
print("Best Score: ", trial.value)
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

[32m[I 2023-04-02 14:33:09,611][0m A new study created in memory with name: no-name-a0d2df08-f284-41a8-8a63-b9719222b091[0m
[32m[I 2023-04-02 14:33:46,751][0m Trial 0 finished with value: 0.3124113639316865 and parameters: {'n_estimators': 159, 'learning_rate': 1.8932189957283772, 'max_depth': 10, 'lambda': 0.011501149416024757, 'alpha': 0.6020798267202936, 'colsample_bytree': 0.938528993467161, 'min_child_weight': 0, 'booster': 'gblinear', 'sampling_method': 'uniform', 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.3124113639316865.[0m
[32m[I 2023-04-02 14:34:25,301][0m Trial 1 finished with value: 0.3096759342986648 and parameters: {'n_estimators': 60, 'learning_rate': 0.8127720674504577, 'max_depth': 10, 'lambda': 0.02917584285220111, 'alpha': 0.1154237095318977, 'colsample_bytree': 0.019201520900104026, 'min_child_weight': 1, 'booster': 'dart', 'sampling_method': 'uniform', 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.3096759342986648.[0m


Best Score:  0.3096759342986648
Best Params: 
  n_estimators: 60
  learning_rate: 0.8127720674504577
  max_depth: 10
  lambda: 0.02917584285220111
  alpha: 0.1154237095318977
  colsample_bytree: 0.019201520900104026
  min_child_weight: 1
  booster: dart
  sampling_method: uniform
  grow_policy: lossguide


In [45]:
# got after 40 iterations 
params_xgb2 = {'n_estimators': 185, 'learning_rate': 0.13282537347943735, 'max_depth': 9, 'lambda': 0.780694355262925, 
           'alpha': 0.590831380880124, 'colsample_bytree': 0.9027454594970407, 'min_child_weight': 0, 
           'booster': 'gbtree', 'sampling_method': 'uniform', 'grow_policy': 'lossguide'}

In [46]:
X_encode = train.drop(columns=['id', 'cost', 'salad_bar', 'gross_weight', 'low_fat', 'recyclable_package', 'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)'])
y_default = np.log(train['cost'])
features_default = X_encode.columns.tolist()
add_data = original.copy()
add_data = add_data[features_default + ['cost']]

X_encode = feature_engineer(X_encode)
add_data = feature_engineer(add_data)

features_default = X_encode.columns.tolist()

xgb = partial(XGBRegressor, tree_method='gpu_hist', random_state=0)
# xgb = partial(XGBRegressor, random_state=0)
pipeline = Pipeline(steps=[('encode', MEstimateEncoder(cols=['store_sqft_encode'], m=10)), ('scaler', StandardScaler()),
                        ('model', xgb(**params_xgb2))])

_models_pipeline_xgb2, _oof_preds_xgb2, mean_train_score, mean_valid_score = evaluate_model("XGBRegressor_default", pipeline, X_encode, y_default, 
                                features_default, use_original=True, original_data=add_data, n_splits=10, random_state_list=[0, 5, 10]) 

#########################
############### traininng model XGBRegressor_default with seed 0
#########################
seed 0 and split 1 score 0.2973748981107558
seed 0 and split 2 score 0.29546228242094896
seed 0 and split 3 score 0.29666700442863575
seed 0 and split 4 score 0.29651311475945963
seed 0 and split 5 score 0.29644451411909634
seed 0 and split 6 score 0.29511018731944677
seed 0 and split 7 score 0.2959381056165846
seed 0 and split 8 score 0.2950072812484513
seed 0 and split 9 score 0.2980841766338719
seed 0 and split 10 score 0.29621966130849314
#########################
############### traininng model XGBRegressor_default with seed 5
#########################
seed 5 and split 1 score 0.29686768227993826
seed 5 and split 2 score 0.2963301433800437
seed 5 and split 3 score 0.2951457024455514
seed 5 and split 4 score 0.29439395591016315
seed 5 and split 5 score 0.29581693832731626
seed 5 and split 6 score 0.29573190461796584
seed 5 and split 7 score 0.29650625533332176
seed 5

In [23]:
 mean_valid_score

0.2962232307592296


****Combining previous two models(_oof_preds_xgb2 and _oof_preds_xgb1)****

In [78]:
print(f"xgb1 score {mean_squared_log_error(y_default, _oof_preds_xgb1)}")
print(f"xgb1 score {mean_squared_log_error(y_default, _oof_preds_xgb2)}")



preds = pd.DataFrame(index=range(len(train)))
rmse1 = 0.2962262969063931
rmse2 = 0.2962232307592296

preds['predictions'] = 0
rmse_sum = (1/rmse1) + (1/rmse2)
preds['predictions'] = preds['predictions'] + _oof_preds_xgb1 * (1/rmse1) + _oof_preds_xgb2 * (1/rmse2)

preds['predictions'] = preds / rmse_sum


print(f"combined score {mean_squared_log_error(y_default, preds['predictions'])}")

xgb1 score 0.2962262969063931
xgb1 score 0.2962232307592296
combined score 0.29621862919102143


In [55]:
test_prep = feature_engineer(test.drop(columns=['id', 'salad_bar',  'gross_weight','low_fat', 'recyclable_package', 
                                                'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)',
                                               'avg_units_per_case', 'avg_store_sales(in millions)',
                                               'avg_total_children']))

test_preds1 = predict_test(_models_pipeline_xgb1, test_prep, n_splits=10, n_repeats=3)
test_preds_df1 = submission_csv(test_preds1)

In [56]:
test_prep = feature_engineer(test.drop(columns=['id', 'salad_bar',  'gross_weight','low_fat', 'recyclable_package', 
                                                'units_per_case', 'store_sales(in millions)', 'unit_sales(in millions)']))

test_preds2 = predict_test(_models_pipeline_xgb2, test_prep, n_splits=10, n_repeats=3)
test_preds_df2 = submission_csv(test_preds2)


In [81]:
test_preds_comb = (test_preds1  * (1/rmse1) + test_preds2 * (1/rmse2))/ rmse_sum

In [85]:
test_preds_comb_df = submission_csv(test_preds_comb)
test_preds_comb_df.to_csv("X_encode_new_features_opt+pred_features_opt.csv", index=False)

****Target Encoding practice****

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:

df = pd.DataFrame({"a": [1, 2, 3, 1], "b": [4, 5, 6, 6], "c": [7, 8, 9, 3]})

In [18]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['a_mean'] = X['a'] ** 3
        return X

In [23]:
pipe = Pipeline(
    steps=[
        ("use_custom_transformer", CustomEncoder())
    ]
)



encoder = CustomEncoder()
encoder.fit_transform(df)

Unnamed: 0,a,b,c,a_mean
0,1,4,7,1
1,2,5,8,8
2,3,6,9,27


In [27]:
class MultipyColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, by=1, columns=None):
        self.by = by
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cols_to_transform = X.columns.tolist()
        
        if self.columns:
            cols_to_transform = self.columns
            
        cols_to_transform_name = [col+"_encoded" for col in cols_to_transform]
            
        X[cols_to_transform_name] = X[cols_to_transform] * self.by
        return X
    

encoder = MultipyColumns(3, columns=['a', 'c'])
encoder.fit_transform(df)

Unnamed: 0,a,b,c,a_mean,a_encoded,c_encoded
0,1,4,7,1,3,21
1,2,5,8,8,6,24
2,3,6,9,27,9,27


In [31]:
from sklearn.preprocessing import OrdinalEncoder

class CustomOrdinalEncoder(OrdinalEncoder):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    def transform(self, X, y=None):
        transformed_X = super().transform(X)
        new_X = pd.DataFrame(transformed_X, columns=self.feature_names_in_)
        return new_X
    

    
data = pd.DataFrame(
    {
        "fruits": ["Apple", "Pears", "Cherry"],
        "colors": ["Green", "Green", "Red"],
    }
)
encoder = OrdinalEncoder()
encoder.fit_transform(data)
        

array([[0., 0.],
       [2., 0.],
       [1., 1.]])

In [32]:
encoder = CustomOrdinalEncoder()
encoder.fit_transform(data)

Unnamed: 0,fruits,colors
0,0.0,0.0
1,2.0,0.0
2,1.0,1.0


In [39]:
class MultipyColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, by=1, columns=None):
        self.by = by
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        cols_to_transform = X.columns.tolist()
        
        if self.columns:
            cols_to_transform = self.columns
            
        cols_to_transform_name = [col+"_encoded" for col in cols_to_transform]
            
        X['a_encode'] = X.groupby('a').transform('mean')
        return X
    

encoder = MultipyColumns(3, columns=['a', 'c'])
encoder.fit_transform(df)

ValueError: Wrong number of items passed 5, placement implies 1

In [40]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
                  'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
                  index=index)
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [41]:
df.reindex()

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


In [74]:
X_train = train.drop(['id', 'cost'], axis=1)
y_train = train['cost']

X_train['store_sqft_encode'] = X_train['store_sqft'].copy()

# y_train.groupby(X_train['store_sqft'].reindex()).mean()

In [83]:
class CustomTargetEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns=None):
        self.column = columns
    
    def fit(self, X, y=None):
        self.encodings_ = y.groupby(X['store_sqft']).mean()
        return self
        
    def transform(self, X, y=None):
        X = X.copy()
        X['store_sqft_encode'] = self.encodings_.reindex(X['store_sqft']).values
        
        return X
    
cte = CustomTargetEncoder()
cte.fit_transform(X_train, y_train)

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,store_sqft_encode
0,8.61,3.0,2.0,2.0,2.0,10.30,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,118.037663
1,5.00,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,109.283034
2,14.08,4.0,0.0,0.0,3.0,21.30,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,103.419639
3,4.02,3.0,5.0,0.0,0.0,14.80,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,103.419639
4,2.13,3.0,5.0,0.0,3.0,17.00,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,84.607090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360331,7.60,4.0,5.0,5.0,3.0,13.50,1.0,0.0,33.0,30268.0,0.0,0.0,0.0,0.0,0.0,96.282094
360332,14.44,4.0,4.0,0.0,4.0,18.80,1.0,1.0,18.0,20319.0,0.0,0.0,0.0,0.0,0.0,106.078977
360333,10.74,3.0,0.0,0.0,2.0,11.30,1.0,0.0,35.0,30584.0,1.0,1.0,1.0,1.0,1.0,103.443026
360334,11.04,3.0,1.0,0.0,3.0,10.20,0.0,1.0,14.0,30584.0,1.0,1.0,1.0,1.0,1.0,103.443026


In [85]:
from category_encoders import TargetEncoder

te = TargetEncoder(cols='store_sqft_encode')
te.fit_transform(X_train, y_train)

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,store_sqft_encode
0,8.61,3.0,2.0,2.0,2.0,10.30,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,118.037663
1,5.00,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,109.283034
2,14.08,4.0,0.0,0.0,3.0,21.30,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,103.419639
3,4.02,3.0,5.0,0.0,0.0,14.80,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,103.419639
4,2.13,3.0,5.0,0.0,3.0,17.00,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,84.607090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360331,7.60,4.0,5.0,5.0,3.0,13.50,1.0,0.0,33.0,30268.0,0.0,0.0,0.0,0.0,0.0,96.282094
360332,14.44,4.0,4.0,0.0,4.0,18.80,1.0,1.0,18.0,20319.0,0.0,0.0,0.0,0.0,0.0,106.078977
360333,10.74,3.0,0.0,0.0,2.0,11.30,1.0,0.0,35.0,30584.0,1.0,1.0,1.0,1.0,1.0,103.443026
360334,11.04,3.0,1.0,0.0,3.0,10.20,0.0,1.0,14.0,30584.0,1.0,1.0,1.0,1.0,1.0,103.443026
