### Setup

In [1]:
import warnings
import os
import copy
from sklearnex import patch_sklearn
import numpy as np
import pandas as pd
from category_encoders import (
    BackwardDifferenceEncoder,
    BaseNEncoder,
    BinaryEncoder,
    CatBoostEncoder,
    CountEncoder,
    GLMMEncoder,
    HelmertEncoder,
    JamesSteinEncoder,
    LeaveOneOutEncoder,
    MEstimateEncoder,
    SummaryEncoder,
    TargetEncoder,
    WOEEncoder,
)

warnings.filterwarnings("ignore")

import uuid
from sklearn.experimental import enable_iterative_imputer
from sklearn import set_config
from sklearn.base import clone as model_clone
from sklearn.cluster import *
from sklearn.impute import *
from sklearn.compose import *
from sklearn.cross_decomposition import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.gaussian_process import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.multioutput import *
from sklearn.multiclass import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.neural_network import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.kernel_approximation import *
from sklearn.svm import *
from sklearn.tree import *
from sklearn.utils import *
from sklearn.dummy import *
from sklearn.semi_supervised import *
from sklearn.discriminant_analysis import *
from sklearn.covariance import *
from collections import Counter
import sklearn
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.calibration import *
import joblib
from pprint import pprint as pp

# Session-wide library options for this notebook.
# Turn on pandas' optional compute backends (numba, numexpr, bottleneck).
pd.options.compute.use_numba = True
pd.options.compute.use_numexpr = True
pd.options.compute.use_bottleneck = True
# Show up to 90 columns when a DataFrame is displayed.
pd.options.display.max_columns = 90
# Render scikit-learn estimators as diagrams.
set_config(display="diagram")
# Silence every warning (the same filter is already installed above, after the
# category_encoders import).
warnings.filterwarnings("ignore")
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.over_sampling import (
    ADASYN,
    SMOTE,
    RandomOverSampler,
    SVMSMOTE,
    SMOTENC,
    SMOTEN,
    BorderlineSMOTE,
    KMeansSMOTE,
)

%matplotlib inline
plt.style.use("fivethirtyeight")

import seaborn as sns

sns.set()
from joblib import parallel_backend
from joblib.memory import Memory

# patch_sklearn()
# Default to the absolute Kaggle dataset / working directories.
KAGGLE_ENV = 1
DATA_INPUT = "/kaggle/input/marketing-strategy-personalised-offer/"
DATA_OUTPUT = "/kaggle/working/"
cwd = os.path.abspath(os.getcwd())
# When the working directory contains one of these markers, clear the Kaggle
# flag and switch to relative paths that mirror the Kaggle directory layout.
if "mlop3n/Pycharm" in cwd or "u170690" in cwd:
    KAGGLE_ENV = 0
    DATA_INPUT = "kaggle/input/marketing-strategy-personalised-offer/"
    DATA_OUTPUT = "kaggle/working/"
# joblib Memory cache stored in a "joblib" folder under the output directory.
CACHE = Memory(DATA_OUTPUT + "joblib", verbose=0)
# NOTE(review): the sklearnex documentation asks for patch_sklearn() to be
# called before scikit-learn estimators are imported; here the sklearn imports
# run earlier in this cell - confirm the patch actually takes effect.
patch_sklearn()

# Raw train / test tables, loaded at import time and shared as module globals
# (read by save_data and quick_test).
data = pd.read_csv(DATA_INPUT + "train_data.csv")
eval_data = pd.read_csv(DATA_INPUT + "test_data.csv")


def gen_train_test(X, y, test_size):
    """Split ``X`` / ``y`` into train and test parts, stratified on ``y``.

    The split uses a fixed seed (10) so repeated calls give the same partition.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        random_state=10,
        stratify=y,
        test_size=test_size,
    )
    return tuple(parts)


def save_data():
    """Write the module-level ``data`` and ``eval_data`` frames to parquet.

    Files are created under ``DATA_OUTPUT`` as ``data.parquet`` and
    ``eval_data.parquet``. Only the module globals are read; nothing is
    returned.
    """
    outputs = ((data, "data.parquet"), (eval_data, "eval_data.parquet"))
    for frame, file_name in outputs:
        frame.to_parquet(DATA_OUTPUT + file_name)


def quick_test(X):
    """Fit a handful of baseline classifiers on ``X`` and print macro-F1.

    The labels are taken from the module-level ``data.target``; the data is
    split 50/50 via ``gen_train_test`` and each model's hold-out macro-F1 is
    printed as ``<ClassName> :: <score>``.
    """
    seed = 42
    candidates = (
        RandomForestClassifier(class_weight="balanced_subsample", random_state=seed),
        DecisionTreeClassifier(class_weight="balanced", random_state=seed),
        HistGradientBoostingClassifier(random_state=seed),
        LogisticRegressionCV(max_iter=10000, class_weight="balanced", random_state=seed),
    )
    labels = data.target
    X_train, X_test, y_train, y_test = gen_train_test(X, labels, test_size=0.5)
    for model in candidates:
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        macro_f1 = f1_score(y_test, predicted, average="macro")
        print(f"{type(model).__name__} :: {macro_f1}")


def check_RF_perf(X, y):
    """Cross-validate a depth-limited random forest and plot the scores.

    Runs a 5-fold, 2-repeat stratified cross-validation scored with macro-F1
    and draws the per-fold test and train scores on the current matplotlib
    axes. Nothing is returned.
    """
    forest = RandomForestClassifier(
        max_depth=8,
        max_features=None,
        n_jobs=24,
        class_weight="balanced",
    )
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
    with parallel_backend("threading"):
        cv_result = cross_validate(
            forest,
            X,
            y,
            scoring="f1_macro",
            cv=folds,
            return_train_score=True,
            n_jobs=24,
        )
    # Test curve first, then train, so the legend order is unchanged.
    for key, label in (("test_score", "TEST"), ("train_score", "TRAIN")):
        plt.plot(cv_result[key], label=label)
    plt.legend()


def check_catNB_perf(X, y):
    """Fit an ordinal-encoded CategoricalNB on half the data and print a report.

    ``X`` is split 50/50 (stratified on ``y``, seed 42). Every column is
    ordinal-encoded using the sorted unique values found in the full ``X``, so
    categories that only occur in the test half are still known to the
    encoder. The classification report for the hold-out half is printed;
    nothing is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical feature table.
    y : pandas.Series
        Class labels aligned with ``X``.
    """
    # One more category than observed per column, leaving head-room in the
    # per-feature count tables.
    min_c = X.nunique().astype("int").to_numpy() + 1
    clf = CategoricalNB(
        fit_prior=True,
        alpha=0.0000003,
        min_categories=min_c,
    )
    categories_ = [sorted(X[c].unique()) for c in X.columns]

    work = make_pipeline(OrdinalEncoder(categories=categories_), clf)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.5,
        random_state=42,
        shuffle=True,
        stratify=y,
    )
    with parallel_backend("threading"):
        y_pred = work.fit(X_train, y_train).predict(X_test)
    # classification_report expects (y_true, y_pred); the previous call had the
    # two swapped, which exchanged precision with recall in the printed table.
    print(classification_report(y_test, y_pred))

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
def bring_data():
    """Load the train / evaluation tables and derive the engineered features.

    The user is asked interactively whether the encoded tables should be
    rebuilt. Answering ``"y"`` re-encodes the raw CSV files (column renaming,
    binary / ordinal / nominal integer codes, iterative imputation), casts
    them to ``uint32`` and stores them as ``data.parquet`` /
    ``eval_data.parquet`` under ``DATA_OUTPUT``. Any other answer reloads
    those parquet files. In both cases the combined features
    (``ordinal__dest_distance``, ``nominal__pref_profile``,
    ``nominal__biz_type``, ``nominal__spend_id``, ``nominal__direction``,
    ``nominal__extra_travel``, ``nominal__circumstance``) are then added.

    All lookup tables are locals captured by the nested helpers as closures;
    the module-level ``data`` / ``eval_data`` are neither read nor modified.

    Side effects: reads from stdin, may write two parquet files, and sets
    ``pd.options.compute.use_numba = False``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, eval_data)`` with the engineered columns.
    """
    from itertools import product

    data = pd.read_csv(DATA_INPUT + "train_data.csv")
    eval_data = pd.read_csv(DATA_INPUT + "test_data.csv")
    ques = "Do you want to use new data?"
    # The builtin input() takes its prompt positionally only.
    ans = input(ques)
    if ans.lower() == "y":

        def convert_to_hours(row):
            # Keep only the unit of the expiry: 0 for "hours", 1 for "days".
            if "hours" in row["offer expiration"]:
                row["offer expiration"] = 0
            elif "days" in row["offer expiration"]:
                row["offer expiration"] = 1
            return row

        data = data.apply(convert_to_hours, axis=1)
        eval_data = eval_data.apply(convert_to_hours, axis=1)

        def pythonise_col_names(df: pd.DataFrame):
            """Map raw column headers to snake_case identifiers."""
            rename_stubs = {}
            for x in df.columns:
                if x == "Offer Accepted":
                    rename_stubs[x] = "target"
                elif "/" in x:
                    # Keep only the part after the first slash.
                    rename_stubs[x] = x.split("/")[1].replace(" ", "_").lower()
                else:
                    rename_stubs[x] = x.replace("-", "_").replace(" ", "_").lower()
            return rename_stubs

        data = data.rename(columns=pythonise_col_names(data))
        eval_data = eval_data.rename(columns=pythonise_col_names(eval_data))
        gender_code = {"Male": 0, "Female": 1}

        def binarize_gender(row):
            row["gender"] = gender_code[row["gender"]]
            return row

        data = data.apply(binarize_gender, axis=1)
        eval_data = eval_data.apply(binarize_gender, axis=1)

        ages_ord = ["below21", "21", "26", "31", "36", "41", "46", "50plus"]
        ordinal_age_codes = {x: idx + 1 for idx, x in enumerate(ages_ord)}
        income_codes = {
            "Less than ₹12500": 1,
            "₹12500 - ₹24999": 2,
            "₹25000 - ₹37499": 3,
            "₹37500 - ₹49999": 4,
            "₹50000 - ₹62499": 5,
            "₹62500 - ₹74999": 6,
            "₹75000 - ₹87499": 7,
            "₹87500 - ₹99999": 8,
            "₹100000 or More": 9,
        }
        ordinal_quali_codes = {
            "Bachelors degree": 5,
            "Some college - no degree": 3,
            "Graduate degree (Masters or Doctorate)": 6,
            "Associates degree": 4,
            "High School Graduate": 2,
            "Some High School": 1,
        }
        # Create the target columns up front so they exist on every row before
        # the row-wise pass fills them in.
        for frame in (data, eval_data):
            frame["ordinal__income_range"] = 0
            frame["ordinal__age"] = 0
            frame["ordinal__qualif"] = 0

        def codify_special_ordinal_range(row):
            row["ordinal__income_range"] = income_codes[row["income_range"]]
            row["ordinal__age"] = ordinal_age_codes[row["age"]]
            row["ordinal__qualif"] = ordinal_quali_codes[row["qualification"]]
            return row

        data = data.apply(codify_special_ordinal_range, axis=1)
        eval_data = eval_data.apply(codify_special_ordinal_range, axis=1)

        ordinal_category_codes = {
            "4~8": 4,
            "less1": 2,
            "never": 1,
            "1~3": 3,
            "gt8": 5,
        }
        ord_cols = [
            "restaur_spend_less_than20",
            "no_take_aways",
            "restaur_spend_greater_than20",
            "no_visited_bars",
            "no_visited_cold_drinks",
        ]
        col_ord_names = ["ordinal__" + x for x in ord_cols]
        data[col_ord_names] = 0
        eval_data[col_ord_names] = 0

        def codify_ordinal_columns(row):
            for ord_col, col_ord_name in zip(ord_cols, col_ord_names):
                value = row[ord_col]
                # Missing answers stay missing so the imputer can fill them.
                # Testing with pd.isna avoids depending on NaN object identity
                # for the dictionary lookup.
                if pd.isna(value):
                    row[col_ord_name] = np.nan
                else:
                    row[col_ord_name] = ordinal_category_codes[value]
            return row

        data = data.apply(codify_ordinal_columns, axis=1)
        eval_data = eval_data.apply(codify_ordinal_columns, axis=1)

        # Car feature: label NaN as "unknown" for now. Assign the result back
        # instead of using a chained in-place fillna.
        data["car"] = data["car"].fillna("unknown")
        eval_data["car"] = eval_data["car"].fillna("unknown")

        nominal_cs = [
            "restaurant_type",
            "marital_status",
            "drop_location",
            "job_industry",
            "customer_type",
            "car",
        ]
        nominal_col_names = ["nominal__" + x for x in nominal_cs]
        # Codes are assigned in order of first appearance across train + eval,
        # so both tables share one code per category.
        master_nominals = pd.concat(
            [data[nominal_cs], eval_data[nominal_cs]], ignore_index=True, axis=0
        )
        nominal_encoder = {
            c: {x: idx for idx, x in enumerate(master_nominals[c].unique())}
            for c in nominal_cs
        }
        data[nominal_col_names] = 0
        eval_data[nominal_col_names] = 0
        data["interval__season"] = 0
        eval_data["interval__season"] = 0
        season_code = {"Spring": 1, "Summer": 2, "Winter": 3}

        def assign_numeric_seasons(row):
            row["interval__season"] = season_code[row["climate"]]
            return row

        data = data.apply(assign_numeric_seasons, axis=1)
        eval_data = eval_data.apply(assign_numeric_seasons, axis=1)

        def codify_nominal_columns(row):
            for nom_col, col_nom_name in zip(nominal_cs, nominal_col_names):
                row[col_nom_name] = nominal_encoder[nom_col][row[nom_col]]
            return row

        data = data.apply(codify_nominal_columns, axis=1)
        eval_data = eval_data.apply(codify_nominal_columns, axis=1)

        # Drop the raw columns that now have an encoded counterpart, plus
        # "travelled_more_than_5mins_for_offer".
        cols_to_drop = [
            "restaur_spend_less_than20",
            "no_take_aways",
            "restaur_spend_greater_than20",
            "no_visited_bars",
            "no_visited_cold_drinks",
            "restaurant_type",
            "marital_status",
            "climate",
            "qualification",
            "drop_location",
            "job_industry",
            "customer_type",
            "car",
            "income_range",
            "travelled_more_than_5mins_for_offer",
            "age",
        ]
        data = data.drop(cols_to_drop, axis=1)
        eval_data = eval_data.drop(cols_to_drop, axis=1)

        data_original_col_order = [
            "offer_expiration",
            "ordinal__income_range",
            "ordinal__no_visited_cold_drinks",
            "travelled_more_than_15mins_for_offer",
            "ordinal__restaur_spend_less_than20",
            "nominal__marital_status",
            "nominal__restaurant_type",
            "ordinal__age",
            "prefer_western_over_chinese",
            "travelled_more_than_25mins_for_offer",
            "ordinal__no_visited_bars",
            "gender",
            "nominal__car",
            "restuarant_same_direction_house",
            "cooks_regularly",
            "nominal__customer_type",
            "ordinal__qualif",
            "is_foodie",
            "ordinal__no_take_aways",
            "nominal__job_industry",
            "restuarant_opposite_direction_house",
            "has_children",
            "visit_restaurant_with_rating_(avg)",
            "temperature",
            "ordinal__restaur_spend_greater_than20",
            "travel_time",
            "interval__season",
            "nominal__drop_location",
            "prefer_home_food",
            "target",
        ]
        eval_data_original_col_order = [
            c for c in data_original_col_order if c != "target"
        ]

        data = data.loc[:, data_original_col_order]
        eval_data = eval_data.loc[:, eval_data_original_col_order]
        binary_features = [
            "offer_expiration",
            "travelled_more_than_15mins_for_offer",
            "prefer_western_over_chinese",
            "travelled_more_than_25mins_for_offer",
            "restuarant_same_direction_house",
            "cooks_regularly",
            "is_foodie",
            "restuarant_opposite_direction_house",
            "has_children",
            "prefer_home_food",
            "gender",
        ]
        b_f_rn = {x: "binary__" + x for x in binary_features}
        data = data.rename(columns=b_f_rn)
        eval_data = eval_data.rename(columns=b_f_rn)
        remaining_ordinals = {
            "visit_restaurant_with_rating_(avg)": "ordinal__type_of_rest_rating",
            "temperature": "interval__temperature",
            "travel_time": "interval__travel_time",
        }
        data = data.rename(remaining_ordinals, axis=1)
        eval_data = eval_data.rename(remaining_ordinals, axis=1)

        target_codes = {"Yes": 1, "No": 0}

        def binarize_target(row):
            row["target"] = target_codes[row["target"]]
            return row

        data = data.apply(binarize_target, axis=1)

        # Fill the remaining gaps with a random-forest based iterative imputer,
        # fitted on the training features only and then applied to both tables.
        X = data[eval_data.columns]
        imputer = IterativeImputer(
            estimator=RandomForestClassifier(
                class_weight="balanced", random_state=42, n_jobs=-1
            ),
            sample_posterior=False,
            initial_strategy="most_frequent",
            random_state=42,
        )
        with parallel_backend("threading", n_jobs=24):
            data_enc = imputer.fit_transform(X)
            eval_data_enc = imputer.transform(eval_data)

        data.loc[:, eval_data.columns] = data_enc
        eval_data.loc[:, :] = eval_data_enc
        data = data.astype(np.uint32)
        eval_data = eval_data.astype(np.uint32)
        # Persist the frames built here. The module-level save_data() writes
        # the module globals instead, which are not these processed tables.
        data.to_parquet(DATA_OUTPUT + "data.parquet")
        eval_data.to_parquet(DATA_OUTPUT + "eval_data.parquet")
    else:
        data = pd.read_parquet(DATA_OUTPUT + "data.parquet")
        eval_data = pd.read_parquet(DATA_OUTPUT + "eval_data.parquet")

    # Degree of closeness rather than a nominal drop-location variable.
    # NOTE(review): the keys are the nominal codes assigned above, which depend
    # on the order in which drop locations first appear in the data - confirm
    # that 0/1/2 still correspond to the intended locations.
    closeness_ranks = {0: 3, 1: 2, 2: 1}
    if "nominal__drop_location" in data.columns:

        def rank_closeness(row):
            """Replace the nominal drop-location code with its closeness rank."""
            row["nominal__drop_location"] = closeness_ranks[row["nominal__drop_location"]]
            return row

        data = data.apply(rank_closeness, axis=1)
        eval_data = eval_data.apply(rank_closeness, axis=1)
        dest_rename = {"nominal__drop_location": "ordinal__dest_distance"}
        data = data.rename(columns=dest_rename)
        eval_data = eval_data.rename(columns=dest_rename)

    def bits_to_int(row, cols):
        """Read the 0/1 flags in ``cols`` as one binary number (first = MSB)."""
        return int("".join(str(row[c]) for c in cols), 2)

    # Food-preference profile: four binary flags packed into one code.
    prefs = [
        "binary__prefer_home_food",
        "binary__is_foodie",
        "binary__prefer_western_over_chinese",
        "binary__cooks_regularly",
    ]
    data["pref_profile"] = 0
    eval_data["pref_profile"] = 0

    def pref_profile(row):
        row["pref_profile"] = bits_to_int(row, prefs)
        return row

    data = data.apply(pref_profile, axis=1)
    eval_data = eval_data.apply(pref_profile, axis=1)

    # Business type: restaurant type combined with the offer-expiration flag.
    biz_expr = "biz_type = (nominal__restaurant_type * 2) + (binary__offer_expiration)"
    data.eval(biz_expr, inplace=True)
    eval_data.eval(biz_expr, inplace=True)
    derived_rename = {
        "biz_type": "nominal__biz_type",
        "pref_profile": "nominal__pref_profile",
    }
    data = data.rename(columns=derived_rename)
    eval_data = eval_data.rename(columns=derived_rename)

    # Spending distribution: one id per (spend > 20, spend < 20) level pair,
    # i.e. "11" -> 0, "12" -> 1, ..., "55" -> 24.
    data["nominal__spend_id"] = 0
    eval_data["nominal__spend_id"] = 0
    id_code = {
        f"{gt20}{lt20}": (gt20 - 1) * 5 + (lt20 - 1)
        for gt20 in range(1, 6)
        for lt20 in range(1, 6)
    }
    spend_cols = [
        "ordinal__restaur_spend_greater_than20",
        "ordinal__restaur_spend_less_than20",
    ]

    def derive_spend_id(row):
        row["nominal__spend_id"] = id_code["".join(str(row[c]) for c in spend_cols)]
        return row

    data = data.apply(derive_spend_id, axis=1)
    eval_data = eval_data.apply(derive_spend_id, axis=1)

    # Direction of the restaurant relative to home, as a two-bit code.
    data["nominal__direction"] = 0
    eval_data["nominal__direction"] = 0
    direction_f = [
        "binary__restuarant_opposite_direction_house",
        "binary__restuarant_same_direction_house",
    ]

    def derive_directional_code(row):
        row["nominal__direction"] = bits_to_int(row, direction_f)
        return row

    data = data.apply(derive_directional_code, axis=1)
    eval_data = eval_data.apply(derive_directional_code, axis=1)

    # Extra travel effort (15 / 25 minute flags), as a two-bit code.
    extra_effort = [
        "binary__travelled_more_than_15mins_for_offer",
        "binary__travelled_more_than_25mins_for_offer",
    ]
    data["nominal__extra_travel"] = 0
    eval_data["nominal__extra_travel"] = 0

    def summarize_extra_effort(row):
        row["nominal__extra_travel"] = bits_to_int(row, extra_effort)
        return row

    data = data.apply(summarize_extra_effort, axis=1)
    eval_data = eval_data.apply(summarize_extra_effort, axis=1)

    # NOTE(review): numba is switched off from here on; the reason is not
    # visible in this file - presumably for the groupby below, confirm.
    pd.options.compute.use_numba = False

    def add_cust_type_to_trunc_features(df_, eval_df):
        """Add ``nominal__circumstance`` to both frames.

        The code concatenates customer type, extra travel and direction.
        Combinations whose acceptance rate in ``df_`` is shared with another
        combination are merged into one label ("500", "501", ...), and all
        labels are finally replaced by their rank in the sorted union of
        labels seen in either frame.
        """
        df = df_.copy()
        keys = ["nominal__customer_type", "nominal__extra_travel", "nominal__direction"]
        grouped_target = df.groupby(keys)["target"]
        acceptance_rate = (
            grouped_target.sum() / grouped_target.count()
        ).sort_values(ascending=False)

        # Only key values 0-3 are considered; combinations absent from the
        # training frame are skipped. Iteration order fixes the merged labels.
        category_cde = {}
        for i, j, k in product(range(4), repeat=3):
            try:
                category_cde[str(i) + str(j) + str(k)] = acceptance_rate.loc[i, j, k]
            except KeyError:
                continue

        freq = Counter(category_cde.values())
        shared_rates = [rate for rate in freq if freq[rate] > 1]

        fg = 500
        spl_cdes = {}
        for rate in shared_rates:
            for label in category_cde:
                if category_cde[label] == rate:
                    spl_cdes[label] = str(fg)
            fg += 1

        df["nominal__circumstance"] = 0
        eval_df["nominal__circumstance"] = 0

        def add_ctype_to_gen_f(row):
            icde = "".join(str(row[c]) for c in keys)
            row["nominal__circumstance"] = spl_cdes.get(icde, icde)
            return row

        df = df.apply(add_ctype_to_gen_f, axis=1)
        eval_df = eval_df.apply(add_ctype_to_gen_f, axis=1)
        mapped_x = list(
            np.union1d(
                df.nominal__circumstance.unique(), eval_df.nominal__circumstance.unique()
            )
        )
        mapped_x_cde = {x: i for i, x in enumerate(mapped_x)}

        def change_str_to_ord_int(row):
            row["nominal__circumstance"] = mapped_x_cde[row["nominal__circumstance"]]
            return row

        df = df.apply(change_str_to_ord_int, axis=1)
        eval_df = eval_df.apply(change_str_to_ord_int, axis=1)
        return df, eval_df

    data, eval_data = add_cust_type_to_trunc_features(data, eval_data)
    return data, eval_data

In [3]:
def elim_redund(df, eval_df):
    """Drop the source columns that the derived features already summarise.

    Neither input frame is modified.

    Returns
    -------
    tuple
        ``(redundant, nr_data, nr_eval_data)``: the list of dropped column
        names, followed by copies of ``df`` and ``eval_df`` without them.
    """
    redundant = [
        # business type inputs
        "nominal__restaurant_type",
        "binary__offer_expiration",
        # preference profile inputs
        "binary__prefer_home_food",
        "binary__is_foodie",
        "binary__prefer_western_over_chinese",
        "binary__cooks_regularly",
        # direction inputs
        "binary__restuarant_opposite_direction_house",
        "binary__restuarant_same_direction_house",
        # spend id inputs
        "ordinal__restaur_spend_greater_than20",
        "ordinal__restaur_spend_less_than20",
        # extra travel inputs
        "binary__travelled_more_than_15mins_for_offer",
        "binary__travelled_more_than_25mins_for_offer",
        "nominal__customer_type",
    ]
    trimmed = [frame.drop(columns=redundant) for frame in (df, eval_df)]
    return redundant, trimmed[0], trimmed[1]
