In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier
from itertools import combinations
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import logging
import os

# Notebook display options: never truncate columns, show up to 200 rows.
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': 200,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Plot defaults: (15, 7) figure size, then the ggplot style sheet.
plt.rcParams.update({"figure.figsize": (15, 7)})
plt.style.use("ggplot")

In [None]:
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
def get_logger(name="kaggle_logger", level="INFO"):
    """
    Return a console logger, attaching its console handler only once.

    params:
        name: name of the logger to be created (or fetched, if it already exists)
        level: logging level applied to the logger and to its console handler
    return:
        logger with the specified name with format as: time  name  level: message
    """
    logger_ = logging.getLogger(name)
    logger_.setLevel(level)

    # logging.getLogger hands back the same object for the same name, so calling
    # this function again (e.g. re-running the cell) must reuse the existing
    # console handler; adding a second one would print every message twice.
    console_handler = next(
        (handler for handler in logger_.handlers if type(handler) is logging.StreamHandler),
        None,
    )
    if console_handler is None:
        console_handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s  %(name)s  %(levelname)s: %(message)s')
        console_handler.setFormatter(formatter)
        logger_.addHandler(console_handler)

    # Keep the handler at the requested level; a handler fixed at INFO would
    # silently drop records when the caller asks for DEBUG.
    console_handler.setLevel(level)
    return logger_

In [None]:
# Competition files (Tabular Playground Series, May 2022) under the Kaggle input directory.
DATA_DIR = "/kaggle/input/tabular-playground-series-may-2022"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Hold-out split: 70% of the rows go to training; the remaining 30% is divided
# 70/30 again into validation (21% of all rows) and test (9% of all rows).
SPLIT_SEED = 42

x_train, x_rem, y_train, y_rem = train_test_split(
    train_df.drop(["id", "target"], axis=1),
    train_df.target,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
# The second split is seeded as well; without a random_state the validation/test
# membership would change on every run of the notebook.
x_val, x_test, y_val, y_test = train_test_split(
    x_rem,
    y_rem,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# train_test_split keeps the original row labels; renumber every split from 0.
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
x_val = x_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [None]:
# Notebook-wide logger used by feature_extraction to report progress.
logger = get_logger(name="kaggle_logger", level="INFO")

In [None]:
# Real-valued input columns.
base_continous_features = [
    "f_00", "f_01", "f_02", "f_03", "f_04", "f_05", "f_06",
    "f_19", "f_20", "f_21", "f_22", "f_23", "f_24", "f_25", "f_26",
    "f_28",
]

# Input columns handled as categorical (this list includes "f_27").
base_categorical_features = [
    "f_09", "f_13", "f_30", "f_15", "f_27",
    "f_07", "f_16", "f_18", "f_14", "f_29",
    "f_10", "f_12", "f_17", "f_11", "f_08",
]

In [None]:
def feature_extraction(data, continous_features, categorical_features, string_feature_name, scaler=None):
    """
    Derive the engineered feature frame from the raw columns.

    Three things happen, in this order:
      1. the string column is expanded into per-character counts, one new
         column per distinct character found in ``data``;
      2. the continuous columns are replaced by their scaled values;
      3. for every pair of continuous columns a 0/1 column named
         ``"<first>_<second>"`` is added, set to 1 when the product of the two
         scaled values is strictly positive.

    params:
        data: DataFrame containing the continuous, categorical and string columns
        continous_features: names of the real-valued columns
        categorical_features: names of the categorical columns
        string_feature_name: name of the column whose characters are counted
        scaler: fitted object exposing ``transform``; when None a RobustScaler
            is fitted on the continuous columns of ``data``
    return:
        tuple ``(data, continous_features, categorical_features, scaler)``:
        the new DataFrame, the continuous column names, the categorical column
        names extended with the character-count and sign columns, and the
        scaler that was applied
    raises:
        ValueError: when a sign column name already exists in ``data``

    The caller's DataFrame and lists are left untouched. The character-count
    columns depend on which characters occur in ``data``, so frames whose
    string values use different characters end up with different columns.
    """
    # Private copies: the original version edited the caller's frame and
    # appended to the caller's categorical list.
    data = data.copy()
    continous_features = list(continous_features)
    categorical_features = list(categorical_features)

    logger.info("Adding String Features")
    # Count of each character in the string feature. The frame is built on
    # data.index so the column assignment lines up row by row even when the
    # caller did not reset the index to 0..n-1.
    string_features_df = pd.DataFrame(
        [dict(Counter(text)) for text in data[string_feature_name]],
        index=data.index,
    ).fillna(0)
    data[string_features_df.columns] = string_features_df
    categorical_features.extend(string_features_df.columns)

    logger.info("Scaling Real Value Features")
    # Scaling the features for interactions; a scaler supplied by the caller is
    # applied as is, without refitting.
    if scaler is None:
        scaler = RobustScaler()
        scaler.fit(data[continous_features])
    data[continous_features] = pd.DataFrame(
        scaler.transform(data[continous_features]),
        columns=continous_features,
        index=data.index,
    )

    logger.info("Adding multiplication sign feature")
    sign_columns = []
    for first, second in tqdm(list(combinations(continous_features, 2))):
        column_name_to_add = f"{first}_{second}"
        if column_name_to_add in data.columns:
            # Same failure DataFrame.insert reports for a repeated column name.
            raise ValueError(f"cannot insert {column_name_to_add}, already exists")
        sign_flag = (data[first] * data[second] > 0).astype(int)
        sign_columns.append(sign_flag.rename(column_name_to_add))
        categorical_features.append(column_name_to_add)
    # Attach every flag with a single concat; inserting the columns one at a
    # time fragments the frame.
    if sign_columns:
        data = pd.concat([data, *sign_columns], axis=1)

    # Categorical columns become integers, except the raw string column.
    type_changing_for_categorical_features = {i: int for i in categorical_features}
    type_changing_for_categorical_features[string_feature_name] = str

    data = data.astype(type_changing_for_categorical_features)
    return data, continous_features, categorical_features, scaler

In [None]:
# Training split: no scaler is passed, so one is fitted here and kept as
# train_scaler for the validation and test splits.
train_extracted_features, continous_features, categorical_features, train_scaler = feature_extraction(
    data=x_train.copy(),
    continous_features=base_continous_features.copy(),
    categorical_features=base_categorical_features.copy(),
    string_feature_name="f_27",
)

In [None]:
# Validation split, transformed with the scaler fitted on the training split.
val_extracted_features, _, _, _ = feature_extraction(
    data=x_val.copy(),
    continous_features=base_continous_features.copy(),
    categorical_features=base_categorical_features.copy(),
    string_feature_name="f_27",
    scaler=train_scaler,
)

In [None]:
# Held-out test split, transformed with the scaler fitted on the training split.
test_extracted_features, _, _, _ = feature_extraction(
    data=x_test.copy(),
    continous_features=base_continous_features.copy(),
    categorical_features=base_categorical_features.copy(),
    string_feature_name="f_27",
    scaler=train_scaler,
)

In [None]:
# Complete labelled set (id and target removed); a new scaler is fitted on it
# and kept as all_train_scaler.
all_train_extracted_features, _, _, all_train_scaler = feature_extraction(
    data=train_df.drop(columns=["id", "target"]).copy(),
    continous_features=base_continous_features.copy(),
    categorical_features=base_categorical_features.copy(),
    string_feature_name="f_27",
)

In [None]:
# Competition test set (id removed), transformed with the scaler fitted on the
# complete labelled set.
test_prediction_extracted_features, _, _, _ = feature_extraction(
    data=test_df.drop(columns=["id"]).copy(),
    continous_features=base_continous_features.copy(),
    categorical_features=base_categorical_features.copy(),
    string_feature_name="f_27",
    scaler=all_train_scaler,
)

In [None]:
# Columns of the training features that are missing from each of the other
# frames; four empty sets mean every frame has all training columns.
train_columns = set(train_extracted_features.columns)
tuple(
    train_columns - set(other_frame.columns)
    for other_frame in (
        val_extracted_features,
        test_extracted_features,
        all_train_extracted_features,
        test_prediction_extracted_features,
    )
)

In [None]:
# Columns passed to the model, in the order they are fed to it.
all_features = [
    "f_13", "f_00_f_01", "f_01_f_28", "f_04_f_28", "P",
    "Q", "f_03_f_28", "f_19_f_22", "H", "f_18",
    "f_12", "f_16", "R", "f_17", "S",
    "f_22_f_24", "f_09", "f_15", "f_10", "f_08",
    "f_02_f_28", "G", "T", "f_11", "f_14",
    "f_06", "f_04", "f_07", "F", "f_03",
    "E", "D", "C", "f_29", "B",
    "A", "f_05", "f_01", "f_20", "f_00",
    "f_02", "f_23", "f_28", "f_19", "f_24",
    "f_25", "f_22", "f_30", "f_21", "f_26",
]

# Subset of the columns above that is declared to the model as categorical.
classification_features = [
    "G", "f_09", "f_12", "f_13", "f_30",
    "R", "f_18", "H", "f_15", "f_19_f_22",
    "F", "f_10", "f_08", "f_03_f_28", "T",
    "f_01_f_28", "Q", "f_11", "D", "f_02_f_28",
    "E", "f_14", "P", "f_22_f_24", "f_07",
    "A", "f_16", "B", "S", "f_17",
    "C", "f_29", "f_04_f_28", "f_00_f_01",
]

In [None]:
# Hyper-parameters of the final classifier; task_type="GPU" requests GPU training.
catboost_params = dict(
    cat_features=classification_features,
    n_estimators=5079,
    learning_rate=0.1179458978321592,
    depth=7,
    l2_leaf_reg=4,
    task_type="GPU",
    verbose=1000,
)
model = CatBoostClassifier(**catboost_params)

# Train on the features extracted from the complete labelled set.
model.fit(all_train_extracted_features[all_features], train_df["target"])

In [None]:
# Keep column 1 of predict_proba (the second class) for every competition test row.
class_probabilities = model.predict_proba(test_prediction_extracted_features[all_features])
prediction = class_probabilities[:, 1]

In [None]:
# Build the submission frame in one step. Adding a column to the
# test_df[["id"]] selection afterwards triggers pandas' SettingWithCopyWarning;
# assign() returns a new frame with both columns instead.
prediction_df = test_df[["id"]].assign(target=prediction)

In [None]:
# Write the submission file into the working directory, without the index column.
SUBMISSION_PATH = "submission.csv"
prediction_df.to_csv(SUBMISSION_PATH, index=False)