In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import sympy

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition CSV files.
DATA_DIR = '../input/tabular-playground-series-may-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Names of the float64 columns of the training frame.
# The bare `np.float` alias (a synonym for the builtin float, i.e. float64)
# was deprecated in NumPy 1.20 and removed in 1.24, so the explicit
# `np.float64` is used to select the same columns on any NumPy version.
float_col = [c for c in train.columns if train[c].dtype == np.float64]

In [None]:
# Base FE
def split_txt(df):
    """Return a copy of ``df`` with the letters of ``f_27`` spread over new columns.

    The string column ``f_27`` is split into single characters and the first
    ten of them are appended as columns ``f_27_0`` ... ``f_27_9``.  The input
    frame itself is left untouched.
    """
    frame = df.copy()
    # Column 0 of the split result is skipped: splitting on the empty pattern
    # puts an empty field in front of the first character.
    letters = frame['f_27'].str.split('', expand=True).iloc[:, 1:11]
    letters.columns = [f'f_27_{pos}' for pos in range(10)]
    return pd.concat([frame, letters], axis=1)

# Reference : features from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
def n_unique(row):
    """Return how many distinct characters the row's ``f_27`` string contains."""
    distinct_chars = {ch for ch in row["f_27"]}
    return len(distinct_chars)

# train
train = split_txt(train)
# Count the distinct letters straight from the string column.  A row-wise
# DataFrame.apply(axis=1) builds a Series object for every row and is much
# slower while producing exactly the same integer column.
train["unique_characters"] = train["f_27"].map(lambda s: len(set(s)))
cat_cols = [f'f_27_{i}' for i in range(10)]
train_x = train.drop(['id', 'target', 'f_27'], axis=1)
train_y = train.target

# test
test = split_txt(test)
test['unique_characters'] = test["f_27"].map(lambda s: len(set(s)))
test = test.drop(['id', 'f_27'], axis=1)
# Remap two letter values to 'B' before encoding.
# NOTE(review): presumably these letters never occur at these positions in
# train, which would make LabelEncoder.transform fail -- confirm.
test.loc[test.f_27_1 == 'O', 'f_27_1'] = 'B'
test.loc[test.f_27_4 == 'N', 'f_27_4'] = 'B'

# label encoding: fit each encoder on train only, then reuse it for test
for c in cat_cols:
    le = LabelEncoder()
    train_x[c] = le.fit_transform(train_x[c])
    test[c] = le.transform(test[c])

# The scaler output is re-labelled with each frame's own column names, so
# train and test must share the same columns in the same order.
assert list(train_x.columns) == list(test.columns), \
    "train/test feature columns differ"

# standard scaler: statistics come from train only
scaler = StandardScaler().fit(train_x)
train_x = pd.DataFrame(scaler.transform(train_x), columns=train_x.columns)
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

# Generate new features from interactions
Interactions between features can be found here:  
https://www.kaggle.com/code/kotrying/tps22-05?scriptVersionId=95761213

In [None]:
# scatter plot
def plot_scatter(df, fx, fy, c='blue'):
    """Draw a 5x5 inch scatter plot of column ``fy`` against column ``fx``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    fx, fy : names of the columns used for the x and y coordinates.
    c : colour argument forwarded to ``plt.scatter`` (default ``'blue'``).
    """
    plt.figure(figsize=(5, 5))

    # Same dashed major grid on both axes.
    grid_style = dict(color="black", alpha=0.8, linestyle="--", linewidth=0.5)
    for axis_name in ("x", "y"):
        plt.grid(which="major", axis=axis_name, **grid_style)

    plt.scatter(df[fx], df[fy], c=c, s=1)
    
# for detail
def plot_detail(df, fx, fy, x1, x2, y1, y2):
    """Scatter-plot the points inside a rectangular window, coloured by target.

    Parameters
    ----------
    df : DataFrame holding the columns ``fx``, ``fy`` and ``target``.
    fx, fy : names of the columns used for the x and y coordinates.
    x1, x2 : half-open x range ``[x1, x2)`` of the window.
    y1, y2 : half-open y range ``[y1, y2)`` of the window.
    """
    in_x_range = (df[fx] >= x1) & (df[fx] < x2)
    # The y coordinate is bounded by y1/y2 (not x1/x2) so that non-square
    # windows are filtered correctly.
    in_y_range = (df[fy] >= y1) & (df[fy] < y2)
    data = df[in_x_range & in_y_range]

    plt.figure(figsize=(5, 5))
    plt.scatter(data[fx], data[fy], c=data.target, s=1)

# calculate coordinates
def clc_coordinate(x1, x2, y1, y2):
    """Return slope and intercept of the line through (x1, y1) and (x2, y2).

    Note the argument order: both x values come first, then both y values.

    Returns
    -------
    (a, b) : tuple of float such that ``y = a*x + b`` passes through both
        points.

    Raises
    ------
    ValueError
        If ``x1 == x2``; a vertical line cannot be written as ``y = a*x + b``.
    """
    if x1 == x2:
        raise ValueError(
            f"x1 and x2 must differ to define y = a*x + b (got x1 = x2 = {x1})"
        )
    # Closed form of the 2x2 linear system {a*x1 + b = y1, a*x2 + b = y2}.
    # Plain floats are returned so that downstream arithmetic and
    # comparisons stay in fast native floating point.
    a = (y2 - y1) / (x2 - x1)
    b = y1 - a * x1
    return float(a), float(b)

# Linear class
class Linear:
    """Straight line ``y = a*x + b`` with slope ``a`` and intercept ``b``."""

    def __init__(self, a, b):
        self.a, self.b = a, b

    def clc(self, x):
        """Return the value of the line at ``x``."""
        slope, intercept = self.a, self.b
        return slope * x + intercept

# split data with lines
def classify(x, y, y1, y2):
    """Label the point (x, y) by its position relative to two lines.

    ``y1`` and ``y2`` are objects exposing ``clc(x)``.  The result is
    1 when the point lies strictly above ``y1``, otherwise 0 when it lies
    strictly above ``y2``, otherwise -1.
    """
    if y > y1.clc(x):
        return 1
    if y > y2.clc(x):
        return 0
    return -1
    
# make linear objects and split data   
def mk_class(df, fx, fy, a1, b1, a2, b2):
    """Label every row of ``df`` by its position relative to two lines.

    The lines are ``y = a1*x + b1`` and ``y = a2*x + b2`` with ``x = df[fx]``
    and ``y = df[fy]``.  A row gets 1 when it lies strictly above the first
    line, otherwise 0 when it lies strictly above the second line, otherwise
    -1 (rows with missing coordinates therefore get -1).

    Returns
    -------
    Series of int64 labels sharing the index of ``df``.
    """
    # float() also accepts symbolic numbers, which keeps the arithmetic
    # below in plain float64.
    a1, b1, a2, b2 = (float(v) for v in (a1, b1, a2, b2))
    x = df[fx].to_numpy(dtype=float)
    y = df[fy].to_numpy(dtype=float)

    # Evaluate both lines for all rows at once instead of calling a Python
    # function per row.  np.select takes the first matching condition, which
    # mirrors an if / elif / else chain.
    above_first = y > a1 * x + b1
    above_second = y > a2 * x + b2
    labels = np.select([above_first, above_second], [1, 0], default=-1)
    return pd.Series(labels, index=df.index, dtype='int64')

# find the optimal lines
def plot_linear(df, fx, fy, lin1, lin2, plot_mode=True):
    """Compute the coefficients of two lines and optionally draw them.

    ``lin1`` and ``lin2`` are sequences ``(x1, x2, y1, y2)`` whose first four
    items are handed to ``clc_coordinate``.  With ``plot_mode`` equal to True
    the data are scatter-plotted coloured by ``df.target`` and both lines are
    drawn in red between x = 5 and x = -5.

    Returns
    -------
    (a1, b1, a2, b2) : slope and intercept of the first and second line.
    """
    coeffs = []
    for pts in (lin1, lin2):
        coeffs.extend(clc_coordinate(pts[0], pts[1], pts[2], pts[3]))
    a1, b1, a2, b2 = coeffs

    if plot_mode == True:
        plot_scatter(df, fx, fy, c=df.target)
        x_ends = [5, -5]
        for slope, intercept in ((a1, b1), (a2, b2)):
            y_ends = [5*slope + intercept, -5*slope + intercept]
            plt.plot(x_ends, y_ends, color='red')
    return a1, b1, a2, b2

# find the optimal lines and make a new feature
def plot_mk(df, fx, fy, lin1, lin2, mk_mode=False, plot_mode=True):
    """Draw the two candidate lines and optionally build the class feature.

    The lines are always computed through ``plot_linear`` (which plots them
    when ``plot_mode`` is True).  Only when ``mk_mode`` is True are the rows
    of ``df`` labelled with ``mk_class``; in that case the labels are
    returned and, if ``plot_mode`` is True, shown in a second scatter plot
    coloured by label.  With ``mk_mode`` False the function returns None.
    """
    a1, b1, a2, b2 = plot_linear(df, fx, fy, lin1, lin2, plot_mode=plot_mode)

    if mk_mode == True:
        print('make new feature...')
        labels = mk_class(df, fx, fy, a1, b1, a2, b2)
        if plot_mode == True:
            plot_scatter(df, fx, fy, labels)
        return labels

    return None

In [None]:
# Scaled training features together with the target, used by the plots below.
sample_df = train_x.assign(target=train_y)
sample_df.head(3)

**Example**  
interaction between f_02 and f_21

**Scatter plot**

In [None]:
%%time
fx = 'f_02'
fy = 'f_21'

plot_scatter(sample_df, fx, fy, c=sample_df.target)

**plot detail**

In [None]:
%%time
x1 = -3
x2 = 0
y1 = -3
y2 = 0

plot_detail(sample_df, fx, fy, x1, x2, y1, y2)

In [None]:
%%time
x1 = 0
x2 = 3
y1 = 0
y2 = 3

plot_detail(sample_df, fx, fy, x1, x2, y1, y2)

**find the optimal lines**

In [None]:
%%time
# we need tune the params
lin1 = (5, -5, 0.15, 4.15)
lin2 = (5, -5, -3.97, -0.13)

f_02_21 = plot_mk(sample_df, fx, fy, lin1, lin2, mk_mode=False)

**make a new feature**

In [None]:
%%time
f_02_21 = plot_mk(sample_df, fx, fy, lin1, lin2, mk_mode=True, plot_mode=True)

In [None]:
# Number of rows per value of the new feature.
f_02_21.value_counts()

**All you have to do**

In [None]:
%%time
fx = 'f_02'
fy = 'f_21'
lin1 = (5, -5, 0.15, 4.15)
lin2 = (5, -5, -3.97, -0.13)

f_02_21 = plot_mk(sample_df, fx, fy, lin1, lin2, mk_mode=True, plot_mode=False)
tf_02_21 = plot_mk(test, fx, fy, lin1, lin2, mk_mode=True, plot_mode=False)