In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#charts
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning so library chatter does not clutter cell output.
# NOTE(review): this also hides deprecation warnings raised by plotting calls.
warnings.filterwarnings(action='ignore')

# Global plotting defaults: theme first, then the default figure size.
plt.style.use("fivethirtyeight")
plt.rcParams.update({"figure.figsize": (15, 7)})

In [None]:
# Thanks to : https://www.kaggle.com/aantonova/some-new-risk-and-clusters-features
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place and is also returned, so the call can be
    wrapped directly around ``pd.read_csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Only columns whose dtype is one of int16, int32,
        int64, float16, float32 or float64 are examined; all others are left
        untouched.
    verbose : bool, default True
        Print the final memory footprint (in MiB) and the relative reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so pass
        ``False`` to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. an empty column whose min/max are NaN): leave as is.
        else:
            # float64 is the fallback; it also catches all-NaN and infinite
            # columns, for which every range comparison below is False.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read each competition CSV, then downcast its numeric columns right away so the
# full-precision frame is not kept around.
train = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv")
train = reduce_mem_usage(train)

test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv")
test = reduce_mem_usage(test)

sub = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv")
sub = reduce_mem_usage(sub)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Count of missing values in each training column.
train.isna().sum()

In [None]:
# Column dtypes of the training frame (numeric columns were downcast at load time).
train.dtypes

# **EXPLORATORY DATA ANALYSIS**

In [None]:
# Share of each target class, as a fraction of all training rows.
p_target = train['target'].value_counts() / len(train)
# The y axis is labelled '%', so plot percentages rather than 0-1 fractions.
(p_target * 100).plot(kind='bar');
plt.xlabel('TARGET');
plt.ylabel('%');

**THE TARGET CLASSES ARE BALANCED**

In [None]:
# First feature group: f_07 .. f_09 (zero-padded names) and f_10 .. f_18,
# with 'id' appended so it is excluded from the second group below.
feat_12 = ["f_0" + str(n) for n in range(7, 10)]
feat_13 = ["f_" + str(n) for n in range(10, 19)]
feat_1 = [*feat_12, *feat_13, 'id']

# Second feature group: every remaining column except the label.
# list.remove raises ValueError if an expected column is absent from `train`.
feat_2 = train.columns.tolist()
for name in [*feat_1, 'target']:
    feat_2.remove(name)

In [None]:
# Rebuild the list without 'id' rather than calling .remove(), so re-running
# this cell does not raise ValueError once 'id' is already gone.
feat_1 = [name for name in feat_1 if name != 'id']

fig = plt.figure(figsize=(18, 15))
for i, feat in enumerate(feat_1):
    # A KDE only makes sense for numeric columns; skip anything else.
    if pd.api.types.is_numeric_dtype(train[feat]):
        ax = fig.add_subplot(4, 3, i + 1)
        # `fill=True` is the supported spelling of the deprecated `shade=True`.
        sns.kdeplot(x=train[feat], hue=train['target'], alpha=0.5, multiple="stack", fill=True, ax=ax)
        ax.set_xlabel(feat)

# Title and layout are figure-level, so set them once after all panels exist.
fig.suptitle('KDE of feat_1 features by target')
fig.tight_layout();

In [None]:
 plt.figure(figsize=(18,15));
for i, feat in enumerate(train[feat_2]):
    if train[feat].dtypes != 'object':
        plt.subplot( 7, 3, i + 1 );
        sns.kdeplot(x=train[feat], hue=train['target'],  alpha=0.5,  multiple="stack", shade=True)
        plt.xlabel(feat);
        plt.tight_layout();

In [None]:
# Columns for the correlation matrix: both feature groups minus the string
# column f_27, plus the label.
feat_a = [*feat_1, *feat_2]
feat_a.remove('f_27')
feat_a += ['target']

# Pairwise correlations drawn as an annotated heatmap on a wide figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(30, 13))
corr_matrix = train[feat_a].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 12},
    cbar_kws={"shrink": .2},
    vmin=-0.2,
    vmax=1,
    ax=heatmap_ax,
)
plt.show();

# **FEATURE ENGINEERING**

In [None]:
def count_letters(df: pd.DataFrame, drop_empty: bool = True):
    """Return a copy of ``df`` with per-letter occurrence counts of ``f_27``.

    The returned frame has ``f_27`` upper-cased, a ``length`` column holding the
    string length, and one column per letter ``A``-``Z`` holding how many times
    that letter occurs in each row's ``f_27``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``f_27``.
    drop_empty : bool, default True
        Omit letter columns whose total count is zero. Because this depends on
        the data, two frames (e.g. train and test) can end up with different
        letter columns; pass ``False`` to always emit all 26.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra columns appended in the order
        ``length``, then letters alphabetically.
    """
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    upper = df['f_27'].str.upper()

    counts = {}
    for char in letters:
        occurrences = upper.str.count(char)
        # Skip letters that never occur anywhere in this frame.
        if drop_empty and occurrences.sum() == 0:
            continue
        counts[char] = occurrences

    # A single assign adds every column at once and leaves the caller's frame
    # untouched.
    return df.assign(f_27=upper, length=upper.str.len(), **counts)


def drop_cols(df: pd.DataFrame):
    """Remove the ``f_27`` and ``length`` columns from ``df`` in place.

    The frame passed in is mutated and the same object is returned, so the
    function can be used as a ``DataFrame.pipe`` step. Raises ``KeyError`` if
    either column is absent.
    """
    df.drop(columns=['f_27', 'length'], inplace=True)
    return df

In [None]:
# Add the per-letter count features, then drop the raw string column and its
# length. Both frames go through the same two steps.
# NOTE(review): this cell is not re-runnable as is; once f_27 has been dropped,
# count_letters fails on the second run.
train = drop_cols(count_letters(train))
test = drop_cols(count_letters(test))

In [None]:
# (rows, columns) of both frames after the feature-engineering step.
test.shape, train.shape

In [None]:
# Preview the training frame, now including the per-letter count columns.
train.head()

In [None]:
# Use a bare figure: plt.subplots() would also create a full-size Axes that
# stays behind the 5x4 grid added below.
fig = plt.figure(figsize=(18, 15))

# One count plot per letter column, taken as the label slice 'A' through 'T'.
for i, col in enumerate(train.loc[:, 'A':'T']):
    ax = fig.add_subplot(5, 4, i + 1)
    sns.countplot(x=train[col], hue=train['target'], ax=ax)

# Lay out once, after every panel has been added.
fig.tight_layout();