In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# EDA with tabular data
## Starting analysis
#### Read data and print some basic information

In [None]:
# Load the training set, then report the total number of missing cells in it.
raw_data = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv')
raw_data.isna().to_numpy().sum()

#### Good sign! There is no null data! 

In [None]:
# Column dtypes, shown through the cell's own output instead of print().
raw_data.dtypes

#### `f_27` has string type. Maybe it is categorical data?

In [None]:
# Number of distinct values per column; a low count for `f_27` would suggest
# it can be treated as a categorical feature.
raw_data.nunique()

#### It doesn't seem so... However, as I learned from the discussions, it may be some sort of sequential process. Later I will try to transform this column into categorical variables.

# Feature engineering
#### Create numerical variables from the `f_27` column. First, get all unique letters from this column.

In [None]:
# Collect every distinct character that occurs anywhere in `f_27`.
# Sorting makes the list (and its printed form) reproducible across kernel
# restarts; the iteration order of a set of strings is not stable between runs.
unique_chars = sorted(set(''.join(raw_data.f_27)))
print(unique_chars)

#### Then let's split each string into separate characters and encode them.

In [None]:
def feature_eng(df, n_positions=10):
    """Split `f_27` into per-position ordinal features and drop non-feature columns.

    Each of the first `n_positions` characters of `f_27` is encoded with an
    `OrdinalEncoder` fitted on the notebook-level `unique_chars` list and
    stored in a new column named `f_27_1` ... `f_27_<n_positions>`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column `f_27`. It is not modified; the
        function works on a copy.
    n_positions : int, default 10
        Number of leading character positions of `f_27` to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the encoded columns added and the `f_27`, `target`
        and `id` columns removed when they are present.
    """
    enc = OrdinalEncoder()
    enc.fit(np.reshape(unique_chars, (-1, 1)))
    df_copy = df.copy()
    for i in range(n_positions):
        chars_at_i = df_copy.f_27.str.get(i).values.reshape(-1, 1)
        # transform() returns an (n, 1) array; flatten it for column assignment.
        df_copy['f_27_{}'.format(i+1)] = enc.transform(chars_at_i).ravel()
    # errors='ignore' lets the same function run on frames that lack some of
    # these columns (e.g. a split without `target`) instead of raising KeyError.
    return df_copy.drop(columns=['f_27', 'target', 'id'], errors='ignore')

In [None]:
# Build the engineered feature frame from the raw data and preview its first rows.
fe_data = feature_eng(raw_data)
fe_data.head()

#### Create lists of categorical and numerical columns

In [None]:
# A feature is treated as categorical when it has fewer than 21 distinct
# values; every remaining column is treated as numerical.
temp = fe_data.nunique().lt(21)
categorical = temp.index[temp.to_numpy()].tolist()
numerical = fe_data.columns.difference(categorical)

# Scale the data (min-max normalization)

In [None]:
# Min-max scale the numerical columns; categorical codes pass through unchanged.
scaler = ColumnTransformer([
    ('num', MinMaxScaler(), numerical),
    ('cat', 'passthrough', categorical)
])
# ColumnTransformer emits columns in transformer order (numerical first, then
# categorical), so the labels are rebuilt in that order. Reusing the input
# index keeps rows aligned with `fe_data` even if its index is not the default.
scaled_data = pd.DataFrame(
    scaler.fit_transform(fe_data),
    columns=[*numerical, *categorical],
    index=fe_data.index,
)

In [None]:
# Summary statistics of the scaled frame, as a quick check of the value ranges.
scaled_data.describe()

# Histograms

In [None]:
n_cols = 4
# Derive the row count from the number of columns to plot so the grid always
# fits them (at least one row, so plt.subplots receives a valid shape).
n_rows = max(1, -(-len(categorical) // n_cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 10), squeeze=False)
fig.suptitle('Categorical variables', fontsize=20)
fig.tight_layout()
plt.subplots_adjust(wspace=0.6, hspace=0.6)
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, categorical):
    # discrete=True gives one bar per distinct value.
    sns.histplot(scaled_data[column], ax=ax, discrete=True)
    ax.xaxis.set_label_position('top')
    ax.set(ylabel=None)
# Hide any cells of the grid that did not receive a plot.
for ax in flat_axes[len(categorical):]:
    ax.set_visible(False)

In [None]:
n_cols = 4
# Derive the row count from the number of columns to plot so the grid always
# fits them (at least one row, so plt.subplots receives a valid shape).
n_rows = max(1, -(-len(numerical) // n_cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 10), squeeze=False)
fig.suptitle('Numerical variables', fontsize=20)
fig.tight_layout()
plt.subplots_adjust(wspace=0.6, hspace=0.6)
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, numerical):
    sns.histplot(scaled_data[column], ax=ax)
    ax.xaxis.set_label_position('top')
    ax.set(ylabel=None)
# Hide any cells of the grid that did not receive a plot.
for ax in flat_axes[len(numerical):]:
    ax.set_visible(False)

In [None]:
# Show which columns were classified as numerical.
numerical

# Correlations

In [None]:
# Apply the theme before the figure is created so that it takes effect on it.
sns.set_theme(style='white')
data_corr = scaled_data.corr()
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Pin the colour scale to [-1, 1] and centre it on 0 so the neutral midpoint
# of the diverging palette corresponds to "no correlation".
sns.heatmap(data_corr, cmap=cmap, linewidths=.5, square=True,
            vmin=-1., vmax=1., center=0, ax=ax)
plt.show()

#### There are no highly correlated variables in the dataset, so multicollinearity is not a concern and we can use linear regression models.