In [None]:
from datetime import datetime
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
# The training set is indexed by row_id; the test set keeps row_id as an ordinary column.
train_df = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col="row_id",
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
)

# Unique values by columns

In [None]:
# Distinct-value count of every column, preceded by the largest of those counts.
unique = train_df.nunique(axis=0)
largest_count = unique.max()
print(f'Max: {largest_count}')
print(unique)

In [None]:
# Distinct raw values of a single feature column, in order of first appearance.
pd.unique(train_df['A0T0G1C9'])

In [None]:
import re
from math import factorial

# Feature names have the form 'A<w>T<x>G<y>C<z>'; the four integers are captured in that order.
_FEATURE_NAME = re.compile(r'A(\d+)T(\d+)G(\d+)C(\d+)')


def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10).

    When w + x + y + z == 10 this is the multinomial coefficient of the four
    counts divided by 4**10.

    Args:
        w, x, y, z: non-negative integer counts.

    Returns:
        The value as a float.

    Raises:
        ValueError: if any count is negative (raised by ``math.factorial``).
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return ``bias(w, x, y, z)`` for a column name such as ``'A0T0G1C9'``.

    Args:
        s: column name of the form ``A<w>T<x>G<y>C<z>``.

    Returns:
        The same float that ``bias`` returns for the parsed counts.

    Raises:
        ValueError: if ``s`` does not have the expected form.
    """
    match = _FEATURE_NAME.fullmatch(s)
    if match is None:
        raise ValueError(f'cannot parse A/T/G/C counts from column name {s!r}')
    w, x, y, z = map(int, match.groups())
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(w, x, y, z)

# Every column except the id and the label counts as a feature.
features_columns = [name for name in train_df.columns if name not in ('row_id', 'target')]


def _shift_and_scale(frame):
    """Add each feature's bias_of() offset, multiply by 1000000, round and cast to int."""
    scaled = {}
    for name in features_columns:
        shifted = frame[name] + bias_of(name)
        scaled[name] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


train_i = _shift_and_scale(train_df)
test_i = _shift_and_scale(test_df)
train_i.head()

# Add additional feature to dataset

In [None]:
# Row-wise greatest common divisor of the integer-scaled features, stored as a new 'gcd' column.
for raw_frame, int_frame in ((train_df, train_i), (test_df, test_i)):
    raw_frame['gcd'] = np.gcd.reduce(int_frame[features_columns], axis=1)

# We may want to create four separate classifiers for the four GCD values
test_df['gcd'].unique()

In [None]:
# Preview the first five rows, now including the 'gcd' column.
train_df.head(n=5)

# Drop duplicates

In [None]:
# Report the number of fully duplicated rows before removing them ...
print('Train data shape:', train_df.shape)
print('Duplicates in train data: {0}'.format(train_df.duplicated().sum()))

# ... keep only the first occurrence of each duplicated row ...
train_df = train_df.drop_duplicates(keep='first')

# ... and report again to confirm nothing is left.
duplicates_train = train_df.duplicated().sum()
print('============== After drop_duplicates ==============')
print('Train data shape:', train_df.shape)
print('Duplicates in train data: {0}'.format(duplicates_train))

In [None]:
# Preview the first five rows after de-duplication.
train_df.head(n=5)

# Exploratory Data Analysis

# Common dataset info

Let's look at the column information. The feature columns are of type float64.

In [None]:
# Print the per-column summary of the training frame; verbose=True requests the full column listing.
train_df.info(verbose=True)

In [None]:
# Headline figures for the training frame, each printed in its own ANSI colour.
n_rows, n_cols = train_df.shape
n_values = train_df.count().sum()
n_missing = train_df.isna().sum().sum()

print(f'\033[92mNumber of rows in train data: {n_rows}')
print(f'\033[94mNumber of columns in train data: {n_cols}')
print(f'\033[93mNumber of values in train data: {n_values}')
print(f'\033[91mNumber missing values in train data: {n_missing}')

Get features columns:

In [None]:
# Recomputed from the current columns, so any column added to train_df since the
# first definition (e.g. 'gcd') is now part of the feature list.
features_columns = [name for name in train_df.columns if name not in ('row_id', 'target')]

Training dataset statistics

In [None]:
# Summary statistics with one row per feature, most variable features first.
feature_stats = train_df[features_columns].describe().T.sort_values(by='std', ascending=False)
(
    feature_stats.style
    .background_gradient(cmap='GnBu')
    .bar(subset=['max'], color='#F8766D')
    .bar(subset=['mean'], color='#00BFC4')
)

Deviations are small, so standardization of features is not necessary.

# Target distribution and class balancing

Distribution of the target variable and class balancing

In [None]:
# Rows per class and the same figure as a percentage of all training rows.
class_counts = train_df['target'].value_counts()
target_distrib = pd.DataFrame({
    'count': class_counts,
    'share, %': class_counts / train_df.shape[0] * 100,
})

target_distrib.sort_index()

In [None]:
# Bar chart of rows per class with vertical class labels.
col_series = train_df['target'].value_counts()
plt.xticks(rotation='vertical')
plt.bar(col_series.index, col_series.to_numpy(), width=0.9)
None  # keeps the cell from echoing the BarContainer repr

# Detect target with min weight

In [None]:
from sklearn.utils import compute_sample_weight

# One 'balanced' sample weight per training row, kept next to its label.
balanced_train_df = pd.DataFrame({'target': train_df['target']})
balanced_train_df['sample_weight'] = compute_sample_weight(
    class_weight='balanced',
    y=balanced_train_df['target'],
)

# NOTE: despite the "max_" prefix this holds the SMALLEST weight; max_target is the
# label of the first row carrying it. Later cells use max_target to look up the
# class size to balance towards, so the names are left unchanged.
max_sample_weight = np.min(balanced_train_df['sample_weight'])
has_min_weight = balanced_train_df['sample_weight'] == max_sample_weight
max_target = balanced_train_df.loc[has_min_weight, 'target'].iloc[0]

print(f'Target with min weight is {max_target} ({max_sample_weight})')

In [None]:
# Print the sample weight of each class, taken from the first row of that class.
unique_targets = train_df['target'].unique()

for target in unique_targets:
    is_class = balanced_train_df['target'] == target
    v = balanced_train_df.loc[is_class, 'sample_weight'].iloc[0]
    print(f'{target}: {v}')

In [None]:
# Oversample every other class up to the row count of `max_target` by re-drawing
# rows of that class and adding them to the training frame.
RANDOM_STATE = 42  # fixed seed so Restart & Run All draws the same extra rows

value_counts = train_df['target'].value_counts()
max_class_count = value_counts[max_target]

extra_rows = []
for target in unique_targets:
    deficit = int(max_class_count - value_counts[target])
    if target == max_target or deficit <= 0:
        continue  # already at (or above) the reference size

    class_rows = train_df[train_df['target'] == target]
    extra_rows.append(
        class_rows.sample(
            n=deficit,
            # Sampling without replacement fails when more rows are requested
            # than the class has; fall back to replacement only in that case.
            replace=bool(deficit > len(class_rows)),
            random_state=RANDOM_STATE,
        )
    )

# Single concat instead of one per class. Later samples go first and the original
# rows last, which matches the row order produced by repeatedly prepending.
train_df = pd.concat([*reversed(extra_rows), train_df], axis=0)

In [None]:
# Same class table as before, recomputed on the current training frame.
per_class = train_df['target'].value_counts()
share_pct = per_class / train_df.shape[0] * 100
target_distrib = pd.DataFrame({'count': per_class, 'share, %': share_pct})

target_distrib.sort_index()

The data is well balanced by class.

# Missing values

Let's look at the missing values.

More: https://pandas.pydata.org/docs/user_guide/missing_data.html

In [None]:
# Total number of missing cells in each data set.
print('Train set - missing values: \t', train_df.isnull().sum().sum())
print('Test set - missing values: \t', test_df.isnull().sum().sum())

# List every training column that has fewer non-null values than there are rows.
total_row = len(train_df)
non_na = train_df.count().sort_values(ascending=True)
incomplete = non_na[non_na < total_row]
for item, value in incomplete.items():
    print(f'{item}: {total_row - value}')

i = len(incomplete)
print(f'Found {i} column with NA values')

There are no columns with missing values

# Feature correlation

Let's look at the correlation of features.

In [None]:
# Pairwise correlation of all feature columns, drawn as a square heatmap.
corr = train_df[features_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
nothing_hidden = np.zeros(corr.shape, dtype=bool)  # all-False mask: every cell is drawn
palette = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=nothing_hidden, cmap=palette, square=True, ax=ax)

In [None]:
# Collect every pair of distinct features whose absolute correlation exceeds `threshold`.
threshold = 0.8

# Walk the strict upper triangle only: each unordered pair is visited exactly once
# and the diagonal is skipped by position rather than by comparing floats to 1.0.
row_pos, col_pos = np.triu_indices(len(corr.columns), k=1)
pair_values = corr.to_numpy()[row_pos, col_pos]
is_strong = np.abs(pair_values) > threshold  # NaN compares False, so undefined correlations drop out

unique_corr_pairs = pd.DataFrame(
    {
        'pair': [
            tuple(sorted((corr.index[r], corr.columns[c])))
            for r, c in zip(row_pos[is_strong], col_pos[is_strong])
        ],
        'corr': pair_values[is_strong],
    },
    columns=['pair', 'corr'],
)

unique_corr_pairs

Some pairs of features are strongly correlated with each other.

# Numerical and categorical features

In [None]:
# Split features into "categorical" (few distinct values) and "continuous" ones,
# counting distinct values over train and test together.
cat_threshold = 20
combined = pd.concat([train_df[features_columns], test_df[features_columns]], axis=0)

# Count distinct values once for all columns instead of twice per column.
unique_counts = combined.nunique()
del combined  # the concatenated copy is only needed for the counts

cat_features = [col for col in features_columns if unique_counts[col] < cat_threshold]
num_features = [col for col in features_columns if unique_counts[col] >= cat_threshold]

print(f'Total number of features: {len(features_columns)}')
print(f'\033[92mNumber of categorical (<{cat_threshold} Unique Values) features: {len(cat_features)}')
print(f'\033[96mNumber of continuos features: {len(num_features)}')

cat_features

Encoding of categorical features is not necessary here. On other data sets it may be required.

In [None]:
# Persist both processed frames, index included, to the working directory.
for frame, out_path in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.to_csv(out_path, index=True)