In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "darkgrid")
%matplotlib inline
import gc

import time
import warnings
# Show every row and every column when a frame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Render floats in fixed-point notation ('{:f}' -> six decimals) instead of scientific notation.
pd.set_option('float_format', '{:f}'.format)
# NOTE(review): this silences every warning for the whole session, including deprecation notices.
warnings.filterwarnings('ignore')

### reduce_mem_usage function has been taken from https://www.kaggle.com/questions-and-answers/148011

#### The function below reduces the DataFrame's memory footprint by downcasting each column to a smaller dtype.

In [None]:
def reduce_mem_usage(train_data, use_float16=True):
    """Downcast the columns of a DataFrame, in place, to reduce memory usage.

    Per column:
      * object / string columns are converted to ``category``;
      * signed and unsigned integer columns are cast to the narrowest integer
        type of the same signedness whose range contains the column's
        min and max (bounds are inclusive, so 127 still fits ``int8``);
      * float columns are cast to the narrowest of ``float16`` / ``float32``
        whose range contains the column's min and max;
      * every other dtype (bool, datetime, timedelta, category, nullable
        extension types, ...) is left untouched, so the function can safely be
        called again on an already-reduced frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) floats may be stored as
        ``float16``. This only checks the value *range*; ``float16`` keeps
        roughly three significant decimal digits, so distinct values can
        collapse. Pass False to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after downcasting.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in train_data.columns:
        col_type = train_data[col].dtype

        # Text columns: a categorical encoding stores each distinct value once.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            train_data[col] = train_data[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Anything else
        # (bool, datetime, category, nullable extension dtypes) is skipped:
        # a float cast would corrupt it or min()/max() would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        # Candidates are ordered narrowest first; take the first that fits.
        # If min/max are NaN (empty or all-NaN column) or infinite, nothing
        # fits and the column keeps its dtype.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                # Never widen: only cast when it actually saves space.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    train_data[col] = train_data[col].astype(candidate)
                break

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

In [None]:
# Read each competition file and downcast it straight away.
sample = reduce_mem_usage(
    pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")
)
train = reduce_mem_usage(
    pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv")
)
test = reduce_mem_usage(
    pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv")
)

# Release the temporaries left behind by parsing and dtype conversion.
gc.collect()

In [None]:
# Preview the first three rows of the training frame.
train.head(3)

In [None]:
# Preview the first three rows of the test frame.
test.head(3)

In [None]:
# Preview the first three rows of the sample-submission frame.
sample.head(3)

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
print(train.shape, test.shape, sample.shape)

### We check whether there are any null values in the train or test set.

In [None]:
# Total number of missing cells in train and in test; (0, 0) means neither frame has nulls.
train.isnull().sum().sum(), test.isnull().sum().sum()  # No null in train and test datasets

### Here we look at the number of unique values in each column and its data type.

In [None]:
# Every test column after the first one is used as a feature.
all_features = test.columns[1:].tolist()
print(all_features)

In [None]:
# Stack the feature columns of train and test so statistics cover both sets.
feature_frames = [train[all_features], test[all_features]]
df = pd.concat(feature_frames, axis=0)

# One (column name, dtype, number of distinct values) triple per feature.
list(zip(df.columns, df.dtypes, df.nunique()))

In [None]:
# Features with fewer than CAT_MAX_UNIQUE distinct values (train and test combined)
# are treated as categorical; all others are treated as numerical.
# In the author's run the categorical ones were:
# ['A0T0G0C10', 'A0T0G1C9', 'A0T0G9C1', 'A0T0G10C0', 'A0T1G0C9', 'A0T10G0C0', 'A1T0G9C0', 'A10T0G0C0']
CAT_MAX_UNIQUE = 25

# Count distinct values once per column instead of rescanning the frame
# twice for every feature inside the comprehensions.
unique_counts = df[all_features].nunique()

cat_features = [col for col in all_features if unique_counts[col] < CAT_MAX_UNIQUE]
num_features = [col for col in all_features if unique_counts[col] >= CAT_MAX_UNIQUE]
print(cat_features)
print()
print(num_features)

In [None]:
# Number of categorical features, then number of numerical features.
print(len(cat_features),len(num_features))

In [None]:
# Summary statistics of the categorical features over train and test combined.
df[cat_features].describe()

In [None]:
# Optional: per-feature value frequencies (left disabled).
# for col in cat_features:
#     print(col)
#     print(train[col].value_counts(normalize=True))
#     print("_"*40)

Since the dataset is huge, we work with a random sample of 10,000 rows from each of train and test; because the rows are drawn at random, the sample should be representative of the original data.

In [None]:
# Work on a fixed-size random subset so the plots below stay fast.
SAMPLE_SIZE = 10000
# Fixed seed: without it every kernel restart draws different rows and the
# figures that follow cannot be reproduced.
SAMPLE_SEED = 42

# min(...) keeps the cell working when a frame has fewer than SAMPLE_SIZE rows.
sample_train = train.sample(n=min(SAMPLE_SIZE, len(train)), random_state=SAMPLE_SEED)
sample_test = test.sample(n=min(SAMPLE_SIZE, len(test)), random_state=SAMPLE_SEED)

# The full frames are no longer needed; drop them to free memory.
del train
del test
del sample
del df
gc.collect()

We look at the distribution of categorical features in our training dataset

In [None]:
# Histogram of every categorical feature in the training sample.
sample_train[cat_features].hist(figsize=(20,20))
# The layout must be adjusted before the figure is rendered; calling
# tight_layout() after show() has no effect on the figure already displayed.
plt.tight_layout()
plt.show()

Now we look at distribution of numerical features.

In [None]:
# Histogram of every remaining column once the categorical features and the
# row identifier are dropped.
sample_train.drop(cat_features+["row_id"], axis=1).hist(figsize=(50,50))
# The layout must be adjusted before the figure is rendered; calling
# tight_layout() after show() has no effect on the figure already displayed.
plt.tight_layout()
plt.show()

## Here we list the distinct target classes.

In [None]:
# Distinct target labels in the training sample. They are sorted because the
# iteration order of a set of strings can change between kernel sessions,
# which made the list order unstable from run to run.
target_classes = sorted(set(sample_train.target))
target_classes

## So classes are almost balanced

In [None]:
# Horizontal bar chart with one bar per target class.
ax = sns.countplot(y=sample_train["target"])

# Share of the sampled rows that belongs to each class.
class_share = sample_train["target"].value_counts() / len(sample_train)
print(class_share)

In [None]:
# Preview the numerical features of the first two sampled training rows.
sample_train[num_features].head(2)

In [None]:
# Pairwise correlation between the numerical features. The original cell
# passed the raw feature frame to the heatmap, so the plot showed feature
# values row by row rather than correlations.
num_corr = sample_train[num_features].corr()

# The matrix is symmetric: hide the upper triangle (diagonal included) so each
# pair appears only once.
mask = np.triu(np.ones_like(num_corr, dtype = bool))

plt.figure(figsize=(20,16))
sns.heatmap(num_corr, mask = mask, cmap='magma')
plt.title("Correlation between numerical features (training sample)")
plt.show()

## Now we compare the distribution of train and test set to see if they have same distribution or not.

In [None]:
# Number of categorical features, then number of numerical features
# (these counts size the subplot grids in the comparison plots below).
print(len(cat_features),len(num_features))

In [None]:
# Preview the first two rows of the training sample.
sample_train.head(2)

In [None]:
# Preview the first two rows of the test sample.
sample_test.head(2)

### For the categorical features, the distribution is not the same in the train and test sets.

In [None]:
# Train-vs-test density of each categorical feature, one panel per feature.
# The grid is sized from the feature list; the original fixed 4x2 grid raised
# IndexError with fewer than 8 features and silently dropped any beyond 8.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(cat_features) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(10, 2.5 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

for ax, feature in zip(axes, cat_features):
    sns.kdeplot(sample_train[feature], color="red", label="train", ax=ax)
    sns.kdeplot(sample_test[feature], color="green", label="test", ax=ax)
    # Only the shape of the curves matters, not the density scale.
    ax.get_yaxis().set_visible(False)
    ax.legend()

# Hide panels left over when the feature count does not fill the grid.
for ax in axes[len(cat_features):]:
    ax.set_visible(False)

fig.suptitle("distribution of train-test cat_feat")
fig.tight_layout()
plt.show()

### For the numerical features, the distribution is almost the same in the train and test sets, except for a few columns.

In [None]:
# Train-vs-test density of each numerical feature, one panel per feature.
# The grid is sized from the feature list; the original fixed 139x2 grid
# only worked when there were exactly 278 numerical features.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(num_features) // n_grid_cols))  # ceiling division
row_height = 5.75  # inches per row, close to the original 800 in / 139 rows
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(20, row_height * n_grid_rows), squeeze=False
)
axes = axes.flatten()

for ax, feature in zip(axes, num_features):
    sns.kdeplot(sample_train[feature], color="red", label="train", ax=ax)
    sns.kdeplot(sample_test[feature], color="green", label="test", ax=ax)
    # Only the shape of the curves matters, not the density scale.
    ax.get_yaxis().set_visible(False)
    # The original title prepended a stray "f" to every feature name.
    ax.set_title(f'{feature}', loc = 'right', fontsize = 12)
    ax.legend()

# Hide panels left over when the feature count does not fill the grid.
for ax in axes[len(num_features):]:
    ax.set_visible(False)

fig.suptitle("distribution of train-test num_feat")
fig.tight_layout()
plt.show()