In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "darkgrid")
%matplotlib inline
import gc

### The `reduce_mem_usage` function is adapted from https://www.kaggle.com/questions-and-answers/148011

#### The function below changes the data type of each column to reduce the memory used by our DataFrame.

In [None]:
def reduce_mem_usage(train_data, use_float16=True):
    """Downcast the columns of a DataFrame to shrink its memory footprint.

    Every column is inspected and, where possible, converted to a smaller
    dtype:

    * signed / unsigned integer columns -> the narrowest integer type of the
      same signedness whose range contains the column's min and max;
    * float columns -> float16 or float32 when the min and max fit;
    * object columns -> ``category``;
    * everything else (bool, datetime, timedelta, complex, categorical and
      other pandas extension dtypes) is left untouched.

    A column is never converted to a wider type than it already has.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to optimise. It is modified IN PLACE.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns may be stored as
        float16. float16 keeps only about three significant decimal digits,
        so pass False to use float32 as the smallest float type when
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``train_data`` object, returned for convenience.

    Side effects
    ------------
    Prints the memory usage before and after, and the relative saving.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        if col_type == object:
            train_data[col] = train_data[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool is
        # already one byte; min()/max() range checks are meaningless or
        # unsupported for the remaining kinds.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()
        # Empty or all-NaN column: there is no range to test, keep the dtype.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        if col_type.kind == 'f':
            candidates, type_info = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, type_info = signed_types, np.iinfo
        else:
            candidates, type_info = unsigned_types, np.iinfo

        for candidate in candidates:
            # Candidates only get wider from here on; stop as soon as the
            # conversion would no longer save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = type_info(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if limits.min <= c_min and c_max <= limits.max:
                train_data[col] = train_data[col].astype(candidate)
                break

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Skip the percentage when there is nothing to compare against.
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

In [None]:
# Directory that holds the competition CSV files; change it in one place if
# the data lives somewhere else.
DATA_DIR = "../input/tabular-playground-series-oct-2021"

# Read each file and downcast it straight away, so the three full-precision
# frames are never all held in memory at the same time.
sample = reduce_mem_usage(pd.read_csv(f"{DATA_DIR}/sample_submission.csv"))
train = reduce_mem_usage(pd.read_csv(f"{DATA_DIR}/train.csv"))
test = reduce_mem_usage(pd.read_csv(f"{DATA_DIR}/test.csv"))

# Free the temporaries left over from loading/downcasting. The trailing ";"
# keeps the collector's return value out of the cell output.
gc.collect();

In [None]:
# Preview the first five rows of the training frame (5 is head()'s default).
train.head()

In [None]:
# Preview the first five rows of the test frame (5 is head()'s default).
test.head()

In [None]:
# Preview the first five rows of the sample-submission frame (5 is head()'s default).
sample.head()

In [None]:
# Print pandas' structural summary of the training frame.
train.info()

In [None]:
# Descriptive statistics of the training frame's columns.
train.describe()

In [None]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
print(*(frame.shape for frame in (train, test, sample)))

### Here we look at the number of unique values in each column and its data type.

In [None]:
# One (column name, dtype, number of distinct values) triple per training column.
[
    (column, dtype, n_unique)
    for column, dtype, n_unique in zip(train.columns, train.dtypes, train.nunique())
]

### We check whether there are any null values in the train or test set.

In [None]:
# Total number of missing cells in (train, test).
# No null in train and test datasets
tuple(frame.isna().sum().sum() for frame in (train, test))

In [None]:
# 'f22', 'f43' and 'f242' through 'f284' each take only two distinct values,
# so they are treated as categorical features.
cat_feat = ['f22', 'f43'] + [f"f{number}" for number in range(242, 285)]
print(cat_feat)

In [None]:
# Descriptive statistics restricted to the columns listed in cat_feat.
train.loc[:, cat_feat].describe()

In [None]:
# Optional check, kept commented out: uncomment to print, for every feature in
# cat_feat, the relative frequency of each of its values.
# for col in cat_feat:
#     print(col)
#     print(train[col].value_counts(normalize=True))
#     print("_"*40)

Since the dataset is huge, we work with a random sample of 10,000 rows from each of the train and test sets. Because the rows are drawn at random, the sample should be representative of the original dataset.

In [None]:
# Number of rows drawn from each frame, and the seed that pins the draw so a
# re-run of the notebook plots exactly the same rows.
SAMPLE_SIZE = 10000
SEED = 42

# min() guards against frames that have fewer rows than SAMPLE_SIZE
# (DataFrame.sample raises when asked for more rows than exist).
sample_train = train.sample(min(SAMPLE_SIZE, len(train)), random_state=SEED)
sample_test = test.sample(min(SAMPLE_SIZE, len(test)), random_state=SEED)

# Drop the full frames and release their memory; only the samples are kept.
del train
del test
del sample
gc.collect();

We look at the distribution of categorical features in our training dataset

In [None]:
# One histogram per categorical feature of the sampled training rows.
sample_train[cat_feat].hist(figsize=(20,20))
# tight_layout() has to run before show(): once the figure has been rendered,
# a later layout call no longer changes what was displayed.
plt.tight_layout()
plt.show()

Now we look at distribution of numerical features.

In [None]:
# One histogram per remaining (non-categorical) column of the sampled training rows.
sample_train.drop(cat_feat, axis=1).hist(figsize=(50,50))
# tight_layout() has to run before show(): once the figure has been rendered,
# a later layout call no longer changes what was displayed.
plt.tight_layout()
plt.show()

Here we see that our target class is balanced.

In [None]:
# Bar chart of how often each target value occurs in the sample,
# followed by the exact counts.
sns.countplot(x=sample_train["target"])
print(sample_train["target"].value_counts())

In [None]:
# Pairwise correlations between the non-categorical columns (target included).
# .corr() is essential here: without it the heatmap would draw the raw sample
# rows instead of a correlation matrix.
num_corr = sample_train.drop(cat_feat+['id'], axis=1).corr()
# The matrix is symmetric, so hide the upper triangle (diagonal included) and
# show each pair only once.
mask = np.triu(np.ones_like(num_corr, dtype = bool))
plt.figure(figsize=(20,16))
sns.heatmap(num_corr, mask = mask, cmap='magma')
plt.title("correlation between numerical features")
plt.show()

In [None]:
# Number of feature names collected in cat_feat.
len(cat_feat)

## Now we compare the train and test sets to see whether they follow the same distribution.

### For categorical features, the train and test sets have almost the same distribution.

In [None]:
# Overlay the train (red) and test (green) density of every categorical
# feature, three plots per row. The grid is sized from len(cat_feat) instead
# of a hard-coded 15x3, so it keeps working if the feature list changes.
n_cols = 3
n_rows = -(-len(cat_feat) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 6 * n_rows))
axes = axes.flatten()
for feature, ax in zip(cat_feat, axes):
    sns.kdeplot(sample_train[feature], color="red", label="train", ax=ax)
    sns.kdeplot(sample_test[feature],  color="green", label="test", ax=ax)
    ax.get_yaxis().set_visible(False)
    ax.legend()
# Hide the axes left over when the feature count is not a multiple of n_cols.
for ax in axes[len(cat_feat):]:
    ax.set_visible(False)
fig.suptitle("distribution of train-test cat_feat")
fig.tight_layout()
plt.show()

In [None]:
# Numerical features: every column that is neither categorical nor id/target.
# A list comprehension is used instead of set arithmetic so the features keep
# their column order; a set would yield an arbitrary order that can differ
# between kernel sessions, reshuffling the plots built from this list.
excluded_cols = set(cat_feat) | {'id', 'target'}
num_feat = [col for col in sample_train.columns if col not in excluded_cols]
print(num_feat,len(num_feat))

### For numerical features as well, the distribution is almost the same in the train and test sets.

In [None]:
# Overlay the train (red) and test (green) density of every numerical
# feature, three plots per row. The grid is sized from len(num_feat) instead
# of a hard-coded 80x3, so it keeps working if the feature list changes.
n_cols = 3
n_rows = -(-len(num_feat) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 6.25 * n_rows))
axes = axes.flatten()
for feature, ax in zip(num_feat, axes):
    sns.kdeplot(sample_train[feature], color="red", label="train", ax=ax)
    sns.kdeplot(sample_test[feature],  color="green", label="test", ax=ax)
    ax.get_yaxis().set_visible(False)
    # The names in num_feat already start with "f"; use them as they are
    # (prefixing another "f" produced titles such as "ff12").
    ax.set_title(feature, loc = 'right', fontsize = 12)
    ax.legend()
# Hide the axes left over when the feature count is not a multiple of n_cols.
for ax in axes[len(num_feat):]:
    ax.set_visible(False)
fig.suptitle("distribution of train-test num_feat")
fig.tight_layout()
plt.show()

## Thanks for sticking with me till the end. I hope it helped you gain some more insight into the data.