# Imports

In [None]:
import seaborn as sns
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# Lift the column display limit so wide frames are shown in full instead of being elided.
pd.set_option('display.max_columns', None)

# Load dataset

In [None]:
# Read the train and test splits from a path relative to the notebook's working
# directory (presumably the Kaggle input mount -- adjust when running elsewhere).
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# General overview

In [None]:
# Preview the first rows of the training frame (head() defaults to five rows).
train.head()

In [None]:
# verbose=True requests the full per-column listing rather than a shortened summary.
train.info(verbose=True)

In [None]:
# Descriptive statistics of the training frame, using describe()'s defaults.
train.describe()

In [None]:
def nan_counts(df):
    """Report the number of missing values and the dtype of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A transposed report with three rows -- ``Column`` (column label),
        ``nan_counts`` (number of NaN/None entries) and ``Dtype`` -- and one
        positionally numbered column (0..n-1) per column of ``df``.
    """
    missing = df.isna().sum()
    summary = pd.DataFrame({
        'Column': missing.index.tolist(),
        'nan_counts': missing.values,
        'Dtype': df.dtypes.values,
    })
    # Transpose so the report is three rows wide instead of one row per column.
    return summary.T

In [None]:
# Missing-value count and dtype for each column of the training frame.
nan_counts(train)

# Feature distribution

In [None]:
def plot_feature_distribution(train, test):
    """Overlay train and test kernel-density estimates, one panel per feature.

    The features plotted are ``train.columns[1:119]`` (presumably everything
    between a leading id column and the trailing target -- confirm against the
    dataset). Each panel shows the train density in yellow and the test
    density in blue, laid out on a 24 x 5 grid.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; its column order determines which features are drawn.
    test : pandas.DataFrame
        Test frame; must contain every plotted column of ``train``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # 24 x 5 = 120 slots, enough for the at most 118 columns selected below.
    grid_rows, grid_cols = 24, 5
    train_color, test_color = '#f9ba32', '#426e86'

    # Apply the style before any axes exist: seaborn styles only affect
    # artists created afterwards, so setting it inside the loop left the
    # first panel drawn with whatever style was active before the call.
    sns.set_style("white")

    fig = plt.figure(figsize = (15, 71))
    cols = train.columns.tolist()[1:119]
    # enumerate gives the 1-based grid slot directly; looking the name up with
    # cols.index() was quadratic and collapsed duplicate names onto one slot.
    for position, col in enumerate(cols, start = 1):
        ax = plt.subplot(grid_rows, grid_cols, position)
        plt.title(col, size = 12, fontname = 'monospace')
        sns.kdeplot(train[col], color = train_color, linewidth = 1.3, ax = ax)
        sns.kdeplot(test[col], color = test_color, linewidth = 1.3, ax = ax)
        plt.ylabel('')
        plt.xlabel('')
        plt.xticks(fontname = 'monospace')
        plt.yticks([])
        # Keep only the bottom spine, slightly thickened, as the baseline.
        for side in ['right', 'left', 'top']:
            ax.spines[side].set_visible(False)
        ax.spines['bottom'].set_linewidth(1.2)

    fig.tight_layout(h_pad = 3)

    # Figure-level heading and colour legend, placed just above the top edge
    # of the figure (y > 1 in figure coordinates).
    plt.figtext(0.335, 1.02, 'Distribution of features', color = '#2f3131', fontname = 'monospace', size = 25)
    plt.figtext(0.3, 1.01, 'train', color = train_color, fontname = 'monospace', size = 18)
    plt.figtext(0.66, 1.01, 'test', color = test_color, fontname = 'monospace', size = 18)

    plt.show()

In [None]:
# Compare the train and test density of every plotted feature to spot distribution shift.
plot_feature_distribution(train, test)

# Target distribution

In [None]:
# Share of each `claim` value in the training frame, drawn as a pie chart
# with percentage labels.
claim_counts = train['claim'].value_counts()
claim_counts.plot.pie(figsize=(10, 10), autopct='%1.1f%%')