In [1]:
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [2]:
train = pd.read_parquet('./data/train.parquet')
train_labels = pd.read_csv('./data/train_labels.csv')

train.shape, train_labels.shape

((5531451, 190), (458913, 2))

In [3]:
train = train.merge(train_labels, left_on='customer_ID', right_on='customer_ID')
print(train.shape)
del train_labels
gc.collect()

(5531451, 191)


0

In [4]:
train['S_2'] = pd.to_datetime(train['S_2'])

In [5]:
categorical_features = [
    'B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126'
]
train[categorical_features] = train[categorical_features].astype('category')
train[categorical_features].dtypes

B_30     category
B_38     category
D_63     category
D_64     category
D_66     category
D_68     category
D_114    category
D_116    category
D_117    category
D_120    category
D_126    category
dtype: object

In [6]:
print(f'Train dates from {train["S_2"].min()} to {train["S_2"].max()}')

Train dates from 2017-03-01 00:00:00 to 2018-03-31 00:00:00


In [7]:
missing_data = train.isnull().sum().div(len(train)).mul(100).sort_values(ascending=False)

In [8]:
plt.style.use('Solarize_Light2')
fig, ax = plt.subplots(len(train) // 50 + 1, 1, figsize=(25, 10))
for i in range(len(train) // 50 + 1):
    sns.barplot(
        x=missing_data[i * 50:(i + 1) * 50].index,
        y=missing_data[i * 50:(i + 1) * 50].values,
        ax=ax[i]
    )
    ax[i].set_ylabel('Percentage [%]')
    ax[i].tick_params(axis='x', rotation=90)
plt.suptitle('Amout of missing data')
plt.tight_layout()
plt.show()

KeyboardInterrupt: 