In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import gc
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler
!pip install pandarallel -q

In [None]:
# Load the competition data, using the `id` column as the row index.
# test_df is loaded here because later cells print its shape/head and count
# its missing values; leaving it commented out makes those cells raise
# NameError on a fresh kernel.
large_train_df = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv', index_col='id')
test_df = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col='id')
#submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv', index_col='id')

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min/max
      (bounds are inclusive, so e.g. a max of exactly 127 still fits int8).
    * float columns are cast to the narrowest float type whose range contains
      the column's min/max.
    * every other dtype (bool, datetime, categorical, extension dtypes, ...)
      is left untouched.

    A column is never widened: only types narrower than the current one are
    considered, and columns without any non-missing value are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as ``float16``. The check is based on
        value range only, so float16 keeps roughly three significant decimal
        digits; pass ``False`` to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; bool, datetime,
        # categorical and extension dtypes have no meaningful finfo/iinfo range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for empty or all-missing columns: nothing to size.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            candidates = float_types
            limits = np.finfo
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            limits = np.iinfo
            # Python ints compare exactly against iinfo bounds of any width.
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            # Candidates are ordered narrow -> wide; stop before widening.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the training frame's column dtypes to save memory.
# reduce_mem_usage modifies the frame it receives and returns that same
# object, so train_df and large_train_df refer to one DataFrame after this call.
train_df = reduce_mem_usage(large_train_df)
#test_df = reduce_mem_usage(test_df)

In [None]:
# Standardise the feature columns only. The `claim` target is excluded so it
# keeps its original values for the x/y split, the `y > 0` mask and the
# AutoViz `depVar` further down, instead of being turned into z-scores.
feature_cols = train_df.columns.drop('claim')
scaler = StandardScaler()
scaler.fit(train_df[feature_cols])
scaled_features = pd.DataFrame(
    scaler.transform(train_df[feature_cols]),
    index=train_df.index,
    columns=feature_cols,
)
# Re-attach the untouched target and restore the original column order.
train_df = scaled_features.join(train_df['claim'])[train_df.columns]

In [None]:
# Run a garbage-collection pass after the scaling step.
# NOTE(review): large_train_df still references the pre-scaling frame, so that
# frame is not reclaimed here unless the name is deleted first.
gc.collect()

In [None]:
# Print the (rows, columns) shape of the training frame, then display its first rows.
print(train_df.shape)
train_df.head()

In [None]:
# Print the (rows, columns) shape of the test frame, then display its first rows.
# NOTE(review): this needs test_df to be defined by the data-loading cell above.
print(test_df.shape)
test_df.head()

In [None]:
# Separate the `claim` column (y) from every other column (x).
y = train_df['claim']
x = train_df.drop(columns='claim')

In [None]:
# Boolean mask selecting the rows whose `claim` value is greater than zero.
pos = y.gt(0)

In [None]:
# Row-wise standard error of the mean across all feature columns.
feature = x.sem(axis=1)
# One column per group; `keys` names the columns so the plot legend shows the
# group instead of the default 0 / 1 labels of unnamed series.
split = pd.concat(
    [feature[pos], feature[~pos]],
    axis=1,
    keys=['claim > 0', 'claim <= 0'],
)
# Title fixed: the original had an unbalanced ')' and the term "Standard mean error".
sns.displot(split, bins=30, alpha=0.5).set(title='Standard error of the mean (training set)')

In [None]:
# Row-wise minimum of the feature values, and of their absolute values.
feature = x.min(axis='columns')
feature_abs = x.abs().min(axis='columns')

# For each of the two features, build a two-column frame: rows selected by
# `pos` first, the remaining rows second.
split, split_abs = (
    pd.concat([values[pos], values[~pos]], axis='columns')
    for values in (feature, feature_abs)
)

# One distribution plot per feature, drawn in the same order as before.
sns.displot(split, bins=30, alpha=0.5)
sns.displot(split_abs, bins=30, alpha=0.5)

### Check for missing values

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

In [None]:
!pip install git+git://github.com/AutoViML/AutoViz.git -q
!pip install xlrd -q

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

# Options for the AutoViz run: the data comes from the in-memory train_df
# (passed as `dfte`) with `claim` as the dependent variable, so the
# file-related arguments are left as empty strings.
autoviz_options = dict(
    filename='',
    sep='',
    depVar='claim',
    dfte=train_df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format='png',
    max_rows_analyzed=1_000_000,
)

AV = AutoViz_Class()
dftc = AV.AutoViz(**autoviz_options)