# Libraries and Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time

# Lift the display limits so wide frames and long Series print in full.
# Use the documented top-level `pd.set_option`; `pd.pandas` is not part of the
# public API and only resolves as a side effect of how the package is imported.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Train Data
data = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
print("Train Shape:", data.shape)

# Preview the first rows of the training frame.
data.head(10)

In [None]:
# Report the dtype pandas assigned to each column of the training frame.
print("Checking data Types:", data.dtypes, sep="\n")

# Reducing some memory

In [None]:
# For Checking memory taken of the dataset 

def memory_check(df):
    """Return the deep memory footprint of a pandas object as an "X.XX MB" string.

    For a DataFrame the per-column usage is summed; for any other object
    (e.g. a Series) the value returned by ``memory_usage(deep=True)`` is
    used directly.
    """
    footprint = df.memory_usage(deep=True)
    if isinstance(df, pd.DataFrame):
        # DataFrame.memory_usage returns one entry per column (plus the index).
        footprint = footprint.sum()
    megabytes = footprint / 1024 ** 2  # bytes -> megabytes
    return "{:03.2f} MB".format(megabytes)

In [None]:
# Downcast the float columns to the smallest float dtype that can hold their
# values (float32 at the smallest), so that the frame takes less memory.

get_float = data.select_dtypes(include=['float'])
converted_float = get_float.apply(lambda column: pd.to_numeric(column, downcast='float'))

In [None]:
# Work on a copy so the original frame stays available for the comparison below,
# then swap in the downcast float columns.
new_data = data.copy()
downcast_columns = converted_float.columns
new_data[downcast_columns] = converted_float

print("Memory Allocation Comparison:")
for label, frame in (("Original data:", data), ("Updated data:", new_data)):
    print(label, memory_check(frame))

**The updated data uses much less memory than the original data**

In [None]:
# Drop the name bound to the original frame (`new_data` is used from here on),
# then run a full garbage-collection pass.

import gc
del data
gc.collect()

# EDA

## Data Description

In [None]:
# Summary statistics for the columns of `new_data`.
new_data.describe()

## Missing Data

In [None]:
# Features having missing values, with the percentage of rows that are missing

# Fraction of missing rows per column, computed once for the whole frame.
missing_share = new_data.isnull().mean()

# A feature counts as "missing" as soon as it has at least one null value
# (the previous `> 1` test skipped features with exactly one missing row).
missing_features = [cols for cols in new_data.columns if missing_share[cols] > 0]

for feature in missing_features:
    # Scale the fraction to a real percentage so the number matches the '%' label.
    print(feature, np.round(missing_share[feature] * 100, 2),  ' % missing values')

**Almost every feature has some missing values**

## Let's check whether missing values have any impact on the target feature

In [None]:
# For each feature with missing values, compare the counts of `claim` between
# rows where the feature is present (0) and rows where it is missing (1).
for feature in missing_features:
    # Build only the 0/1 missing indicator instead of copying the whole frame on
    # every iteration. The Series keeps the feature's name, so the grouped axis
    # is still labelled with it.
    is_missing = new_data[feature].isnull().astype(int)

    # Grouping by value count of claim and that indicator
    new_data.groupby(is_missing)['claim'].value_counts().unstack().plot.bar()
    plt.title(feature)
    plt.show()

**So, for almost all features, rows where the feature is present (indicator 0) have roughly equal counts of both claim values, while rows where the feature is missing (indicator 1) have somewhat more claim=1 values**

## Let's check discrete and continuous numerical variables

### Discrete Features

In [None]:
# Every column whose dtype is not object, leaving out the target column.

numerical_features = [
    column
    for column, column_dtype in new_data.dtypes.items()
    if column_dtype != 'O' and column != 'claim'
]
print('Number of numerical variables: ', len(numerical_features))

In [None]:
# Discrete features: numerical features with fewer than 30 unique values.
# The identifier column is excluded by name. It is spelled 'id' (as in the
# continuous-feature cell); the previous 'Id' never matched any column.

discrete_features = [
    feature
    for feature in numerical_features
    if feature != 'id' and len(new_data[feature].unique()) < 30
]
print("Discrete Variables Count: {}".format(len(discrete_features)))

**So, all our independent features are continuous**

### Continuous Features

In [None]:
# Continuous features: the numerical features that are neither discrete nor the id column.

not_continuous = set(discrete_features) | {'id'}
continuous_features = [name for name in numerical_features if name not in not_continuous]
print("Continuous feature Count {}".format(len(continuous_features)))

In [None]:
# Plot the distribution of each continuous variable as a 25-bin histogram.

for feature in continuous_features:
    # Plot straight from `new_data`: nothing is modified here, so copying the
    # whole frame on every iteration only costs time and memory.
    new_data[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

**Most of the features have skewed distributions; we can apply a log or Box-Cox transformation to handle this, keeping in mind the range of each feature's values**

## Outliers

In [None]:
# Draw one box plot per continuous feature to look for outliers.
for feature in continuous_features:
    # Features that contain an exact 0 are skipped, as in the original cell.
    # NOTE(review): the reason for this skip is not visible here; confirm it is
    # still wanted, since no log transform is applied before plotting.
    if (new_data[feature] == 0).any():
        continue

    # Plot straight from `new_data`; nothing is modified, so no copy is needed.
    new_data.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

**Features have a lot of outliers to handle**

## Correlation Plot

In [None]:
#Correlation check
import seaborn as sns

# Pairwise correlations of every column except the first one, drawn as a heatmap.
columns_after_first = new_data.iloc[:, 1:]
corr = columns_after_first.corr()

fig, ax = plt.subplots(figsize=(22, 20))
tick_labels = corr.columns
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
plt.show()

**No correlation can be seen among features**

**All of the above analysis was done only on the training dataset; the same should be done on the test dataset to understand its features. The features have to be cleaned before modelling starts. Also, there are many features, so we can apply a PCA transformation to reduce their number before modelling.**