In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.impute import SimpleImputer
warnings.filterwarnings("ignore")

## Data Preparation

In [None]:
# Directory holding the competition CSV files. Kept in one place so the
# notebook can be pointed at another location with a single edit.
DATA_DIR="../input/tabular-playground-series-sep-2021"

# Load the train, test and sample-solution files from that directory.
df=pd.read_csv(f"{DATA_DIR}/train.csv")
test=pd.read_csv(f"{DATA_DIR}/test.csv")
sample=pd.read_csv(f"{DATA_DIR}/sample_solution.csv")

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Every column of the test frame except the 'id' column, in file order.
useful_features=[name for name in test.columns if name!='id']

## Data Info

In [None]:
# shape[1] counts every column present in each frame.
print("no. of features in train set: ", df.shape[1])
print("no. of features in test set: ", test.shape[1])
print("--------------------------------------")
print("no. of observations in train set: ",len(df))
# This line reports the row count of `test`; it was previously labelled "train set".
print("no. of observations in test set: ",len(test))

In [None]:
# dtype of each feature column in the training frame, in useful_features order.
datatype=df[useful_features].dtypes.tolist()

# Show the (feature, dtype) listing as two side-by-side halves. The split point
# is derived from the number of features rather than fixed at 59, so the table
# still builds when the feature count changes. With an odd count the right half
# is one row shorter and concat pads it with NaN.
half=(len(useful_features)+1)//2
left_half=pd.DataFrame({'features':useful_features[:half],'data type':datatype[:half]})
right_half=pd.DataFrame({' features ':useful_features[half:],' data type ':datatype[half:]})
DataType=pd.concat([left_half,right_half],axis=1)
print(DataType)

- No Categorical columns in our data

## Null Values

In [None]:
# Boolean mask of missing cells in the training frame, drawn as a heatmap
# with the row tick labels switched off.
null_mask=df.isnull()
sns.heatmap(null_mask,cmap='viridis',yticklabels=False)

In [None]:
# Missing-value count and percentage per feature in the training frame.
print("-----NULL VALUES IN TRAIN DATA------")
nulls=[df[feature].isna().sum() for feature in useful_features]

null_df=pd.DataFrame({'columns':useful_features,'nulls':nulls})
# Percentage of training rows in which each feature is missing.
null_df['null%']=null_df['nulls']*100/len(df)
null_df.head(10)

In [None]:
# Missing-value count and percentage per feature in the test frame.
print("------NULL VALUES IN TEST DATA-------")
nulls=[test[feature].isna().sum() for feature in useful_features]

null_test=pd.DataFrame({'columns':useful_features,'nulls':nulls})
# Percentage of test rows in which each feature is missing.
null_test['null%']=null_test['nulls']*100/len(test)
null_test.head(10)

In [None]:
print("------NUMBER OF NULL VALUES IN TRAIN AND TEST SETS-------")
# One shared axes: test counts are drawn first (red), train counts second (blue).
fig,ax=plt.subplots(figsize=(15,5))
sns.scatterplot(x=null_test['columns'],y=null_test['nulls'],color='r',label='test nulls',ax=ax)
sns.scatterplot(x=null_df['columns'],y=null_df["nulls"],color='b',label='train nulls',ax=ax)
ax.legend(loc='center right',prop={'size':15})

- The number of null values per feature in the train set ranges from 15168 to 15678
- The number of null values per feature in the test set ranges from 7733 to 8141

In [None]:
print("------NUMBER OF NULL VALUES(%) IN TRAIN AND TEST SETS-------")
# Same layout as the raw-count plot, but showing the per-feature null percentage.
fig,ax=plt.subplots(figsize=(15,5))
sns.scatterplot(x=null_test['columns'],y=null_test['null%'],color='r',label='test nulls',ax=ax)
sns.scatterplot(x=null_df['columns'],y=null_df["null%"],color='b',label='train nulls',ax=ax)
ax.legend(loc='lower right',prop={'size':15})
# Heavier major grid lines make the percentage levels easier to read off.
ax.grid(which='major',linewidth=2)

- Null values in both train and test sets lie between 1.56% - 1.66%

In [None]:
#Data Loss if we drop null values
# A row is lost by dropna() when at least one of its columns is null, so the
# loss is the share of such rows. The previous expression reported the share
# of rows that survive dropna() under the "data loss" label.
rows_with_null=df.isnull().any(axis=1)
print("data loss if we drop null values = {0:.2f}%".format(rows_with_null.mean()*100))

- We cannot simply drop every row that contains a null value, as that would discard a large share of the data

## Data Distributions

In [None]:
#target distribution
# Pass the series through the `x` keyword: positional use is deprecated in
# seaborn 0.11, and newer releases treat the first positional argument as `data`.
sns.countplot(x=df['claim'])

- Distribution of target seems balanced

In [None]:
# Number of training rows per target value, looked up by the labels 0 and 1.
claim_counts=df['claim'].value_counts()
print("No. of 0 claim observations/data ",claim_counts[0])
print("No. of 1 claim observations/data ",claim_counts[1])

In [None]:
# Overlay the train and test distribution of every feature, one panel per feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/kdeplot.
print("----FEATURE DISTRIBUTIONS OF TRAIN AND TEST SETS-----")
fig=plt.figure(figsize=(15,30))
for slot,feature in enumerate(useful_features,start=1):
    # 20 x 6 grid of panels, filled in feature order.
    panel=fig.add_subplot(20,6,slot)
    sns.distplot(df[feature],color='#e74c3c',label='train',ax=panel)
    sns.distplot(test[feature],color='#2ecc71',label='test',ax=panel)
    panel.legend(loc='upper right',prop={'size':5})
plt.show()

- distribution of both train and test sets seems very similar
- Only a few features are normally distributed; the remaining ones are either skewed or multimodal

In [None]:
#distribution of features for claim=0 and claim=1
print("---FEATURE DISTRIBUTIONS FOR DIFFERENT CLAIM VALUES(0 & 1)---")
# Build the two row masks once. They do not depend on the feature being
# plotted, so filtering the whole frame again on every iteration is wasted work.
is_claim0=df['claim']==0
is_claim1=df['claim']==1
fig=plt.figure(figsize=(15,30))
for i,col in enumerate(useful_features):
    # 20 x 6 grid of panels, one per feature.
    ax=fig.add_subplot(20,6,i+1)
    # Select only the current column for each target value.
    sns.distplot(df.loc[is_claim0,col],color='#e74c3c',label='claim 0',ax=ax)
    sns.distplot(df.loc[is_claim1,col],color='#2ecc71',label='claim 1',ax=ax)
    ax.legend(loc='upper right',prop={'size':5})
plt.show()

- Distributions of data according to different target values seem similar

In [None]:
# Mean-impute the feature columns. The imputer is fitted on the training frame
# and the same fitted imputer is then applied to the test frame.
imputer=SimpleImputer(strategy='mean')
df_impute=pd.DataFrame(
    imputer.fit_transform(df[useful_features]),
    columns=useful_features,
)
test_impute=pd.DataFrame(
    imputer.transform(test[useful_features]),
    columns=useful_features,
)

In [None]:
# Compare each training feature before imputation (nulls present) and after
# mean imputation, one panel per feature.
print("TRAIN FEATURE DISTRIBUTION WITHOUT NULLS AND WITH NULLS REPLACED WITH MEAN")
fig=plt.figure(figsize=(15,30))
for slot,feature in enumerate(useful_features,start=1):
    panel=fig.add_subplot(20,6,slot)
    sns.distplot(df[feature],color='#e74c3c',label='With Nulls',ax=panel)
    sns.distplot(df_impute[feature],color='#2ecc71',label='Without Nulls',ax=panel)
    panel.legend(loc='upper right',prop={'size':5})
plt.show()

- There is some change in distributions after imputing null values with mean
- A sharp peak can be observed in distributions when null values are imputed

In [None]:
# Compare each test feature before imputation (nulls present) and after
# mean imputation, one panel per feature.
print("TEST FEATURE DISTRIBUTION WITHOUT NULLS AND WITH NULLS REPLACED WITH MEAN")
fig=plt.figure(figsize=(15,30))
for slot,feature in enumerate(useful_features,start=1):
    panel=fig.add_subplot(20,6,slot)
    sns.distplot(test[feature],color='#e74c3c',label='With Nulls',ax=panel)
    sns.distplot(test_impute[feature],color='#2ecc71',label='Without Nulls',ax=panel)
    panel.legend(loc='upper right',prop={'size':5})
plt.show()

- In test data also, similar peaks can be observed after imputing null values with mean

## Cardinality

In [None]:
# Cardinality check: number of distinct values per feature in train and test.
print("---NUMBER OF UNIQUE VALUES IN EACH FEATURE----")
unique_train=[df[feature].nunique() for feature in useful_features]
unique_test=[test[feature].nunique() for feature in useful_features]
uniques=pd.DataFrame({
    'feature':useful_features,
    'train_unique':unique_train,
    'test_unique':unique_test,
})

uniques.head()

In [None]:
print("----NO. OF UNIQUE VALUES IN EACH FEATURE FOR TRAIN AND TEST DATA----")
uniques=uniques.sort_values(by='train_unique')
# Plot from one explicit slice so the feature names and both count columns
# always cover the same 40 rows. Previously a 40-row x series was paired with
# full-length y series and the plot relied on implicit index alignment.
lowest=uniques.iloc[:40]
fig,ax=plt.subplots(figsize=(15,6))
sns.barplot(data=lowest,x='feature',y='train_unique',color="#e74c3c",label='train_uniques',ax=ax)
sns.barplot(data=lowest,x='feature',y='test_unique',color='b',label='test_uniques',ax=ax)
ax.set_ylabel("unique counts")
# Vertical tick labels keep neighbouring feature names from overlapping.
ax.tick_params(axis='x',rotation=90)
ax.legend(loc="upper left")

In [None]:
print("---NO. OF UNIQUE VALUES IN EACH FEATURE FOR TRAIN AND TEST DATA----")
uniques=uniques.sort_values(by='train_unique')
# Plot from one explicit slice (everything after the first 40 rows) so the
# feature names and both count columns always cover the same rows. Previously a
# sliced x series was paired with full-length y series and the plot relied on
# implicit index alignment.
remaining=uniques.iloc[40:]
fig,ax=plt.subplots(figsize=(15,6))
sns.barplot(data=remaining,x='feature',y='train_unique',color="#e74c3c",label='train_uniques',ax=ax)
sns.barplot(data=remaining,x='feature',y='test_unique',color='b',label='test_uniques',ax=ax)
ax.set_ylabel("unique counts")
# Vertical tick labels keep neighbouring feature names from overlapping.
ax.tick_params(axis='x',rotation=90)
ax.legend(loc="upper left")

feature f97 has the lowest cardinality while feature f10 has the highest cardinality

In [None]:
# Distinct-value counts of feature f97 in each frame.
f97_train_unique=df['f97'].nunique()
f97_test_unique=test['f97'].nunique()
print('f97 has the lowest cardinality in train data with unique values: ',f97_train_unique)
print('f97 has the lowest cardinality in test data with unique values: ',f97_test_unique)

In [None]:
# Overlay the train (default colour) and test (red) histograms of f97 on one axes.
# NOTE(review): the bin counts 429 and 431 are presumably the f97 unique-value
# counts of train and test printed earlier; confirm.
fig,ax=plt.subplots(figsize=(15,8))
sns.histplot(df['f97'],bins=429,ax=ax)
sns.histplot(test['f97'],bins=431,color='r',ax=ax)

## Correlation

In [None]:
# Correlation matrix computed by DataFrame.corr() on the training frame.
corr_matrix=df.corr()

# Reshape the feature columns of the matrix into a long table with one row per
# (feature1, feature2) pair. A single melt replaces building one small frame
# per feature and concatenating them one at a time.
core=(
    corr_matrix[useful_features]
    .rename_axis(index='feature1')
    .reset_index()
    .melt(id_vars='feature1',var_name='feature2',value_name='corelation value')
)

# A column's correlation with itself carries no information.
core=core[core['feature1']!=core['feature2']]

# The matrix is symmetric, so every pair of features appears twice, as (a, b)
# and (b, a), with the same value. Keep one orientation so that ranked views of
# the table list distinct pairs. Rows whose feature1 is not in useful_features
# occur only once and are left untouched.
position=pd.Series(range(len(corr_matrix.index)),index=corr_matrix.index)
mirrored=(
    core['feature1'].isin(useful_features)
    &(core['feature1'].map(position)>core['feature2'].map(position))
)
core=core[~mirrored]

In [None]:
# Ten rows with the smallest correlation values (ascending sort).
print('---corelation table with least corelation values----')
core=core.sort_values(by='corelation value',ascending=True)
core.head(10)

In [None]:
# Ten rows with the largest correlation values (descending sort).
print('---corelation table with maximum corelation values---')
core=core.sort_values(by='corelation value',ascending=False)
core.head(10)

In [None]:
# Heatmap of the training frame's correlation matrix on a square 15 x 15 figure.
fig,ax=plt.subplots(figsize=(15,15))
sns.heatmap(df.corr(),cmap='viridis',ax=ax)