In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

## Data Description
In this competition, you will predict the probability that an auto insurance policy holder files a claim.

In the train and test data, features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc). In addition, feature names include the postfix bin to indicate binary features and cat to indicate categorical features. Features without these designations are either continuous or ordinal. Values of -1 indicate that the feature was missing from the observation. The target column signifies whether or not a claim was filed for that policy holder.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats #For Chi-square Test

In [None]:
# Folder holding the competition files; the training CSV is read from here.
input_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'
train_csv = input_path + "train.csv"
df = pd.read_csv(train_csv)
df.shape  # (number of rows, number of columns)

#### Total Records = 595212
#### Total Features = 59

In [None]:
df.info()

## Target Variable

In [None]:
df['target'].unique()

In [None]:
# One row per target class with the number of records in that class.
target_info_df = df['target'].value_counts().to_frame(name='count')

In [None]:
# Share of each class as a percentage of all rows. Only the 'count' column is
# divided (not the whole frame), so the cell can be re-run after 'Percent'
# has already been added without trying to assign a multi-column frame.
target_info_df['Percent'] = target_info_df['count'] / df.shape[0] * 100

In [None]:
target_info_df

In [None]:
target = df['target']

In [None]:
df.drop(columns=['target'],inplace=True)

- It looks like we have an imbalanced dataset
- It is dominated by 0's (no claim filed)

## Creating Metadata Dataframe

In [None]:
df_metedata = pd.DataFrame({'DTypes':df.dtypes})

- All features whose names end with __'_bin'__ or __'_cat'__ are marked as categorical

In [None]:
# Store a plain-text type label for every column in the metadata frame.
# Columns tagged '_cat' or '_bin' by name are labelled 'Categorical'; the
# remaining int64 / float64 columns keep their dtype name as the label.
for col in df.columns:
    if any(tag in col for tag in ('_cat', '_bin')):
        label = 'Categorical'
    elif df[col].dtype == 'int64':
        label = 'int64'
    elif df[col].dtype == 'float64':
        label = 'float64'
    else:
        # Any other dtype keeps the value already stored in the metadata.
        continue
    df_metedata.loc[col, 'DTypes'] = label

- Among the remaining columns (those not tagged as categorical by name), float features should be continuous, while int features can be either continuous or ordinal

- The only open question is whether the int features are continuous or ordinal

In [None]:
df_metedata['Dropped']=False
df_metedata['Missing'] = np.nan

In [None]:
df_metedata.loc['id','Missing'] = np.nan
df_metedata.loc['id','Dropped'] = True

## Missing Values
- Missing values of categorical variables are filled with the __MODE__
- Missing values of continuous variables are filled with the __MEAN__
- We are unsure which features of type int are ordinal and which are continuous, so we fill them with the __MODE__, which is valid in both cases
- __NOTE__: Missing values are encoded as -1 in the dataset

- Let's replace -1 with __NaN__

In [None]:
df.replace(to_replace=-1,value=np.nan,inplace=True)

- Sort the Feature with missing values

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Percentage of rows missing in the feature with the most missing values,
# computed from the data instead of a hand-copied count.
df.isnull().sum().max() / df.shape[0] * 100

- DROP 'ps_car_03_cat' as it has many missing values

In [None]:
df_metedata.loc['ps_car_03_cat','Dropped'] = True

In [None]:
df.drop(columns=['ps_car_03_cat'],inplace=True)

In [None]:
# Impute missing values (NaN after the -1 replacement) column by column and
# record the fill value that was used in the metadata frame.
#
# The branch is chosen from the type labels stored in df_metedata, which were
# derived from the dtypes before -1 was replaced with NaN. Checking
# df[col].dtype here would be wrong: an int64 column that now contains NaN has
# been upcast to float64, so it would be mean-filled instead of mode-filled.
for col in df.columns:
    col_kind = df_metedata.loc[col, 'DTypes']
    if col_kind in ('Categorical', 'int64'):
        fill_value = int(df[col].mode()[0])
    else:
        fill_value = df[col].mean()
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place call on a selected column is chained assignment
    # and is not guaranteed to update df.
    df[col] = df[col].fillna(fill_value)
    df_metedata.loc[col, 'Missing'] = fill_value

In [None]:
df.columns

### Making Data into multiple folds

##### The idea is to split the data into n folds such that:
- Each fold takes an equal share of the majority class and keeps the minority class intact
- Each fold is a balanced dataset

In [None]:
# Rows whose target is 1, with the label re-attached as a 'target' column.
minority_mask = target == 1
df_minority = df[minority_mask].assign(target=target[minority_mask])

In [None]:
# Rows whose target is 0, with the label re-attached as a 'target' column.
majority_mask = target == 0
df_majority = df[majority_mask].assign(target=target[majority_mask])

In [None]:
df_majority.shape, df_minority.shape

- Let's split the data into 20 folds

In [None]:
# Split the majority class into consecutive, near-equal chunks.
# The split is done on row positions and applied with .iloc, because passing a
# DataFrame straight to np.array_split goes through DataFrame.swapaxes, which
# newer pandas versions deprecate.
N_FOLDS = 20
splitted_frame = [
    df_majority.iloc[positions]
    for positions in np.array_split(np.arange(len(df_majority)), N_FOLDS)
]

## Creating the chi-square test function

In [None]:
def chi2_test(col, frames=None, minority=None, alpha=0.05, min_significant=10):
    """Vote on a categorical/ordinal feature with repeated chi-square tests.

    Each majority-class chunk is stacked with the minority-class rows to form
    one sample. On every sample a chi-square test of independence is run on
    the contingency table of ``col`` against ``'target'``; the feature is
    recommended when enough samples give a significant result.

    Parameters
    ----------
    col : str
        Name of the feature column to test.
    frames : iterable of DataFrame, optional
        Majority-class chunks, each containing ``col`` and ``'target'``.
        Defaults to the notebook-level ``splitted_frame``.
    minority : DataFrame, optional
        Minority-class rows containing ``col`` and ``'target'``.
        Defaults to the notebook-level ``df_minority``.
    alpha : float, optional
        A sample counts as a vote when its p-value is below this level.
    min_significant : int, optional
        Number of votes needed to recommend the feature.

    Returns
    -------
    None
        The verdict is printed.
    """
    if frames is None:
        frames = splitted_frame
    if minority is None:
        minority = df_minority

    significant = 0
    for frame in frames:
        sample = pd.concat([frame, minority])
        contingency = pd.crosstab(sample[col], sample['target'])
        # Index 1 of the chi2_contingency result is the p-value.
        p_value = scipy.stats.chi2_contingency(contingency)[1]
        if p_value < alpha:
            significant += 1

    if significant >= min_significant:
        print('Consider this feature')
    else:
        print("Don't consider this feature")

###  Creating the ANOVA test function

In [None]:
def anova_test(col, frames=None, minority=None, alpha=0.05, min_significant=10):
    """Vote on a continuous feature with repeated one-way ANOVA tests.

    Each majority-class chunk is stacked with the minority-class rows to form
    one sample. On every sample the values of ``col`` are grouped by
    ``'target'`` and a one-way ANOVA compares the group means; the feature is
    recommended when enough samples give a significant result.

    Parameters
    ----------
    col : str
        Name of the feature column to test.
    frames : iterable of DataFrame, optional
        Majority-class chunks, each containing ``col`` and ``'target'``.
        Defaults to the notebook-level ``splitted_frame``.
    minority : DataFrame, optional
        Minority-class rows containing ``col`` and ``'target'``.
        Defaults to the notebook-level ``df_minority``.
    alpha : float, optional
        A sample counts as a vote when its p-value is below this level.
    min_significant : int, optional
        Number of votes needed to recommend the feature.

    Returns
    -------
    None
        The verdict is printed.
    """
    if frames is None:
        frames = splitted_frame
    if minority is None:
        minority = df_minority

    significant = 0
    for frame in frames:
        sample = pd.concat([frame, minority])
        # Compare the feature between the target classes. Passing the feature
        # and the target column themselves as the two groups would only test
        # whether the feature's mean differs from the mean of the labels.
        groups = [values for _, values in sample.groupby('target')[col]]
        # Index 1 of the f_oneway result is the p-value.
        p_value = scipy.stats.f_oneway(*groups)[1]
        if p_value < alpha:
            significant += 1

    if significant >= min_significant:
        print('Consider this feature')
    else:
        print("Don't consider this feature")

### Feature __ps_ind_01__

In [None]:
df['ps_ind_01'].nunique()

- It is highly improbable for the feature __ps_ind_01__ to be continuous with only 8 unique values among 595212 records, so it must be ordinal; the data looks that way too

In [None]:
df_metedata.loc['ps_ind_01','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_ind_01 = pd.crosstab(df['ps_ind_01'],target)
crosstab_ps_ind_01

In [None]:
chi2_test('ps_ind_01')

- Correlation exsist between ps_ind_01 and target
- Considering this feature in to the model building

### Feature ps_ind_02_cat

In [None]:
df['ps_ind_02_cat'].nunique()

In [None]:
df_metedata.loc['ps_ind_02_cat','DTypes']

In [None]:
crosstab_ps_ind_02_cat = pd.crosstab(df['ps_ind_02_cat'],target)
crosstab_ps_ind_02_cat

In [None]:
chi2_test('ps_ind_02_cat')

- Correlation exists between ps_ind_02_cat and target
- Considering this feature in to the model building

### Feature ps_ind_03

In [None]:
df['ps_ind_03'].nunique()

In [None]:
df_metedata.loc['ps_ind_03','DTypes']

- It is highly improbable for the feature ps_ind_03 to be continuous with only 12 unique values among 595212 records, so it must be ordinal

In [None]:
# ps_ind_03 is treated as ordinal (few distinct integer levels), so record that
# in the metadata before tabulating the feature against the target.
df_metedata.loc['ps_ind_03','DTypes'] = 'Ordinal'
crosstab_ps_ind_03 = pd.crosstab(df['ps_ind_03'],target)
crosstab_ps_ind_03

In [None]:
chi2_test('ps_ind_03')

- Correlation exsist between ps_ind_03 and target
- Considering this feature in to the model building

### Feature ps_ind_04_cat

In [None]:
df['ps_ind_04_cat'].nunique()

In [None]:
crosstab_ps_ind_04_cat = pd.crosstab(df['ps_ind_04_cat'],target)
crosstab_ps_ind_04_cat

In [None]:
chi2_test('ps_ind_04_cat')

- Correlation exsist between ps_ind_04_cat and target
- Considering this feature in to the model building

### Feature ps_ind_05_cat

In [None]:
df['ps_ind_05_cat'].nunique()

In [None]:
crosstab_ps_ind_05_cat = pd.crosstab(df['ps_ind_05_cat'],target)
crosstab_ps_ind_05_cat

In [None]:
chi2_test('ps_ind_05_cat')

- Correlation exsist between ps_ind_05_cat and target
- Considering this feature in to the model building

### Feature ps_ind_06_bin

In [None]:
df['ps_ind_06_bin'].nunique()

In [None]:
crosstab_ps_ind_06_bin = pd.crosstab(df['ps_ind_06_bin'],target)
crosstab_ps_ind_06_bin

In [None]:
chi2_test('ps_ind_06_bin')

- Correlation exsist between ps_ind_06_bin and target
- Considering this feature in to the model building

### Feature ps_ind_07_bin

In [None]:
df['ps_ind_07_bin'].nunique()

In [None]:
crosstab_ps_ind_07_bin = pd.crosstab(df['ps_ind_07_bin'],target)
crosstab_ps_ind_07_bin

In [None]:
chi2_test('ps_ind_07_bin')

- Correlation exsist between ps_ind_07_bin and target
- Considering this feature in to the model building

### Feature ps_ind_08_bin

In [None]:
df['ps_ind_08_bin'].nunique()

In [None]:
crosstab_ps_ind_08_bin = pd.crosstab(df['ps_ind_08_bin'],target)
crosstab_ps_ind_08_bin

In [None]:
chi2_test('ps_ind_08_bin')

- Correlation exsist between ps_ind_08_bin and target
- Considering this feature in to the model building

### Feature ps_ind_09_bin

In [None]:
df['ps_ind_09_bin'].nunique()

In [None]:
crosstab_ps_ind_09_bin = pd.crosstab(df['ps_ind_09_bin'],target)
crosstab_ps_ind_09_bin

In [None]:
chi2_test('ps_ind_09_bin')

- Correlation exsist between ps_ind_09_bin and target
- Considering this feature in to the model building

### Feature ps_ind_10_bin

In [None]:
df['ps_ind_10_bin'].nunique()

In [None]:
crosstab_ps_ind_10_bin = pd.crosstab(df['ps_ind_10_bin'],target)
crosstab_ps_ind_10_bin

In [None]:
chi2_test('ps_ind_10_bin')

- Correlation doesn't exsist between ps_ind_10_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_ind_10_bin','Dropped'] = True

### Feature ps_ind_11_bin

In [None]:
df['ps_ind_11_bin'].nunique()

In [None]:
crosstab_ps_ind_11_bin = pd.crosstab(df['ps_ind_11_bin'],target)
crosstab_ps_ind_11_bin

In [None]:
chi2_test('ps_ind_11_bin')

- Correlation doesn't exsist between ps_ind_11_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_ind_11_bin','Dropped'] = True

### Feature ps_ind_12_bin

In [None]:
df['ps_ind_12_bin'].nunique()

In [None]:
crosstab_ps_ind_12_bin = pd.crosstab(df['ps_ind_12_bin'],target)
crosstab_ps_ind_12_bin

In [None]:
chi2_test('ps_ind_12_bin')

- Correlation exsist between ps_ind_12_bin and target
- Considering this feature in to the model building

### Feature ps_ind_13_bin

In [None]:
df['ps_ind_13_bin'].nunique()

In [None]:
crosstab_ps_ind_13_bin = pd.crosstab(df['ps_ind_13_bin'],target)
crosstab_ps_ind_13_bin

In [None]:
chi2_test('ps_ind_13_bin')

- Correlation doesn't exsist between ps_ind_13_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_ind_13_bin','Dropped'] = True

### Feature ps_ind_16_bin

In [None]:
df['ps_ind_16_bin'].nunique()

In [None]:
crosstab_ps_ind_16_bin = pd.crosstab(df['ps_ind_16_bin'],target)
crosstab_ps_ind_16_bin

In [None]:
chi2_test('ps_ind_16_bin')

- Correlation exsist between ps_ind_16_bin and target
- Considering this feature in to the model building

### Feature ps_ind_17_bin

In [None]:
df['ps_ind_17_bin'].nunique()

In [None]:
crosstab_ps_ind_17_bin = pd.crosstab(df['ps_ind_17_bin'],target)
crosstab_ps_ind_17_bin

In [None]:
chi2_test('ps_ind_17_bin')

- Correlation exsist between ps_ind_17_bin and target
- Considering this feature in to the model building

### Feature ps_ind_18_bin

In [None]:
df['ps_ind_18_bin'].nunique()

In [None]:
crosstab_ps_ind_18_bin = pd.crosstab(df['ps_ind_18_bin'],target)
crosstab_ps_ind_18_bin

In [None]:
chi2_test('ps_ind_18_bin')

- Correlation exsist between ps_ind_18_bin and target
- Considering this feature in to the model building

### Feature ps_car_01_cat

In [None]:
df['ps_car_01_cat'].nunique()

In [None]:
crosstab_ps_car_01_cat = pd.crosstab(df['ps_car_01_cat'],target)
crosstab_ps_car_01_cat

In [None]:
chi2_test('ps_car_01_cat')

- Correlation exsist between ps_car_01_cat and target
- Considering this feature in to the model building

### Feature ps_car_02_cat

In [None]:
df['ps_car_02_cat'].nunique()

In [None]:
crosstab_ps_car_02_cat = pd.crosstab(df['ps_car_02_cat'],target)
crosstab_ps_car_02_cat

In [None]:
chi2_test('ps_car_02_cat')

- Correlation exsist between ps_car_02_cat and target
- Considering this feature in to the model building

### Feature ps_car_04_cat

In [None]:
df['ps_car_04_cat'].nunique()

In [None]:
crosstab_ps_car_04_cat = pd.crosstab(df['ps_car_04_cat'],target)
crosstab_ps_car_04_cat

In [None]:
chi2_test('ps_car_04_cat')

- Correlation exsist between ps_car_04_cat and target
- Considering this feature in to the model building

### Feature ps_car_05_cat

In [None]:
df['ps_car_05_cat'].nunique()

In [None]:
crosstab_ps_car_05_cat = pd.crosstab(df['ps_car_05_cat'],target)
crosstab_ps_car_05_cat

In [None]:
chi2_test('ps_car_05_cat')

- Correlation exsist between ps_car_05_cat and target
- Considering this feature in to the model building

### Feature ps_car_06_cat

In [None]:
df['ps_car_06_cat'].nunique()

In [None]:
crosstab_ps_car_06_cat = pd.crosstab(df['ps_car_06_cat'],target)
crosstab_ps_car_06_cat

In [None]:
chi2_test('ps_car_06_cat')

- Correlation exsist between ps_car_06_cat and target
- Considering this feature in to the model building

### Feature ps_car_07_cat

In [None]:
df['ps_car_07_cat'].nunique()

In [None]:
crosstab_ps_car_07_cat = pd.crosstab(df['ps_car_07_cat'],target)
crosstab_ps_car_07_cat

In [None]:
chi2_test('ps_car_07_cat')

- Correlation exsist between ps_car_07_cat and target
- Considering this feature in to the model building

### Feature ps_car_08_cat

In [None]:
df['ps_car_08_cat'].nunique()

In [None]:
crosstab_ps_car_08_cat = pd.crosstab(df['ps_car_08_cat'],target)
crosstab_ps_car_08_cat

In [None]:
chi2_test('ps_car_08_cat')

- Correlation exsist between ps_car_08_cat and target
- Considering this feature in to the model building

### Feature ps_car_09_cat

In [None]:
df['ps_car_09_cat'].nunique()

In [None]:
crosstab_ps_car_09_cat = pd.crosstab(df['ps_car_09_cat'],target)
crosstab_ps_car_09_cat

In [None]:
chi2_test('ps_car_09_cat')

- Correlation exsist between ps_car_09_cat and target
- Considering this feature in to the model building

### Feature ps_car_10_cat

In [None]:
df['ps_car_10_cat'].nunique()

In [None]:
crosstab_ps_car_10_cat = pd.crosstab(df['ps_car_10_cat'],target)
crosstab_ps_car_10_cat

In [None]:
chi2_test('ps_car_10_cat')

- Correlation doesn't exsist between ps_car_10_cat and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_car_10_cat','Dropped'] = True

### Feature ps_car_11_cat

In [None]:
df['ps_car_11_cat'].nunique()

In [None]:
crosstab_ps_car_11_cat = pd.crosstab(df['ps_car_11_cat'],target)
crosstab_ps_car_11_cat

In [None]:
chi2_test('ps_car_11_cat')

- Correlation exsist between ps_car_11_cat and target
- Considering this feature in to the model building

### Feature ps_calc_15_bin

In [None]:
df['ps_calc_15_bin'].nunique()

In [None]:
crosstab_ps_calc_15_bin = pd.crosstab(df['ps_calc_15_bin'],target)
crosstab_ps_calc_15_bin

In [None]:
chi2_test('ps_calc_15_bin')

- Correlation doesn't exsist between ps_calc_15_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_15_bin','Dropped']= True

### Feature ps_calc_16_bin

In [None]:
df['ps_calc_16_bin'].nunique()

In [None]:
crosstab_ps_calc_16_bin = pd.crosstab(df['ps_calc_16_bin'],target)
crosstab_ps_calc_16_bin

In [None]:
chi2_test('ps_calc_16_bin')

- Correlation doesn't exsist between ps_calc_16_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_16_bin','Dropped'] = True

### Feature ps_calc_17_bin

In [None]:
df['ps_calc_17_bin'].nunique()

In [None]:
crosstab_ps_calc_17_bin = pd.crosstab(df['ps_calc_17_bin'],target)
crosstab_ps_calc_17_bin

In [None]:
chi2_test('ps_calc_17_bin')

- Correlation doesn't exsist between ps_calc_17_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_17_bin','Dropped'] = True

### Feature ps_calc_18_bin

In [None]:
df['ps_calc_18_bin'].nunique()

In [None]:
crosstab_ps_calc_18_bin = pd.crosstab(df['ps_calc_18_bin'],target)
crosstab_ps_calc_18_bin

In [None]:
chi2_test('ps_calc_18_bin')

- Correlation doesn't exsist between ps_calc_18_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_18_bin','Dropped'] = True

### Feature ps_calc_19_bin

In [None]:
df['ps_calc_19_bin'].nunique()

In [None]:
crosstab_ps_calc_19_bin = pd.crosstab(df['ps_calc_19_bin'],target)
crosstab_ps_calc_19_bin

In [None]:
chi2_test('ps_calc_19_bin')

- Correlation doesn't exsist between ps_calc_19_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_19_bin','Dropped'] = True

### Feature ps_calc_20_bin

In [None]:
df['ps_calc_20_bin'].nunique()

In [None]:
crosstab_ps_calc_20_bin = pd.crosstab(df['ps_calc_20_bin'],target)
crosstab_ps_calc_20_bin

In [None]:
chi2_test('ps_calc_20_bin')

- Correlation doesn't exsist between ps_calc_20_bin and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_20_bin','Dropped'] = True

### Feature ps_ind_14

In [None]:
df['ps_ind_14'].nunique()

In [None]:
df_metedata.loc['ps_ind_14','DTypes']

- It is highly improblable for the feature ps_ind_14 to be a Continious with only 5 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_ind_14','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_ind_14 = pd.crosstab(df['ps_ind_14'],target)
crosstab_ps_ind_14

In [None]:
chi2_test('ps_ind_14')

- Correlation exsist between ps_ind_14 and target
- Considering this feature in to the model building

### Feature ps_ind_15

In [None]:
df['ps_ind_15'].nunique()

- It is highly improblable for the feature ps_ind_15 to be a Continious with only 14 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_ind_15','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_ind_15 = pd.crosstab(df['ps_ind_15'],target)
crosstab_ps_ind_15

In [None]:
chi2_test('ps_ind_15')

- Correlation exsist between ps_ind_15 and target
- Considering this feature in to the model building

### Feature ps_reg_01

In [None]:
df['ps_reg_01'].nunique()

In [None]:
df_metedata.loc['ps_reg_01','DTypes']

In [None]:
df['ps_reg_01']

- It is highly improblable for the feature ps_reg_01 to be a Continious with only 10 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_reg_01','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_reg_01 = pd.crosstab(df['ps_reg_01'],target)
crosstab_ps_reg_01

In [None]:
chi2_test('ps_reg_01')

- Correlation exsist between ps_reg_01 and target
- Considering this feature in to the model building

### Feature ps_reg_02

In [None]:
df['ps_reg_02'].nunique()

In [None]:
df_metedata.loc['ps_reg_02','DTypes']

In [None]:
df['ps_reg_02'].value_counts()

- It is highly improblable for the feature ps_reg_02 to be a Continious with only 19 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_reg_02','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_reg_02 = pd.crosstab(df['ps_reg_02'],target)
crosstab_ps_reg_02

In [None]:
chi2_test('ps_reg_02')

- Correlation exsist between ps_reg_02 and target
- Considering this feature in to the model building

### Feature ps_reg_03

In [None]:
df['ps_reg_03'].nunique()

In [None]:
df_metedata.loc['ps_reg_03','DTypes']

In [None]:
df['ps_reg_03'].value_counts()

- It is continious Varible

In [None]:
df['ps_reg_03'].max()

In [None]:
df['ps_reg_03'].min()

In [None]:
# Apply the grid style first: seaborn styles only affect axes created after
# the call, so setting it after plt.subplots() left this figure unstyled.
sns.set_style("whitegrid")
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))
# Top: distribution of the feature. Bottom: its spread per target class, with
# the class means marked.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the environment's seaborn version is confirmed.
sns.distplot(df['ps_reg_03'], ax=ax1)
sns.boxplot(x=target, y=df['ps_reg_03'], showmeans=True, ax=ax2);

In [None]:
anova_test('ps_reg_03')

- Correlation exsist between ps_reg_03 and target
- Considering this feature in to the model building

### Feature ps_car_11

In [None]:
df['ps_car_11'].nunique()

In [None]:
df['ps_car_11'].dtype

In [None]:
df_metedata.loc['ps_car_11','DTypes']

- It is highly improblable for the feature ps_car_11 to be a Continious with only 5 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_car_11','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_car_11 = pd.crosstab(df['ps_car_11'],target)
crosstab_ps_car_11

In [None]:
chi2_test('ps_car_11')

- Correlation exsist between ps_car_11 and target
- Considering this feature in to the model building

### Feature ps_car_12

In [None]:
df['ps_car_12'].nunique()

In [None]:
df_metedata.loc['ps_car_12','DTypes']

In [None]:
df['ps_car_12'].value_counts()

- It is continious Varible

In [None]:
# Two stacked panels: distribution of ps_car_12 on top, box plot per target
# class (means marked) below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))
sns.set_style("whitegrid")
sns.distplot(df['ps_car_12'], ax=ax1)
sns.boxplot(x=target, y=df['ps_car_12'], showmeans=True, ax=ax2)

In [None]:
df['ps_car_12'].max()

In [None]:
df['ps_car_12'].min()

In [None]:
anova_test('ps_car_12')

- Correlation exsist between ps_car_12 and target
- Considering this feature in to the model building

### Feature ps_car_13

In [None]:
df['ps_car_13'].nunique()

In [None]:
df_metedata.loc['ps_car_13','DTypes']

In [None]:
df['ps_car_13'].value_counts()

- It is Continious Varible

In [None]:
# Two stacked panels: distribution of ps_car_13 on top, box plot per target
# class (means marked) below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
sns.distplot(df['ps_car_13'], ax=ax1)
sns.boxplot(x=target, y=df['ps_car_13'], showmeans=True, ax=ax2)

In [None]:
df['ps_car_13'].max()

In [None]:
anova_test('ps_car_13')

- Correlation exsist between ps_car_13 and target
- Considering this feature in to the model building

### Feature ps_car_14

In [None]:
df['ps_car_14'].nunique()

In [None]:
df_metedata.loc['ps_car_14','DTypes']

In [None]:
df['ps_car_14'].value_counts()

- It is continious varible

In [None]:
# Two stacked panels: distribution of ps_car_14 on top, box plot per target
# class (means marked) below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
sns.distplot(df['ps_car_14'], ax=ax1)
sns.boxplot(x=target, y=df['ps_car_14'], showmeans=True, ax=ax2)

In [None]:
df['ps_car_14'].max()

In [None]:
df['ps_car_14'].min()

In [None]:
anova_test('ps_car_14')

- Correlation exsist between ps_car_14 and target
- Considering this feature in to the model building

### Feature ps_car_15

In [None]:
df['ps_car_15'].nunique()

In [None]:
df_metedata.loc['ps_car_15','DTypes']

In [None]:
df['ps_car_15'].value_counts()

- It is highly improblable for the feature ps_car_15 to be a Continious with only 15 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_car_15','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_car_15 = pd.crosstab(df['ps_car_15'],target)
crosstab_ps_car_15

In [None]:
chi2_test('ps_car_15')

- Correlation exsist between ps_car_15 and target
- Considering this feature in to the model building

### Feature ps_calc_01

In [None]:
df['ps_calc_01'].nunique()

In [None]:
df_metedata.loc['ps_calc_01','DTypes']

In [None]:
df['ps_calc_01'].value_counts()

- It is highly improblable for the feature ps_calc_01 to be a Continious with only 10 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_01','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_01 = pd.crosstab(df['ps_calc_01'],target)
crosstab_ps_calc_01

In [None]:
chi2_test('ps_calc_01')

- Correlation doesn't exsist between ps_calc_01 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_01','Dropped'] =True

### Feature ps_calc_02

In [None]:
df['ps_calc_02'].nunique()

In [None]:
df_metedata.loc['ps_calc_02','DTypes']

In [None]:
df['ps_calc_02'].value_counts()

- It is highly improblable for the feature ps_calc_02 to be a Continious with only 10 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_02','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_02 = pd.crosstab(df['ps_calc_02'],target)
crosstab_ps_calc_02

In [None]:
chi2_test('ps_calc_02')

- Correlation doesn't exsist between ps_calc_02 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_02','Dropped'] =True

### Feature ps_calc_03

In [None]:
df['ps_calc_03'].nunique()

In [None]:
df_metedata.loc['ps_calc_03','DTypes']

In [None]:
df['ps_calc_03'].value_counts()

- It is highly improblable for the feature ps_calc_03 to be a Continious with only 10 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_03','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_03 = pd.crosstab(df['ps_calc_03'],target)
crosstab_ps_calc_03

In [None]:
chi2_test('ps_calc_03')

- Correlation doesn't exsist between ps_calc_03 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_03','Dropped'] =True

### Feature ps_calc_04

In [None]:
df['ps_calc_04'].nunique()

In [None]:
df_metedata.loc['ps_calc_04','DTypes']

In [None]:
df['ps_calc_04'].value_counts()

- It is highly improblable for the feature ps_calc_04 to be a Continious with only 6 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_04','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_04 = pd.crosstab(df['ps_calc_04'],target)
crosstab_ps_calc_04

In [None]:
chi2_test('ps_calc_04')

- Correlation doesn't exsist between ps_calc_04 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_04','Dropped'] =True

### Feature ps_calc_05

In [None]:
df['ps_calc_05'].nunique()

In [None]:
df_metedata.loc['ps_calc_05','DTypes']

In [None]:
df['ps_calc_05'].value_counts()

- It is highly improblable for the feature ps_calc_05 to be a Continious with only 7 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_05','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_05 = pd.crosstab(df['ps_calc_05'],target)
crosstab_ps_calc_05

In [None]:
chi2_test('ps_calc_05')

- Correlation doesn't exsist between ps_calc_05 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_05','Dropped'] =True

### Feature ps_calc_06

In [None]:
df['ps_calc_06'].nunique()

In [None]:
df_metedata.loc['ps_calc_06','DTypes']

In [None]:
df['ps_calc_06'].value_counts()

- It is highly improblable for the feature ps_calc_06 to be a Continious with only 11 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_06','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_06 = pd.crosstab(df['ps_calc_06'],target)
crosstab_ps_calc_06

In [None]:
chi2_test('ps_calc_06')

- Correlation doesn't exsist between ps_calc_06 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_06','Dropped'] =True

### Feature ps_calc_07

In [None]:
df['ps_calc_07'].nunique()

In [None]:
df_metedata.loc['ps_calc_07','DTypes']

In [None]:
df['ps_calc_07'].value_counts()

- It is highly improblable for the feature ps_calc_07 to be a Continious with only 10 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_07','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_07 = pd.crosstab(df['ps_calc_07'],target)
crosstab_ps_calc_07

In [None]:
chi2_test('ps_calc_07')

- Correlation doesn't exsist between ps_calc_07 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_07','Dropped'] =True

### Feature ps_calc_08

In [None]:
df['ps_calc_08'].nunique()

In [None]:
df_metedata.loc['ps_calc_08','DTypes']

In [None]:
df['ps_calc_08'].value_counts()

- It is highly improblable for the feature ps_calc_08 to be a Continious with only 11 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_08','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_08 = pd.crosstab(df['ps_calc_08'],target)
crosstab_ps_calc_08

In [None]:
chi2_test('ps_calc_08')

- Correlation doesn't exsist between ps_calc_08 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_08','Dropped'] =True

### Feature ps_calc_09

In [None]:
df['ps_calc_09'].nunique()

In [None]:
df_metedata.loc['ps_calc_09','DTypes']

In [None]:
df['ps_calc_09'].value_counts()

- It is highly improblable for the feature ps_calc_09 to be a Continious with only 8 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_09','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_09 = pd.crosstab(df['ps_calc_09'],target)
crosstab_ps_calc_09

In [None]:
chi2_test('ps_calc_09')

- Correlation doesn't exsist between ps_calc_09 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_09','Dropped'] =True

### Feature ps_calc_10

In [None]:
df['ps_calc_10'].nunique()

In [None]:
df_metedata.loc['ps_calc_10','DTypes']

In [None]:
df['ps_calc_10'].value_counts()

- It is highly improbable for the feature ps_calc_10 to be continuous with only 26 unique values among 595212 records, so it must be ordinal; the data looks that way too

In [None]:
df_metedata.loc['ps_calc_10','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_10 = pd.crosstab(df['ps_calc_10'],target)
crosstab_ps_calc_10

In [None]:
chi2_test('ps_calc_10')

- Correlation doesn't exsist between ps_calc_10 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_10','Dropped'] =True

### Feature ps_calc_11

In [None]:
df['ps_calc_11'].nunique()

In [None]:
df_metedata.loc['ps_calc_11','DTypes']

In [None]:
df['ps_calc_11'].value_counts()

- It is highly improblable for the feature ps_calc_11 to be a Continious with only 20 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_11','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_11 = pd.crosstab(df['ps_calc_11'],target)
crosstab_ps_calc_11

In [None]:
chi2_test('ps_calc_11')

- Correlation doesn't exsist between ps_calc_11 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_11','Dropped'] =True

### Feature ps_calc_12

In [None]:
df['ps_calc_12'].nunique()

In [None]:
df_metedata.loc['ps_calc_12','DTypes']

In [None]:
df['ps_calc_12'].value_counts()

- It is highly improblable for the feature ps_calc_12 to be a Continious with only 11 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_12','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_12 = pd.crosstab(df['ps_calc_12'],target)
crosstab_ps_calc_12

In [None]:
chi2_test('ps_calc_12')

- Correlation doesn't exsist between ps_calc_12 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_12','Dropped'] =True

### Feature ps_calc_13

In [None]:
df['ps_calc_13'].nunique()

In [None]:
df_metedata.loc['ps_calc_13','DTypes']

In [None]:
df['ps_calc_13'].value_counts()

- It is highly improblable for the feature ps_calc_13 to be a Continious with only 14 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_13','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_13 = pd.crosstab(df['ps_calc_13'],target)
crosstab_ps_calc_13

In [None]:
chi2_test('ps_calc_13')

- Correlation doesn't exsist between ps_calc_13 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_13','Dropped'] =True

### Feature ps_calc_14

In [None]:
df['ps_calc_14'].nunique()

In [None]:
df_metedata.loc['ps_calc_14','DTypes']

In [None]:
df['ps_calc_14'].value_counts()

- It is highly improblable for the feature ps_calc_14 to be a Continious with only 24 unique values among 595212 records..so it has to be ordinal even data seems like that

In [None]:
df_metedata.loc['ps_calc_14','DTypes'] = 'Ordinal'

In [None]:
crosstab_ps_calc_14 = pd.crosstab(df['ps_calc_14'],target)
crosstab_ps_calc_14

In [None]:
chi2_test('ps_calc_14')

- Correlation doesn't exsist between ps_calc_14 and target
- Not Considering this feature in to the model building

In [None]:
df_metedata.loc['ps_calc_14','Dropped'] =True

In [None]:
df_metedata

## MISSING VALUES AS PER NEW DATA TYPES

In [None]:
# Re-read the raw training data and split the label column off from it.
df_new = pd.read_csv(input_path + 'train.csv')
target_new = df_new.pop('target')  # removes 'target' from df_new and returns it

In [None]:
df_new.replace(to_replace=-1,value=np.nan,inplace=True)

In [None]:
# Remove every column that the metadata frame flags as dropped.
dropped_cols = [name for name in df_new.columns if df_metedata.loc[name, 'Dropped']]
df_new.drop(columns=dropped_cols, inplace=True)

In [None]:
# Impute the remaining columns according to the reviewed type labels:
# mode for 'Categorical' / 'Ordinal' features, mean for everything else.
# The fill value is computed once, before filling, and stored in the metadata.
for col in df_new.columns:
    if df_metedata.loc[col, 'DTypes'] in ('Categorical', 'Ordinal'):
        fill_value = df_new[col].mode()[0]
    else:
        fill_value = df_new[col].mean()
    # Assign the result back instead of calling fillna(inplace=True) on
    # df_new[col]: the in-place call on a selected column is chained
    # assignment and is not guaranteed to update df_new.
    df_new[col] = df_new[col].fillna(fill_value)
    df_metedata.loc[col, 'Missing'] = fill_value
        

In [None]:
df_new.shape

### Pickling the Metadata Dataframe

In [None]:
import os
import pickle

working_path = '/kaggle/working'
# Build the path with os.path.join: plain string concatenation dropped the
# separator and produced '/kaggle/workingporto-seguro-...', which lies outside
# the working directory.
metadata_pickle_path = os.path.join(
    working_path, 'porto-seguro-safe-driver-prediction_df_metedata_pickle')
# The context manager closes the file once the dump has been written.
with open(metadata_pickle_path, 'wb') as pickle_file:
    pickle.dump(df_metedata, pickle_file)