In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Absolute path of the interim training set: <working dir>/../data/interim/train_set.csv
INTERIM_DATA_PATH = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'data', 'interim', 'train_set.csv')
)

In [15]:
# Absolute path of the preprocessing code directory: <working dir>/../src/data
# (used as the output folder for the list of discarded columns)
PREPROCESSING_DIRECTORY = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'src', 'data')
)

In [3]:
# Load the interim training set into memory
df = pd.read_csv(filepath_or_buffer=INTERIM_DATA_PATH)

In [11]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    """Chi-square test of independence between columns of a DataFrame.

    Each call to ``TestIndependence`` cross-tabulates one column against
    another, runs ``scipy.stats.chi2_contingency`` on the resulting
    contingency table, stores the outcome on the instance and prints a
    one-line verdict. Columns whose p-value is not below ``alpha`` are
    collected in ``noImportantCols``.
    """

    def __init__(self, dataframe):
        """Keep a reference to the frame and reset all result attributes.

        Args:
            dataframe: pandas DataFrame containing every column that will
                later be passed to ``TestIndependence``.
        """
        self.df = dataframe
        self.p = None  # p-value of the most recent test
        self.chi2 = None  # chi-square statistic of the most recent test
        self.dof = None  # degrees of freedom of the most recent test

        self.dfObserved = None  # observed contingency table (rows: colY, columns: colX)
        self.dfExpected = None  # expected frequencies, labelled like dfObserved
        self.noImportantCols = []  # columns that failed the test, without duplicates

    def _print_chisquare_result(self, colX, alpha):
        """Print the verdict for ``colX`` and record it if it is not significant.

        Args:
            colX: name of the column that was just tested.
            alpha: significance level; the column counts as important only
                when the stored p-value is strictly below it.
        """
        if self.p < alpha:
            result = "{0} is IMPORTANT for Prediction".format(colX)
        else:
            result = "{0} is NOT an important predictor. (Discard {0} from model)".format(colX)
            # Guard against duplicates: the same column may be tested again
            # (e.g. when a notebook cell is re-run on the same instance), and
            # this list is meant to be a set of column names to discard.
            if colX not in self.noImportantCols:
                self.noImportantCols.append(colX)

        print(result)

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Run a chi-square test of independence of ``colX`` against ``colY``.

        Both columns are cast to ``str`` first, so every distinct value
        (including numeric ones and missing values) becomes its own category.
        Results are stored in ``p``, ``chi2``, ``dof``, ``dfObserved`` and
        ``dfExpected``; the verdict is printed.

        Args:
            colX: name of the column being evaluated.
            colY: name of the column it is tested against.
            alpha: significance level, 0.05 by default.
        """
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)

        self.dfObserved = pd.crosstab(Y, X)
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof

        # Wrap the expected frequencies with the same labels as the observed table
        self.dfExpected = pd.DataFrame(
            expected, columns=self.dfObserved.columns, index=self.dfObserved.index
        )

        self._print_chisquare_result(colX, alpha)

In [5]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an earlier to_csv - TODO confirm).
# Reassigning with errors='ignore' instead of dropping in place keeps this cell re-runnable:
# the in-place version raises KeyError on a second run because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
# Test every non-target column against 'Claim'; rejected columns end up in ct.noImportantCols.
# NOTE: ChiSquare casts each column to str, so numeric columns are treated as categorical too.
X = df.drop(columns=['Claim'])
ct = ChiSquare(df)
for c in X.columns:
    ct.TestIndependence(colX=c, colY='Claim')

Agency is IMPORTANT for Prediction
Agency Type is IMPORTANT for Prediction
Distribution Channel is IMPORTANT for Prediction
Duration is IMPORTANT for Prediction
Net Sales is IMPORTANT for Prediction
Commision (in value) is IMPORTANT for Prediction
Age is IMPORTANT for Prediction
Product Name_1 way Comprehensive Plan is IMPORTANT for Prediction
Product Name_2 way Comprehensive Plan is IMPORTANT for Prediction
Product Name_24 Protect is NOT an important predictor. (Discard Product Name_24 Protect from model)
Product Name_Annual Gold Plan is IMPORTANT for Prediction
Product Name_Annual Silver Plan is IMPORTANT for Prediction
Product Name_Annual Travel Protect Gold is IMPORTANT for Prediction
Product Name_Annual Travel Protect Platinum is NOT an important predictor. (Discard Product Name_Annual Travel Protect Platinum from model)
Product Name_Annual Travel Protect Silver is NOT an important predictor. (Discard Product Name_Annual Travel Protect Silver from model)
Product Name_Basic Plan is

Destination_MAURITIUS is NOT an important predictor. (Discard Destination_MAURITIUS from model)
Destination_MEXICO is NOT an important predictor. (Discard Destination_MEXICO from model)
Destination_MOLDOVA, REPUBLIC OF is IMPORTANT for Prediction
Destination_MONGOLIA is NOT an important predictor. (Discard Destination_MONGOLIA from model)
Destination_MOROCCO is NOT an important predictor. (Discard Destination_MOROCCO from model)
Destination_MYANMAR is IMPORTANT for Prediction
Destination_NAMIBIA is IMPORTANT for Prediction
Destination_NEPAL is NOT an important predictor. (Discard Destination_NEPAL from model)
Destination_NETHERLANDS is NOT an important predictor. (Discard Destination_NETHERLANDS from model)
Destination_NEW CALEDONIA is IMPORTANT for Prediction
Destination_NEW ZEALAND is NOT an important predictor. (Discard Destination_NEW ZEALAND from model)
Destination_NIGERIA is IMPORTANT for Prediction
Destination_NORTHERN MARIANA ISLANDS is IMPORTANT for Prediction
Destination_NORW

**Insights**
According to the chi-square test of independence (significance level 0.05), 91 columns show no significant association with `Claim` and are marked as not important.

In [19]:
# Put the rejected column names into a single-column DataFrame so they can be written to disk
noImportantCols = pd.DataFrame(data=ct.noImportantCols)

In [20]:
# Save the rejected column names into the preprocessing directory.
# NOTE(review): the file name spelling ('noImoportantCols') is kept as is, since other code may read it by this name - confirm before renaming.
noImportantCols.to_csv(
    path_or_buf=os.path.join(PREPROCESSING_DIRECTORY, 'noImoportantCols.csv')
)