## The Chi Square Test Of Independence

In [1]:
# Importing Important Libraries
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import pandas as pd

In [2]:
# Loading dataset with the help of pandas
data = pd.read_csv("housingData-Real.csv")

In [3]:
# head of the data
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Allfeatures of the dataset
data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [5]:
# info about attributes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [6]:
# The following class contains functions that are used to give important features

In [7]:
class ChiSquare:
    def __init__(self, table):
        self.observedTable = table     # The passes contingency matrix is observed table
        self.dof = None                # defined in function "degreeOfFreedom()"
        self.expectedTable = list(map(list, self.observedTable))  # matrix with the same shape as observedtable
        self.criticalValue = None                               # Value obatined from chi square distribution table
        self.chiSquareValue = 0                         # based on differences of observed and expected frequencies
        self.alpha = 0.05                                # threshold value initialised with some value to make 
                                           # decision about null hypothesis, known as level of significance
        self.pValue = None                 # compared with alpha to give result
        
    def calculateExpectedTable(self):
        rows, cols = np.shape(self.observedTable)     # shape same as that of observed table
        rowSum = [0 for _ in range(rows)]
        colSum = [0 for _ in range(cols)]
        totalSum = 0
        for i in range(rows):
            for j in range(cols):
                rowSum[i] += self.observedTable[i][j] 
                colSum[j] += self.observedTable[i][j]
                totalSum += self.observedTable[i][j]
        for i in range(rows):
            for j in range(cols):                                     # Cell values corresponds to marginal sum
                self.expectedTable[i][j] = (rowSum[i]*colSum[j])/totalSum 
        return self.expectedTable
    
    def degreeOfFreedom(self):
        rows, cols = np.shape(self.expectedTable)
        self.dof = (rows - 1)*(cols - 1)
        return self.dof
    
    def findTestStatistics(self):
        rows, cols = np.shape(self.expectedTable)
        chiSquareValue = 0
        for row in range(rows):
            for col in range(cols):
                obsValue = self.observedTable[row][col]
                expValue = self.expectedTable[row][col]
                self.chiSquareValue += pow((obsValue - expValue ), 2)/expValue
        return self.chiSquareValue
    
    def findCriticalValue(self):                            # Obtained from chi square distribution table
        self.criticalValue = chi2.ppf(1-self.alpha, self.dof)
        return self.criticalValue
    
    def findPValue(self):                  # Obtained from chi sqaure distribution table
        _, self.p,_,_ = chi2_contingency(self.observedTable)
        return self.p
    
    def findParameters(self): # obtains all required values by calling their functions
        return self.calculateExpectedTable(), self.findPValue(), self.degreeOfFreedom(), self.findTestStatistics(), self.findCriticalValue()
        
    def printResult(self, firstColumn):           # Use to print reult, whether feature is important or not
        if abs(self.chiSquareValue) >= self.criticalValue:
            print(firstColumn, ": Important for prediction!")
        else:
            print(firstColumn, " : NOT Important for prediction!")

In [8]:
attributes = []
for col in data.columns:
    attributes.append(col)

In [9]:
# System not responding for those many attributes.
attributes.remove('id')
attributes.remove('date')
attributes.remove('zipcode')
attributes.remove('yr_renovated')
attributes.remove('yr_built')
attributes.remove('lat')
attributes.remove('sqft_lot15')
attributes.remove('view')
attributes.remove('condition')
attributes.remove('grade')
attributes.remove('sqft_lot')

In [10]:
attributes

['price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'floors',
 'waterfront',
 'sqft_above',
 'sqft_basement',
 'long',
 'sqft_living15']

In [11]:
# Adding a feature dummy so that my function can give it as "NOT Important" if it works good.
data['dummy'] = np.random.choice([0,1,2,3], size = (len(data)), p = [0.25, 0.25, 0.25, 0.25])
attributes.append('dummy')
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,dummy
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,1
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,1
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,0


In [12]:
def testIndependence(data, firstColumn, secondColumn):   # taking two features. "firstColumn" is being tested
    if firstColumn == secondColumn:                      # Against target feature "secondColumn" to determine
        return                                           # them whether they are important or not
    first = data[firstColumn]
    second = data[secondColumn]
    table = pd.crosstab(second, first)                  # contingency table using first and second
    chiSquareTest = ChiSquare(table.values)            # making object of the class
    expectedTable, pValue, dof, chiSquareValue, criticalValue = chiSquareTest.findParameters()  # finding important values
    chiSquareTest.printResult(firstColumn)             # printing result

In [13]:
for attribute in attributes:          # Each attribute is tested against "price" attribute
    testIndependence(data, firstColumn= attribute, secondColumn="price")

bedrooms : Important for prediction!
bathrooms : Important for prediction!
sqft_living : Important for prediction!
floors : Important for prediction!
waterfront : Important for prediction!
sqft_above : Important for prediction!
sqft_basement : Important for prediction!
long  : NOT Important for prediction!
sqft_living15 : Important for prediction!
dummy  : NOT Important for prediction!


In [13]:
# As added feature dummy also turns out "NOT Important", our test worked well

In [14]:
# Now as per our Test result we can't take dummy or "long" for prediction

In [15]:
# Selecting Columns

In [17]:
waterfront = np.array(data["price"])
bathrooms = np.array(data["bathrooms"])

In [18]:
# Creating Contingency matrix
table = np.array([waterfront, bathrooms])

In [19]:
chiSquareTest2 = ChiSquare(table)
expectedTable, pValue, dof, chiSquareValue, criticalValue = chiSquareTest2.findParameters()
print("The test statistic :",chiSquareValue)
print("The p-value of the test :",pValue)
print("Degrees of freedom :",dof)

# Taking Values upto 3 decimal points
print('critical = %.3f, stat = %.3f \n' % (criticalValue, chiSquareValue))

if abs(chiSquareValue) >= criticalValue:           # taking absolute value of the test statistics
    print("Dependent (Reject Null Hypothesis)\n")
else:
    print("Independent (Accept Null Hypothesis)\n\n")
    
    
alpha = chiSquareTest2.alpha     # Level of significance is 5%
print("significance = %.3f, p = %.3f\n" % (alpha, pValue))
if pValue <= alpha:
    print("Dependent (Reject Null Hypothesis)")
else:
    print("Independent (Accept Null Hypothesis)")

The test statistic : 11021.474715914688
The p-value of the test : 1.0
Degrees of freedom : 21612
critical = 21955.105, stat = 11021.475 

Independent (Accept Null Hypothesis)


significance = 0.050, p = 1.000

Independent (Accept Null Hypothesis)
