## Chi-square Test

* The chi-square statistic is commonly used to test whether two categorical variables are related. In a classification task it can be used to score how strongly each categorical feature is associated with the target.

In [1]:
import pandas as pd
import numpy as np

# Load the Titanic passenger data from the project-relative data/ directory
df = pd.read_csv('data/titanic.csv')
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.shape

(891, 12)

In [3]:
# Keep the target (Survived) plus four candidate features: Pclass, Sex, Age, Embarked.
# .copy() makes this an independent frame, so the column assignments in the
# following cells modify it directly and do not raise SettingWithCopyWarning.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Embarked']].copy()
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,22.0,S
1,1,1,female,38.0,C
2,1,3,female,26.0,S
3,1,1,female,35.0,S
4,0,3,male,35.0,S


In [4]:
# Encode Sex as an integer: male -> 1, female -> 0.
# The mapped column is assigned back to the frame instead of calling
# replace(..., inplace=True) on df['Sex']: that in-place call acts on a temporary
# Series and leaves df unchanged when pandas Copy-on-Write is enabled.
# Note: map() expects the raw 'male'/'female' labels; any other value becomes NaN.
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,1,22.0,S
1,1,1,0,38.0,C
2,1,3,0,26.0,S
3,1,1,0,35.0,S
4,0,3,1,35.0,S


In [5]:
# Label-encode Embarked: each distinct value gets an integer code (0, 1, 2, ...)
# in the order the values first appear in the column.
# NOTE(review): unique() also returns NaN when the column has missing values, so
# NaN gets its own entry in this mapping -- confirm that is the intended handling.
label = {port: code for code, port in enumerate(df['Embarked'].unique())}
df['Embarked'] = df['Embarked'].map(label)

In [6]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,1,22.0,0
1,1,1,0,38.0,1
2,1,3,0,26.0,0
3,1,1,0,35.0,0
4,0,3,1,35.0,0


In [7]:
# Replace Age with a 0/1 indicator: 1 where `Age == True`, otherwise 0.
# NOTE(review): for a numeric column `== True` is the same test as `== 1`, so this
# flags only passengers whose Age is exactly 1 (missing ages compare False and
# become 0). Confirm this is the intended encoding -- an age bucket or an
# "age is known" flag may have been meant instead.
df['Age'] = np.where(df['Age']==True, 1,0)

In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,1,0,0
1,1,1,0,0,1
2,1,3,0,0,0
3,1,1,0,0,0
4,0,3,1,0,0


In [9]:
# Feature matrix: every column except the target
X = df.drop(columns=['Survived'])
# Target vector: the Survived column
y = df['Survived']

In [10]:
# Split the dataset into train and test data: 30% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked
857,1,1,0,0
52,1,0,0,1
386,3,1,1,0
124,1,1,0,0
578,3,0,0,1


In [12]:
y_train.head()

857    1
52     1
386    0
124    0
578    0
Name: Survived, dtype: int64

In [13]:
# Sanity check: list the distinct values of the encoded Sex column in the training split
X_train['Sex'].unique()

array([1, 0], dtype=int64)

In [14]:
# Count missing values per feature in the training split before running the test
X_train.isnull().sum()

Pclass      0
Sex         0
Age         0
Embarked    0
dtype: int64

## Perform the chi-square test

In [15]:
from sklearn.feature_selection import chi2
# chi2 scores every feature column against the target and returns a pair of
# arrays: (chi-square statistics, p-values), one entry per feature.
# Despite its name, `p_values` holds that whole pair at this point; the p-values
# themselves are the second element.
p_values = chi2(X_train, y_train)
p_values

(array([21.61080949, 63.55447864,  0.94202078, 11.83961845]),
 array([3.33964360e-06, 1.55992554e-15, 3.31758690e-01, 5.79837058e-04]))

In [16]:
# Keep only the p-value array (second element of the chi2 result) and label
# each entry with the name of the feature it belongs to
p_values = pd.Series(p_values[1], index=X_train.columns)
p_values

Pclass      3.339644e-06
Sex         1.559926e-15
Age         3.317587e-01
Embarked    5.798371e-04
dtype: float64

In [17]:
# Rank the features by p-value, smallest first (smaller p-value = stronger
# evidence of an association with the target).
# sort_values orders by the p-values themselves; sort_index would order by
# feature name, which is unrelated to the scores.
p_values.sort_values(ascending=True)

Sex         1.559926e-15
Pclass      3.339644e-06
Embarked    5.798371e-04
Age         3.317587e-01
dtype: float64

**Here, 'Sex' has the smallest p-value, so it is the feature most strongly associated with the output feature 'Survived'.**