# Chi Square

In [19]:
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from numpy import array

In [3]:
# Read data.csv (path is relative to the working directory) and display the frame.
df = pd.read_csv("data.csv")
df

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities
0,RL,Pave,Reg,Lvl,AllPub
1,RL,Pave,Reg,Lvl,AllPub
2,RL,Pave,IR1,Lvl,AllPub
3,RL,Pave,IR1,Lvl,AllPub
4,RL,Pave,IR1,Lvl,AllPub
...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub
1456,RL,Pave,Reg,Lvl,AllPub
1457,RL,Pave,Reg,Lvl,AllPub
1458,RL,Pave,Reg,Lvl,AllPub


In [4]:
# Number of distinct values in each column.
df.nunique()

MSZoning       5
Street         2
LotShape       4
LandContour    4
Utilities      2
dtype: int64

In [5]:
# Number of missing values in each column.
df.isna().sum()

MSZoning       0
Street         0
LotShape       0
LandContour    0
Utilities      0
dtype: int64

In [6]:
# Replace each categorical column with ordinal integer codes (stored as floats)
# so that the scoring functions below receive numeric input.
# NOTE(review): sklearn's chi2 is documented for non-negative counts/booleans;
# ordinal codes are an approximation -- confirm this is acceptable for the analysis.
categorical_cols = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities']
enc = OrdinalEncoder()
df[categorical_cols] = enc.fit_transform(df[categorical_cols])
df.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities
0,3.0,1.0,3.0,3.0,0.0
1,3.0,1.0,3.0,3.0,0.0
2,3.0,1.0,0.0,3.0,0.0
3,3.0,1.0,0.0,3.0,0.0
4,3.0,1.0,0.0,3.0,0.0


In [7]:

# Re-check the distinct-value counts per column after encoding.
df.nunique()

MSZoning       5
Street         2
LotShape       4
LandContour    4
Utilities      2
dtype: int64

In [8]:
# Features are the first four columns; the target is the last column.
X, y = df.iloc[:, :4], df.iloc[:, -1]
X

Unnamed: 0,MSZoning,Street,LotShape,LandContour
0,3.0,1.0,3.0,3.0
1,3.0,1.0,3.0,3.0
2,3.0,1.0,0.0,3.0
3,3.0,1.0,0.0,3.0
4,3.0,1.0,0.0,3.0
...,...,...,...,...
1455,3.0,1.0,3.0,3.0
1456,3.0,1.0,3.0,3.0
1457,3.0,1.0,3.0,3.0
1458,3.0,1.0,3.0,3.0


In [9]:
# Hold out 30% of the rows (fixed seed for reproducibility) and score the
# features on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# Despite its name, f_score holds chi2's result: a pair of arrays,
# (chi-squared statistics, p-values), with one entry per feature column.
f_score = chi2(X_train, y_train)
f_score

(array([1.98041073e-04, 2.40764978e-05, 1.92654261e+00, 1.58801266e-02]),
 array([0.98877198, 0.99608497, 0.1651372 , 0.89971896]))

In [10]:
# The second element of the chi2 result is the array of p-values; wrap it in a Series.
pvalues = pd.Series(f_score[1])
pvalues

0    0.988772
1    0.996085
2    0.165137
3    0.899719
dtype: float64

In [11]:
# Label each p-value with the name of the feature column it belongs to.
pvalues.index = X_train.columns
pvalues

MSZoning       0.988772
Street         0.996085
LotShape       0.165137
LandContour    0.899719
dtype: float64

In [12]:
# Sort from largest to smallest p-value: the least significant feature comes first,
# the most significant one is at the bottom.
pvalues.sort_values(ascending=False)

Street         0.996085
MSZoning       0.988772
LandContour    0.899719
LotShape       0.165137
dtype: float64

In [13]:
# We prefer the features whose p-values are smallest.
# As a common practice, we keep features whose p-value is < 0.05.
#

# SelectKBest 

In [31]:

# Keep the two features with the highest chi-squared scores, fitted on the training split.
selector = SelectKBest(score_func=chi2, k=2).fit(X_train, y_train)
cols_idxs = selector.get_support(indices=True)  # integer positions of the kept columns
X_new = X_train.iloc[:, cols_idxs]  # final features
X_new

Unnamed: 0,LotShape,LandContour
64,3.0,3.0
682,3.0,1.0
960,0.0,3.0
1384,3.0,3.0
1100,3.0,0.0
...,...,...
763,3.0,3.0
835,3.0,3.0
1216,3.0,3.0
559,3.0,3.0
