## Univariate

## Using ANOVA for Classification and Regression

## Classification Problem

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

In [3]:
# Read the Santander training set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='santander-train.csv')
data.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [4]:
# Separate the label column from the predictors.
# NOTE(review): only TARGET is removed, so the ID column stays in X as a feature — confirm this is intended.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

In [5]:
# Sanity check: feature matrix and target must have the same number of rows.
print(X.shape, y.shape, sep='\n')

(76020, 370)
(76020,)


In [6]:
# Hold out the default 25% of rows for testing. The seed is fixed so the split
# (and every feature count / score derived from it below) is reproducible on
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [7]:
### Removing Constants, Quasi Constants and Duplicate Features

In [8]:
# Learn, from the training split only, which features have variance above 0.01;
# constant and quasi-constant features fall below that threshold.
constant_filter = VarianceThreshold(threshold=0.01).fit(X_train)
constant_filter

VarianceThreshold(threshold=0.01)

In [9]:
# Apply the fitted variance filter to both splits so they keep the same columns.
X_train_filter, X_test_filter = (
    constant_filter.transform(split) for split in (X_train, X_test)
)

In [10]:
# Shapes after dropping the low-variance features.
print(X_train_filter.shape, X_test_filter.shape, sep='\n')

(57015, 266)
(19005, 266)


In [11]:
# Transpose so that each feature becomes a row; duplicated features can then
# be found as duplicated rows.
X_train_T = X_train_filter.transpose()
X_test_T = X_test_filter.transpose()

In [12]:
# The transformer output (and therefore its transpose) is a plain array, not a DataFrame.
print(type(X_train_T), type(X_test_T), sep='\n')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [13]:
# Wrap the transposed arrays in DataFrames so DataFrame.duplicated() is available.
X_train_T, X_test_T = pd.DataFrame(X_train_T), pd.DataFrame(X_test_T)

In [14]:
# Number of duplicate features: rows of the transposed frame that repeat an earlier row.
X_train_T.duplicated(keep='first').sum()

17

In [15]:
# Flag features (rows of the transposed training frame) that repeat an earlier one.
duplicated_features = X_train_T.duplicated()
# Keep-mask is the element-wise negation of the duplicate flags.
features_to_keep = (~duplicated_features).tolist()
# Use the mask learned on the training split for both splits, then transpose
# back to the usual samples-by-features layout.
X_train_unique = X_train_T.loc[features_to_keep].T
X_test_unique = X_test_T.loc[features_to_keep].T

In [16]:
# Shapes after removing the duplicated features.
print(X_train_unique.shape, X_test_unique.shape, sep='\n')

(57015, 249)
(19005, 249)


In [17]:
### f_classif returns a tuple of two arrays: the F-scores and the p-values, one entry per feature.
### Features whose p-value is below 0.05 are treated as important in this notebook.
sel = f_classif(X=X_train_unique, y=y_train)
print(type(sel))

<class 'tuple'>


In [18]:
# Display the raw f_classif result: first array holds the F-scores, second the p-values.
sel

(array([2.87445436e+00, 6.61551963e-01, 5.69066991e+02, 8.45840584e-02,
        2.98673532e+00, 1.80142083e-01, 3.11060896e-01, 9.16203203e-02,
        1.96847973e+01, 2.06837460e+01, 1.17844827e-01, 2.78692450e+00,
        2.97153529e-01, 6.24094678e+01, 2.72882366e+01, 4.72952767e+01,
        6.58597969e+01, 2.89970388e+01, 4.44738929e+01, 1.62339577e-03,
        3.48002488e-03, 6.99224120e+01, 1.12816058e+03, 1.35000746e+02,
        4.64759690e+01, 7.83118930e+01, 6.24878127e+01, 8.14105190e+01,
        6.04505747e+01, 6.18504997e+01, 8.31363281e+01, 2.00300671e+01,
        5.00157370e+01, 5.65610322e+01, 3.52431627e+01, 2.47750627e+01,
        3.50171071e+01, 2.59684586e+01, 1.36259980e+03, 1.75909706e-01,
        5.81971279e-01, 6.63406360e+01, 4.48999028e-03, 6.28529059e+01,
        6.20711799e-02, 5.10970458e+00, 3.84053909e+02, 5.18016481e+01,
        1.10885914e+03, 1.34670129e+02, 4.64759690e+01, 2.56281679e+01,
        6.22875200e+01, 7.65039021e+01, 6.01925481e+01, 6.182591

In [19]:
### The second element of the f_classif result holds the p-values (one per feature)
p_values = sel[1]
print(p_values)

[9.00009337e-002 4.16015973e-001 3.69026568e-125 7.71179808e-001
 8.39546731e-002 6.71252767e-001 5.77032096e-001 7.62128377e-001
 9.14957990e-006 5.42852930e-006 7.31385072e-001 9.50419544e-002
 5.85674562e-001 2.83935108e-015 1.75889041e-007 6.16860216e-012
 4.93740408e-016 7.27749421e-008 2.60116459e-011 9.67860962e-001
 9.52958897e-001 6.30575816e-017 6.23639753e-245 3.57880015e-031
 9.36689441e-012 9.04483083e-019 2.72873461e-015 1.88915635e-019
 7.67075322e-015 3.77014891e-015 7.89967423e-020 7.63813985e-006
 1.54266132e-012 5.52771915e-014 2.92679938e-009 6.46130132e-007
 3.28682354e-009 3.48146781e-007 8.61444362e-295 6.74913779e-001
 4.45543835e-001 3.86966559e-016 9.46576055e-001 2.26747869e-015
 8.03252703e-001 2.37961259e-002 3.11504034e-085 6.21545315e-013
 8.11360158e-241 4.22555952e-031 9.36689441e-012 4.15237914e-007
 3.02052472e-015 2.25636963e-018 8.74395471e-015 3.81746996e-015
 2.59687670e-005 2.78595155e-005 7.30272010e-019 7.92861005e-001
 1.52241868e-001 7.570935

In [20]:
# One p-value per remaining feature.
print(p_values.shape[0])

249


In [21]:
# Label each p-value with its feature's column label. Build the Series from
# sel[1] rather than from p_values itself so this cell is idempotent: re-running
# it after p_values has been filtered would otherwise reindex the filtered
# Series and silently introduce NaNs.
p_values = pd.Series(sel[1], index=X_train_unique.columns)

In [22]:
# Order features from most to least significant (ascending p-value).
# Rebind instead of sorting in place so the cell does not mutate an object
# produced by an earlier cell.
p_values = p_values.sort_values()

In [None]:
# Bar chart of the per-feature p-values.
# Create the figure and axes once and hand the axes to pandas: calling
# plt.figure() and then plotting with figsize= makes pandas open a second
# figure, leaving the first one empty.
fig, ax = plt.subplots(figsize=(12, 5))
p_values.plot.bar(ax=ax)
ax.set(title='ANOVA F-test p-values per feature', xlabel='feature', ylabel='p-value');

In [None]:
# Keep only the features whose p-value is below the 0.05 significance level.
p_values = p_values.loc[p_values.lt(0.05)]

In [None]:
# Number of features that passed the p-value cut-off.
print(p_values.shape[0])

In [None]:
# Restrict both splits to the columns whose labels survived the p-value filter.
X_train_p = X_train_unique.loc[:, p_values.index]
X_test_p = X_test_unique.loc[:, p_values.index]
print(X_train_p.shape, X_test_p.shape, sep='\n')

In [None]:
%%time
model = RandomForestClassifier(n_estimators=1000, max_depth= 4)
model.fit(X_train_p, y_train)
y_pred = model.predict(X_test_p)
print(accuracy_score(y_test, y_pred))

In [None]:
%%time
model = RandomForestClassifier(n_estimators=1000, max_depth= 4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

## The same procedure applies to regression problems, using `f_regression` in place of `f_classif`