## **Feature Selection**

Inspired by Atharv Chaudhari's notebook: https://www.kaggle.com/atharvchaudhari/mp-feature-selection?rvi=1

In [None]:
import pandas as pd

In [None]:
# Load the HCV blood-biomarker CSV and preview its first rows.
df = pd.read_csv(
    "../input/hepatitis-c-virus-blood-biomarkers/hcvdat0.csv"
)
df.head()

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
df.info()

### Chi-Square Test for Feature Selection

![Chi- square score](https://media.geeksforgeeks.org/wp-content/uploads/Capture-214.png)

> **1.** **Observed frequency** = No. of observations of class
>
> **2.** **Expected frequency** = No. of expected observations of class if there was no relationship between the feature and the target.

> #### Since it's categorical data, the chi-square test is preferable

### Creating a Class for the Chi-Square Test

In [None]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

# Module-level registries that ChiSquare files its verdicts into.
imp_var = []    # significant feature names; left empty here, filled by later code
h = dict()      # feature name -> p-value, for features with p < alpha
u = dict()      # feature name -> p-value, for features with p >= alpha
unimp_var = []  # feature names with p >= alpha, in first-tested order


class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    Each call to ``TestIndependence`` cross-tabulates two columns (after
    casting their values to ``str``), runs ``scipy.stats.chi2_contingency``
    on the table and files the tested column in the module-level registries:
    ``h`` when ``p < alpha``, otherwise ``u`` and ``unimp_var``.
    """

    def __init__(self, dataframe):
        """Store the frame and clear the result slots.

        Args:
            dataframe: pandas DataFrame holding the columns to be tested.
        """
        self.df = dataframe
        self.p = None      # p-value of the most recent test
        self.chi2 = None   # chi-square statistic of the most recent test
        self.dof = None    # degrees of freedom of the most recent test

        self.dfTabular = None   # never assigned by this class; kept for existing callers
        self.dfObserved = None  # observed contingency table (rows: colY, columns: colX)
        self.dfExpected = None  # expected frequencies returned by chi2_contingency

    def _print_chisquare_result(self, colX, alpha):
        """File ``colX`` as significant or not, based on the latest p-value.

        Despite the name, nothing is printed: the verdict is written to the
        module-level ``h`` / ``u`` / ``unimp_var`` registries.

        Args:
            colX: name of the column that was just tested.
            alpha: significance level; ``p < alpha`` counts as significant.
        """
        if self.p < alpha:
            h[colX] = self.p
            # Re-testing a column must not leave a stale "unimportant"
            # verdict behind from an earlier run.
            u.pop(colX, None)
            while colX in unimp_var:
                unimp_var.remove(colX)
        else:
            u[colX] = self.p
            h.pop(colX, None)
            # Guard against duplicates when the same column is tested twice
            # (e.g. the notebook cell is re-run).
            if colX not in unimp_var:
                unimp_var.append(colX)

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Run the chi-square test of independence of ``colX`` against ``colY``.

        Sets ``self.dfObserved``, ``self.dfExpected``, ``self.p``,
        ``self.chi2`` and ``self.dof``, then records the verdict for ``colX``
        in the module-level registries.

        Args:
            colX: name of the feature column under test.
            colY: name of the column it is tested against.
            alpha: significance level (default 0.05).
        """
        # Values are cast to str so every distinct value becomes its own
        # category in the contingency table.
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)

        self.dfObserved = pd.crosstab(Y, X)
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)

        self.p = p
        self.chi2 = chi2
        self.dof = dof

        # Give the expected-frequency array the same labels as the observed table.
        self.dfExpected = pd.DataFrame(
            expected,
            columns=self.dfObserved.columns,
            index=self.dfObserved.index,
        )

        self._print_chisquare_result(colX, alpha)

### Applying Feature Selection on Manipulated Data

In [None]:
# Test every column except the target against 'Category'; each call records
# its verdict in the registries that ChiSquare writes to.
cT = ChiSquare(df)
testColumns = df.columns.drop(['Category'])

for feature in testColumns:
    cT.TestIndependence(colX=feature, colY="Category")

### Let's Check for Important and Unimportant Features

In [None]:
# Rank the significant features by ascending p-value (smallest first).
# imp_var is rebuilt in place, so re-running this cell does not append the
# same names a second time.
imp_var[:] = [name for name, _ in sorted(h.items(), key=lambda item: item[1])]

# Skip "" placeholders that an earlier padding pass may have left in the
# shared list; the shared list itself is not modified here.
unimp_names = [name for name in unimp_var if name != ""]

# DataFrame columns must be equally long, so pad local copies with "".
n_rows = max(len(imp_var), len(unimp_names))
imp = pd.DataFrame({
    'Important Variables': imp_var + [""] * (n_rows - len(imp_var)),
    'Unimportant Variables': unimp_names + [""] * (n_rows - len(unimp_names)),
})
imp

### Removing Unimportant Features

In [None]:
# Collect the real feature names from the 'Unimportant Variables' column,
# leaving out the '' padding entries.
Unimp_Fea = [
    name for name in imp['Unimportant Variables'] if name != ''
]
Unimp_Fea

In [None]:
# Remove the unimportant feature columns from df in place, then show the
# summary of what remains.
df.drop(columns=Unimp_Fea, inplace=True)
df.info()

### Output CSV

In [None]:
# `df` had its unimportant columns dropped in place, so it is the frame to
# export (no `df_new` is defined in this notebook). index=False keeps the
# row index out of the file.
df.to_csv("fs_data.csv",index=False)