## **Step 3: Feature Selection**
<br>

> #### By: Atharv Chaudhari

### Data Loading

In [1]:
import pandas as pd

In [2]:
# Read the prepared CSV from the Kaggle input directory and preview the first five rows.
manipulated_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mp-data-visualization/manipulated_data.csv"
)
manipulated_data.head(n=5)

Unnamed: 0,year,month,day,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,abroad,contact_with_covid_object,contact_with_covid_patient,corona_result
0,2020,11,12,0,0,0,0,0,0,0,0,1,0,0
1,2020,11,12,0,1,0,0,0,0,0,0,1,0,0
2,2020,11,12,0,0,0,0,0,1,1,0,1,0,0
3,2020,11,12,0,1,0,0,0,0,0,0,0,1,0
4,2020,11,12,1,0,0,0,0,0,0,0,1,0,0


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
manipulated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42916 entries, 0 to 42915
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   year                        42916 non-null  int64
 1   month                       42916 non-null  int64
 2   day                         42916 non-null  int64
 3   cough                       42916 non-null  int64
 4   fever                       42916 non-null  int64
 5   sore_throat                 42916 non-null  int64
 6   shortness_of_breath         42916 non-null  int64
 7   head_ache                   42916 non-null  int64
 8   age_60_and_above            42916 non-null  int64
 9   gender                      42916 non-null  int64
 10  abroad                      42916 non-null  int64
 11  contact_with_covid_object   42916 non-null  int64
 12  contact_with_covid_patient  42916 non-null  int64
 13  corona_result               42916 non-null  int64
dtypes: int

### Chi-Square Test for Feature Selection

![Chi- square score](https://media.geeksforgeeks.org/wp-content/uploads/Capture-214.png)

> **1.** **Observed frequency** = No. of observations of class
>
> **2.** **Expected frequency** = No. of expected observations of class if there was no relationship between the feature and the target.

#### Dropping Features 'month','day' and 'year'

In [4]:
# Remove the date-part columns before running the chi-square screening.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
manipulated_data = manipulated_data.drop(columns=['month','day','year'], errors='ignore')

> #### Since the data is categorical, the chi-square test is the preferred method for feature selection

### Creating a Class for the Chi-Square Test

In [5]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

# Module-level result stores written by ChiSquare and read by later cells.
imp_var=[]    # important column names, filled by a later cell
h=dict()      # column name -> p-value for columns with p < alpha
u=dict()      # column name -> p-value for columns with p >= alpha
unimp_var=[]  # column names with p >= alpha, in the order they were tested


class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    Each call to ``TestIndependence`` stores the latest statistics on the
    instance and records the tested column in the module-level stores:
    ``h`` when ``p < alpha``, otherwise ``u`` and ``unimp_var``.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe`` and reset all result attributes."""
        self.df = dataframe
        self.p = None     # p-value of the most recent test
        self.chi2 = None  # chi-square statistic of the most recent test
        self.dof = None   # degrees of freedom of the most recent test

        # Initialised here so the attribute exists before the first test runs.
        self.dfObserved = None  # observed contingency table (colY rows x colX columns)
        self.dfTabular = None   # never assigned by this class; kept for backward compatibility
        self.dfExpected = None  # expected frequencies, same labels as dfObserved

    def _print_chisquare_result(self, colX, alpha):
        """Record ``colX`` as important or unimportant from the current p-value.

        Despite its name this method prints nothing; it only updates the
        module-level stores. A column is kept in exactly one of ``h`` / ``u``
        and appears at most once in ``unimp_var``, so repeating a test (for
        example by re-running a notebook cell) does not create duplicates or
        leave a stale classification behind.
        """
        if self.p < alpha:
            h[colX] = self.p
            u.pop(colX, None)
            while colX in unimp_var:
                unimp_var.remove(colX)
        else:
            u[colX] = self.p
            h.pop(colX, None)
            if colX not in unimp_var:
                unimp_var.append(colX)

    def TestIndependence(self,colX,colY, alpha=0.05):
        """Run the chi-square test of ``colX`` against ``colY``.

        Parameters
        ----------
        colX : column label of the feature being tested.
        colY : column label it is tested against.
        alpha : significance threshold; ``p < alpha`` marks ``colX`` as important.

        Stores ``p``, ``chi2``, ``dof``, ``dfObserved`` and ``dfExpected`` on
        the instance and returns ``None``.
        """
        # Values are cast to str so every distinct value becomes its own category.
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)

        self.dfObserved = pd.crosstab(Y,X)
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof

        self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns, index = self.dfObserved.index)

        self._print_chisquare_result(colX, alpha)

### Applying Feature Selection on Manipulated Data

In [6]:
# Test every feature column against the target column "corona_result".
cT = ChiSquare(manipulated_data)
testColumns = manipulated_data.columns.drop("corona_result")

for var in testColumns:
    cT.TestIndependence(var, "corona_result")

### Let's Check for Important and Unimportant Features

In [7]:
# Build a side-by-side table of important and unimportant features.
# The table is assembled from local lists so that re-running this cell does
# not keep appending names (or "" padding) to the shared result lists.

# (column, p-value) pairs ordered by ascending p-value.
m = sorted(h.items(), key=lambda item: item[1])
important = [name for name, _ in m]
# The keys of `u` are the columns whose p-value was not below alpha.
unimportant = list(u)

# Replace the contents of the shared list instead of appending to it.
imp_var[:] = important

# Pad the shorter column with empty strings so both have the same length.
n_rows = max(len(important), len(unimportant))
imp = pd.DataFrame({
    'Important Variables': important + [""] * (n_rows - len(important)),
    'Unimportant Variables': unimportant + [""] * (n_rows - len(unimportant)),
})
imp

Unnamed: 0,Important Variables,Unimportant Variables
0,abroad,gender
1,shortness_of_breath,
2,head_ache,
3,sore_throat,
4,age_60_and_above,
5,contact_with_covid_object,
6,contact_with_covid_patient,
7,fever,
8,cough,


### Removing Unimportant Features

In [8]:
# Drop the columns the chi-square test recorded as unimportant (the keys of `u`)
# instead of hard-coding a column name, so this cell always follows the test result.
# errors='ignore' plus rebinding keeps the cell safe to re-run after the
# columns have already been removed.
manipulated_data = manipulated_data.drop(columns=list(u), errors='ignore')
manipulated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42916 entries, 0 to 42915
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   cough                       42916 non-null  int64
 1   fever                       42916 non-null  int64
 2   sore_throat                 42916 non-null  int64
 3   shortness_of_breath         42916 non-null  int64
 4   head_ache                   42916 non-null  int64
 5   age_60_and_above            42916 non-null  int64
 6   abroad                      42916 non-null  int64
 7   contact_with_covid_object   42916 non-null  int64
 8   contact_with_covid_patient  42916 non-null  int64
 9   corona_result               42916 non-null  int64
dtypes: int64(10)
memory usage: 3.3 MB


### Output CSV

In [9]:
# Save the reduced frame to the relative path "fs_data.csv" without writing the index.
manipulated_data.to_csv("fs_data.csv",index=False)

### Conclusions

> 1. *Since the data was categorical, we used the Chi-Square test for feature selection.*
> 2. *After performing the chi-square test, the only feature found to be unimportant was: gender*