Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE

Load dataset

In [10]:
df = pd.read_csv("dataset.csv",index_col=0)

In [11]:
df.describe()

Unnamed: 0,x_1,x_109,x_113,x_12,x_124,x_127,x_144,x_16,x_18,x_2,...,x_73,x_77,x_78,x_8,x_80,x_83,x_89,x_91,x_99,y
count,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,...,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0
mean,5166.469289,0.563244,3.048798,503.509832,50.483855,-0.122845,3.609128,34.826657,960.42219,4.993202,...,-0.03234,1050.474387,16.507405,256.788055,-0.133831,248.378733,0.000371,-0.006766,-0.037985,0.109493
std,73.588512,0.496044,2.544466,291.454491,28.862659,1.468947,1.825079,20.441569,191.922786,2.897798,...,0.289226,408.389794,9.554037,254.703736,0.572124,145.247366,0.044075,0.15624,0.380439,0.312294
min,4964.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,0.0,0.0,...,-0.62,49.0,0.0,0.0,-0.31,0.0,-0.18,-0.13,-0.66,0.0
25%,5099.0,0.0,1.0,250.0,26.0,-2.0,1.0,17.0,999.0,3.0,...,0.11,766.5,8.0,103.0,-0.31,122.0,0.02,-0.1,-0.49,0.0
50%,5191.0,1.0,5.0,497.0,50.0,1.0,5.0,35.0,999.0,5.0,...,0.11,1048.0,16.0,181.0,-0.31,249.0,0.02,-0.1,0.07,0.0
75%,5228.0,1.0,6.0,760.5,76.0,1.0,5.0,53.0,999.0,8.0,...,0.11,1332.5,25.0,317.0,-0.31,374.0,0.02,0.24,0.21,0.0
max,5228.0,1.0,6.0,999.0,100.0,1.0,5.0,70.0,999.0,10.0,...,0.11,2077.0,33.0,3643.0,2.69,500.0,0.02,0.39,0.93,1.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4119 entries, 0 to 4118
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x_1     4119 non-null   int64  
 1   x_109   4119 non-null   int64  
 2   x_113   4119 non-null   int64  
 3   x_12    4119 non-null   int64  
 4   x_124   4119 non-null   int64  
 5   x_127   4119 non-null   int64  
 6   x_144   4119 non-null   int64  
 7   x_16    4119 non-null   int64  
 8   x_18    4119 non-null   int64  
 9   x_2     4119 non-null   int64  
 10  x_25    4119 non-null   int64  
 11  x_27    4119 non-null   int64  
 12  x_28    4119 non-null   int64  
 13  x_30    4119 non-null   int64  
 14  x_33    4119 non-null   float64
 15  x_38    4119 non-null   float64
 16  x_42    4119 non-null   int64  
 17  x_44    4119 non-null   int64  
 18  x_45    4119 non-null   float64
 19  x_52    4119 non-null   int64  
 20  x_55    4119 non-null   float64
 21  x_57    4119 non-null   float64
 22  x_59 

Assuming target variable is named 'y'

In [3]:
X = df.drop('y', axis=1)
y = df['y']

Spliting the dataset into training and testing sets


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Selection using Filter Method (SelectKBest with ANOVA F-statistic)

In [5]:
k_best = SelectKBest(score_func=f_classif, k=20)
X_train_filtered = k_best.fit_transform(X_train, y_train)
X_test_filtered = k_best.transform(X_test)

Feature Selection using Wrapper Method (Recursive Feature Elimination - RFE)

In [6]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf_classifier, n_features_to_select=20, step=1)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

Create a DataFrame to store the results

In [7]:
result_filter = pd.DataFrame({'Variable': X.columns, 'Mark': k_best.get_support().astype(int)})

Save the DataFrame to a CSV file

In [8]:
result_filter.to_csv("Intellicker_IITB_VAR.csv", index=False)
print(result_filter)

   Variable  Mark
0       x_1     1
1     x_109     0
2     x_113     0
3      x_12     0
4     x_124     0
5     x_127     1
6     x_144     1
7      x_16     0
8      x_18     1
9       x_2     0
10     x_25     1
11     x_27     1
12     x_28     1
13     x_30     0
14     x_33     1
15     x_38     1
16     x_42     0
17     x_44     0
18     x_45     1
19     x_52     1
20     x_55     0
21     x_57     1
22     x_59     0
23     x_60     1
24     x_62     0
25     x_64     1
26     x_67     0
27     x_68     0
28      x_7     0
29     x_70     0
30     x_71     1
31     x_73     1
32     x_77     0
33     x_78     0
34      x_8     1
35     x_80     1
36     x_83     0
37     x_89     0
38     x_91     1
39     x_99     1
