# Import statements

In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import Perceptron
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# from hyperopt import hp

import warnings
warnings.filterwarnings("ignore")

# Loading the Dataset

In [3]:
# Load Dataset
df = pd.read_csv('final_features.csv')
df.shape

(15, 95)

# Train Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=['Label','Image_ID'], axis=1),
    df['Label'],
    test_size=0.2,
    random_state=23)

X_train.shape, X_test.shape

((12, 93), (3, 93))

In [5]:
from sklearn.preprocessing import StandardScaler
def scale_data(dataset):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)
    return scaled_data

In [6]:
scaled_train_set = scale_data(X_train)

scaled_test_set = scale_data(X_test)

In [7]:
def cross_validation(model, _X, _y, _cv=5):
    _scoring = ['f1']
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)
      
    return {
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

# Classifier Models

# Logistic Regression

In [8]:
LogisticReg = LogisticRegression(max_iter = 3000000)
LogisticReg.fit(scaled_train_set,y_train)

### Base Model Score

In [9]:
LogisticReg.score(scaled_test_set,y_test)

0.3333333333333333

### Parameters for Logistic Regression Model

In [10]:
LogisticReg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 3000000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### HyperParameter Tuning for Logistic Regression Model
* RandomSearch CV method
* GridSearch CV method

In [11]:
LRgrid = {"C":[100, 10, 1.0, 0.1, 0.01],
          "class_weight":[None,'balanced'],
          "penalty":['l2','l1',None,'elasticnet'],
          "solver":['newton-cg', 'newton-cholesky','lbfgs','liblinear','sag','saga']
          }
RCV_LRReg = RandomizedSearchCV(estimator = LogisticReg,
                              param_distributions = LRgrid,
                              n_iter = 20,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_LRReg.fit(scaled_train_set,y_train)

In [12]:
RCV_LRReg.best_params_

{'solver': 'newton-cg', 'penalty': 'l2', 'class_weight': None, 'C': 10}

In [13]:
RCV_LRReg.score(scaled_test_set,y_test)

0.5

In [14]:
LRgrid_ = {'C':[1.0,10.0],
           'penalty':['l1','l2','elasticnet',None],
           'class_weight':[None,'balanced'],
           'solver':['newton-cg','lbfgs','liblinear']
           }
GCV_LRReg = GridSearchCV(estimator = LogisticReg,
                         param_grid = LRgrid_,
                         scoring = 'f1',
                         n_jobs = -1,
                         cv = 5)
GCV_LRReg.fit(scaled_train_set,y_train)

In [15]:
GCV_LRReg.best_params_

{'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}

### Model score after Hyperparameter tuning

In [16]:
GCV_LRReg.score(scaled_test_set,y_test)

0.5

## Decision Trees

In [17]:
DTClr = DecisionTreeClassifier()
DTClr.fit(scaled_train_set,y_train)

In [18]:
DTClr.score(scaled_test_set,y_test)

0.6666666666666666

In [19]:
DTClr.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [20]:
DTClrgrid = {"ccp_alpha":[0.0,1.0,3.5],
             "class_weight":[None,'balanced'],
             "criterion":['gini','entropy','log_loss'],
             "min_impurity_decrease":[0.0,2.1,1.1],
             "max_depth":[3,2,4,5,None],
             "max_features":[None,'auto','sqrt','log2'],
             "min_samples_split":[1,2,4,5],
             "min_samples_leaf":[1,2,4]
            }
RCV_DTClr = RandomizedSearchCV(estimator = DTClr,
                              param_distributions = DTClrgrid,
                              n_iter = 15,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_DTClr.fit(scaled_train_set,y_train)

In [21]:
RCV_DTClr.best_params_

{'min_samples_split': 4,
 'min_samples_leaf': 1,
 'min_impurity_decrease': 2.1,
 'max_features': 'sqrt',
 'max_depth': 3,
 'criterion': 'log_loss',
 'class_weight': None,
 'ccp_alpha': 1.0}

In [22]:
RCV_DTClr.score(scaled_test_set,y_test)

0.5

In [23]:
DTClrgrid_ = {"ccp_alpha":[0.0,1.0],
             "class_weight":['balanced',None],
             "criterion":['log_loss','entropy'],
             "min_impurity_decrease":[0.0],
             "max_depth":[2,4,None],
             "max_features":['sqrt','log2'],
             "min_samples_split":[2,4],
             "min_samples_leaf":[1,3]
            }
GCV_DTClr = GridSearchCV(estimator = DTClr,
                         param_grid = DTClrgrid_,
                         scoring = 'f1',
                         cv = 5,
                         )
GCV_DTClr.fit(scaled_train_set,y_train)

In [24]:
GCV_DTClr.best_params_

{'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 2,
 'max_features': 'log2',
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [25]:
GCV_DTClr.score(scaled_test_set,y_test)

0.0

## Random Forest

In [26]:
RFClr = RandomForestClassifier()
RFClr.fit(scaled_train_set,y_train)

In [27]:
RFClr.score(scaled_test_set,y_test)

0.3333333333333333

In [28]:
RFClr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [29]:
RFClrgrid = {"n_estimators": [10,100,150,200],
             "min_weight_fraction_leaf":[0.0,0.5,2.1],
             "max_leaf_nodes":[None,2,5,10],
             "criterion":['gini','entropy','log_loss'],
             "max_depth":[None,5,10],
             "max_features":['sqrt','log2',None],
             "min_samples_split":[1,2,4,7],
             "min_samples_leaf":[1,2,3]}
RCV_RFClr = RandomizedSearchCV(estimator = RFClr,
                              param_distributions = RFClrgrid,
                              n_iter = 20,
                              n_jobs = -1,
                              scoring = 'f1',
                              cv = 5)
RCV_RFClr.fit(scaled_train_set,y_train)

In [30]:
RCV_RFClr.best_params_

{'n_estimators': 100,
 'min_weight_fraction_leaf': 0.5,
 'min_samples_split': 7,
 'min_samples_leaf': 3,
 'max_leaf_nodes': None,
 'max_features': 'sqrt',
 'max_depth': 5,
 'criterion': 'log_loss'}

In [31]:
RCV_RFClr.score(scaled_test_set,y_test)

0.5

In [32]:
RFClrgrid_ = {"n_estimators": [10,200],
             "min_weight_fraction_leaf":[0.0],
             "max_leaf_nodes":[None],
             "criterion":['gini','entropy'],
             "max_depth":[None,10],
             "max_features":['log2',None],
             "min_samples_split":[1,4],
             "min_samples_leaf":[1,3]}

GCV_RFClr = GridSearchCV(estimator = RFClr,
                         param_grid = RFClrgrid_,
                         cv = 5,
                         n_jobs = -1,
                         scoring = 'f1')
GCV_RFClr.fit(scaled_train_set,y_train)

In [33]:
GCV_RFClr.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10}

In [34]:
GCV_RFClr.score(scaled_test_set,y_test)

0.5

## Support Vector Machines (SVM)

In [35]:
SVM = SVC().fit(scaled_train_set,y_train)

In [36]:
SVM.score(scaled_test_set,y_test)

0.3333333333333333

In [37]:
#get parameters
SVM.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [38]:
#SVM
from skopt import Optimizer
from skopt import BayesSearchCV 
from skopt.space import Real, Categorical, Integer
rf_params = {
    'C': Real(0.01,50),
    "kernel":['linear','poly','rbf','sigmoid'],
}
clf = SVC(gamma='scale')
Bayes = BayesSearchCV(clf, rf_params,cv=3,n_iter=20, n_jobs=-1,scoring='accuracy')
Bayes.fit(scaled_train_set,y_train)

ModuleNotFoundError: No module named 'skopt'

In [39]:
Bayes.best_params_

NameError: name 'Bayes' is not defined

In [40]:
SVM_opt = SVC(kernel = 'rbf', C = 12.140873261631244)
SVM_opt.fit(scaled_train_set,y_train)

In [41]:
SVM_opt.score(scaled_test_set,y_test)

0.3333333333333333

## Perceptron

In [42]:
PClr = Perceptron(tol=1e-3, random_state=0)
PClr.fit(scaled_test_set,y_test)

In [43]:
PClr.score(scaled_test_set,y_test)

1.0

In [44]:
PClr.get_params()

{'alpha': 0.0001,
 'class_weight': None,
 'early_stopping': False,
 'eta0': 1.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': None,
 'random_state': 0,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [45]:
PClrgrid = {'max_iter':[1,10,100,1000,10000],
            'penalty':['l2','l1','elasticnet',None],
            'class_weight':['balanced',None],
            'alpha':[0.0001,0.01,0.1],
            'n_jobs':[-1]
           }
RCV_PClr = RandomizedSearchCV(estimator = PClr,
                              param_distributions = PClrgrid,
                              n_iter = 20,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_PClr.fit(scaled_train_set,y_train)

In [46]:
RCV_PClr.best_params_

{'penalty': 'l1',
 'n_jobs': -1,
 'max_iter': 10,
 'class_weight': None,
 'alpha': 0.1}

In [47]:
RCV_PClr.score(scaled_test_set,y_test)

0.0

In [48]:
PClrgrid_ = {'max_iter':[10,100,1000],
            'penalty':['l2','l1',None],
            'class_weight':['balanced',None],
            'alpha':[0.0001,0.01],
            'n_jobs':[-1]
            }
GCV_PClr = GridSearchCV(estimator = PClr,
                         param_grid = PClrgrid_,
                         cv = 5,
                         n_jobs = -1,
                         scoring = 'f1')
GCV_PClr.fit(scaled_train_set,y_train)

In [49]:
GCV_PClr.best_params_

{'alpha': 0.01,
 'class_weight': 'balanced',
 'max_iter': 10,
 'n_jobs': -1,
 'penalty': 'l1'}

In [50]:
GCV_PClr.score(scaled_test_set,y_test)

1.0

In [60]:
import pickle
# create an iterator object with write permission - model.pkl
with open('model_pkl', 'wb') as files:
    pickle.dump(GCV_PClr, files)

In [70]:
model = pickle.load(open('model_pkl', 'rb'))
model.score(scaled_test_set,y_test)

1.0

# Result 

In [52]:
# After considering all five conventional classification models, we come to conclusion that Random forest is the best model.

# Feature Selection

In [53]:
def RF(selected_features):
    RF = RandomForestClassifier(criterion = 'entropy',max_depth = None, max_features = 'log2', max_leaf_nodes = None,
                               min_samples_leaf = 1,min_samples_split = 4, min_weight_fraction_leaf = 0.0, n_estimators = 200)
    RF_result = cross_validation(RF, selected_features,y_test, 5)
    print(RF_result)

In [54]:
def cross_validation(model, _X, _y, _cv=5):
      _scoring = ['f1']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      
      return {
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

# Correlation-based feature selection (CFS)

In [55]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
# X is your feature matrix, y is your target variable
def CFS(features):
    selector = SelectKBest(score_func=mutual_info_classif, k=features) # choose the number of features you want to keep
    CFS = selector.fit_transform(scaled_train_set, y_train)
    colu = list(X_test.columns[selector.get_support()])
    new_dataframe = X_test.filter(items=colu)
    print('The result for Random Forest')
    RF(new_dataframe)

In [59]:
for i in range(5,95,5):
    print("The number of features selected : ",i)
    CFS(i)

The number of features selected :  5
The result for Random Forest


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.

In [10]:
for i in range(90,94):
    print("The number of features selected : ",i)
    CFS(i)

The number of features selected :  90
The result for Random Forest
{'Mean Validation F1 Score': 0.9259616975502982}
The number of features selected :  91
The result for Random Forest
{'Mean Validation F1 Score': 0.9217534319812177}
The number of features selected :  92
The result for Random Forest
{'Mean Validation F1 Score': 0.9229386487163408}
The number of features selected :  93
The result for Random Forest
{'Mean Validation F1 Score': 0.9223045748646614}


In [143]:
#The max f1-score is for 90 features

# Principal component analysis (PCA)

In [39]:
from sklearn.decomposition import PCA

In [40]:
def PCA_sel(features):
    pca_sel = PCA(n_components = features)

    pct = pca_sel.fit_transform(X_test)
    RF(pct)
#     print(pct)

In [41]:
for i in range(5,95,5):
    print("The number of features selected : ",i)
    PCA_sel(i)
    

The number of features selected :  5
{'Mean Validation F1 Score': 0.8955510055281414}
The number of features selected :  10
{'Mean Validation F1 Score': 0.9105396409698894}
The number of features selected :  15
{'Mean Validation F1 Score': 0.9099626578535961}
The number of features selected :  20
{'Mean Validation F1 Score': 0.9112547442821001}
The number of features selected :  25
{'Mean Validation F1 Score': 0.9129278263755118}
The number of features selected :  30
{'Mean Validation F1 Score': 0.9090332511300627}
The number of features selected :  35
{'Mean Validation F1 Score': 0.9099051619807745}
The number of features selected :  40
{'Mean Validation F1 Score': 0.9157095498144342}
The number of features selected :  45
{'Mean Validation F1 Score': 0.9118611819686647}
The number of features selected :  50
{'Mean Validation F1 Score': 0.9120949508842486}
The number of features selected :  55
{'Mean Validation F1 Score': 0.907819471195746}
The number of features selected :  60
{'Mean 

In [None]:
# The highest f-score is for 45 features

# L1 regularization

In [37]:
l1 = SelectFromModel(LogisticRegression(C=1, penalty='l1',solver='liblinear'))
l1.fit(scaled_train_set, y_train)

In [41]:
l1.get_support()
colu = X_train.columns[l1.get_support()]
colu = list(colu)
len(colu)

50

In [38]:
l1.get_support()


array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True,  True,  True, False, False, False, False,  True,  True,
       False,  True, False, False,  True, False, False, False, False,
       False,  True, False,  True, False, False,  True,  True, False,
       False, False, False, False, False, False,  True,  True, False,
       False,  True, False, False,  True,  True,  True, False, False,
       False,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True])

In [22]:
new = X_test.filter(items=colu)
new.head()

Unnamed: 0,10Percentile,Energy,Entropy,InterquartileRange,Kurtosis,Maximum,MeanAbsoluteDeviation,Mean,Median,RobustMeanAbsoluteDeviation,...,LargeAreaHighGrayLevelEmphasis,LargeAreaLowGrayLevelEmphasis,SmallAreaEmphasis,SmallAreaHighGrayLevelEmphasis,SmallAreaLowGrayLevelEmphasis,Busyness,Coarseness,Complexity,Contrast.1,Strength
4160,-90.7,3198265,1.414487,18.0,2.567667,-40,11.043524,-72.710616,-73.0,7.756323,...,9.896353,0.25104,0.915347,6.965824,0.15776,0.803391,0.131118,0.254796,0.002927,0.565068
3417,-125.0,4583414,2.287683,234.0,1.09523,127,111.599841,9.464789,84.0,108.735571,...,89.650943,0.231531,0.920707,64.541317,0.158826,0.366062,0.039266,23.822087,0.347828,3.505375
1306,-114.0,4219550,2.439997,44.75,7.166517,126,37.534175,-64.281139,-77.0,18.772392,...,24.226843,0.140615,0.954369,20.93578,0.117925,0.399729,0.079289,6.602618,0.021005,5.5116
1792,-107.0,5211017,1.618324,23.0,34.25168,127,15.294723,-83.902549,-85.0,9.668807,...,14.299517,0.154904,0.947729,10.902802,0.124966,0.672917,0.052802,4.038046,0.005648,6.526806
2880,-112.0,4449091,2.368194,42.0,7.515471,127,36.058207,-63.3711,-76.0,17.808779,...,29.012635,0.154196,0.933426,18.989884,0.110968,0.698312,0.041129,15.558533,0.046723,2.419463


In [23]:
RF(new)

{'Mean Validation F1 Score': 0.920564397949059}


# Random forest feature importance

In [17]:
RF = RandomForestClassifier(criterion = 'entropy',max_depth = None, max_features = 'log2', max_leaf_nodes = None,
                               min_samples_leaf = 1,min_samples_split = 4, min_weight_fraction_leaf = 0.0, n_estimators = 200)
RF.fit(scaled_train_set,y_train)
rfc_ = SelectFromModel(RF)
rfc_.fit(scaled_train_set, y_train)

In [36]:
RF.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [21]:
y_test.head()

4160    1
3417    1
1306    0
1792    1
2880    1
Name: Label, dtype: int64

In [32]:
colu =list(X_test.columns[rfc_.get_support()])
new_dataframe = X_test.filter(items = colu)
new_dataframe.head()

Unnamed: 0,Energy,Entropy,InterquartileRange,TotalEnergy,Uniformity,Information_Measure_Correlation2,InverseVariance,JointEnergy,JointEntropy,MaximumProbability,...,DependenceNonUniformity,GrayLevelNonUniformity,GrayLevelNonUniformity.1,GrayLevelNonUniformityNormalized,RunEntropy,RunLengthNonUniformity,GrayLevelNonUniformity.2,GrayLevelNonUniformityNormalized.1,SizeZoneNonUniformity,ZoneEntropy
4160,3198265,1.414487,18.0,3198265,0.415703,0.89913,0.16,0.349867,1.907467,0.44,...,390.15411,242.770548,213.299424,0.409404,1.96391,417.072937,213.299424,0.409404,417.072937,1.96391
3417,4583414,2.287683,234.0,4583414,0.25683,0.95615,0.152322,0.181203,3.245128,0.339623,...,241.11831,91.174648,79.345912,0.249515,2.835445,258.27044,79.345912,0.249515,258.27044,2.835445
1306,4219550,2.439997,44.75,4219550,0.217145,0.932916,0.394486,0.115771,3.67013,0.237288,...,446.807829,122.035587,113.196597,0.213982,2.779614,468.754253,113.196597,0.213982,468.754253,2.779614
1792,5211017,1.618324,23.0,5211017,0.377076,0.675749,0.276353,0.22466,2.594043,0.358209,...,511.977511,251.509745,232.909823,0.375056,1.994389,540.819646,232.909823,0.375056,540.819646,1.994389
2880,4449091,2.368194,42.0,4449091,0.224263,0.968533,0.24829,0.103253,3.798234,0.2,...,435.633826,136.576355,124.833935,0.225332,2.828162,465.389892,124.833935,0.225332,465.389892,2.828162


In [33]:
RF(new_dataframe)

{'Mean Validation F1 Score': 0.9125115808198435}


# Mutual information

In [105]:
# https://github.com/anujdutt9/Feature-Selection-for-Machine-Learning/blob/master/Filter%20Methods/Mutual-Information.ipynb

In [95]:
mutual_info = mutual_info_classif(X_train, y_train)

In [34]:
for i in range(5,95,5):
    k_best_features = SelectKBest(mutual_info_classif, k=i).fit(X_train, y_train)
    colu =list(X_test.columns[k_best_features.get_support()])
    new_dataframe = X_test.filter(items = colu)
    print("The number of features selected : ", i)
    RF(new_dataframe)

The number of features selected :  5
{'Mean Validation F1 Score': 0.9017252334938597}
The number of features selected :  10
{'Mean Validation F1 Score': 0.9007222573891152}
The number of features selected :  15
{'Mean Validation F1 Score': 0.9106982456841484}
The number of features selected :  20
{'Mean Validation F1 Score': 0.9202802540731593}
The number of features selected :  25
{'Mean Validation F1 Score': 0.9175084080309045}
The number of features selected :  30
{'Mean Validation F1 Score': 0.9218996173236127}
The number of features selected :  35
{'Mean Validation F1 Score': 0.9199465499403479}
The number of features selected :  40
{'Mean Validation F1 Score': 0.9194502332343006}
The number of features selected :  45
{'Mean Validation F1 Score': 0.9211099391608141}
The number of features selected :  50
{'Mean Validation F1 Score': 0.9194683718955208}
The number of features selected :  55
{'Mean Validation F1 Score': 0.9211357174184075}
The number of features selected :  60
{'Mean

In [104]:
# The Max value is for 70 features 

# Recursive Feature Elimination (RFE)

In [16]:
from sklearn.feature_selection import RFE


In [17]:
def RFE_sel(features):
    sel = RFE(RandomForestClassifier(), n_features_to_select = features)
    sel.fit(X_train, y_train)
    colu = list(X_test.columns[sel.get_support()])
    new_dataframe = X_test.filter(items=colu)
    print('The result for Random Forest')
    RF(new_dataframe)

In [18]:
for i in range(5,95,5):
    print("The number of features selected : ",i)
    RFE_sel(i)
    

The number of features selected :  5
The result for Random Forest
{'Mean Validation F1 Score': 0.9051509213788457}
The number of features selected :  10
The result for Random Forest
{'Mean Validation F1 Score': 0.9166055165379522}
The number of features selected :  15
The result for Random Forest
{'Mean Validation F1 Score': 0.9202353962220734}
The number of features selected :  20
The result for Random Forest
{'Mean Validation F1 Score': 0.9194758658458072}
The number of features selected :  25
The result for Random Forest
{'Mean Validation F1 Score': 0.9166933717257081}
The number of features selected :  30
The result for Random Forest
{'Mean Validation F1 Score': 0.9183833894387071}
The number of features selected :  35
The result for Random Forest
{'Mean Validation F1 Score': 0.920599631294395}
The number of features selected :  40
The result for Random Forest
{'Mean Validation F1 Score': 0.9228942441788697}
The number of features selected :  45
The result for Random Forest
{'Mean 

In [36]:
colu =list(X_test.columns[sel.get_support()])
new_dataframe = X_test.filter(items = colu)
new_dataframe.head()

Unnamed: 0,Energy,Entropy,InterquartileRange,Mean,RootMeanSquared,Skewness,TotalEnergy,Uniformity,Variance,ClusterProminence,...,GrayLevelNonUniformityNormalized.1,GrayLevelVariance.2,HighGrayLevelZoneEmphasis,SizeZoneNonUniformity,SmallAreaEmphasis,ZoneEntropy,ZoneVariance,Busyness,Complexity,Strength
4160,3198265,1.414487,18.0,-72.710616,74.003251,0.181011,3198265,0.415703,189.647422,3.776562,...,0.409404,0.440501,7.497121,417.072937,0.915347,1.96391,0.129332,0.803391,0.254796,0.565068
3417,4583414,2.287683,234.0,9.464789,113.626693,-0.208926,4583414,0.25683,12821.44313,5365.604429,...,0.249515,20.1415,69.185535,258.27044,0.920707,2.835445,0.134261,0.366062,23.822087,3.505375
1306,4219550,2.439997,44.75,-64.281139,86.649271,2.132672,4219550,0.217145,3376.031281,1506.061458,...,0.213982,5.65767,21.551985,468.754253,0.954369,2.779614,0.062271,0.399729,6.602618,5.5116
1792,5211017,1.618324,23.0,-83.902549,88.389022,4.668298,5211017,0.377076,772.981508,48.598213,...,0.375056,1.435179,11.478261,540.819646,0.947729,1.994389,0.087911,0.672917,4.038046,6.526806
2880,4449091,2.368194,42.0,-63.3711,85.472616,2.215709,4449091,0.224263,3289.671808,3323.775231,...,0.225332,5.009527,20.722022,465.389892,0.933426,2.828162,0.118303,0.698312,15.558533,2.419463


In [37]:
RF(new_dataframe)

{'Mean Validation F1 Score': 0.9212763665837155}


In [None]:
# https://github.com/krishnadulal/Feature-Selection-in-Machine-Learning-using-Python-All-Code

In [None]:
import gradio as gr

import tensorflow as tf

import numpy as np

from PIL import Image

import requests

from io import BytesIO

IMG_SIZE = (224, 224)

# Load the pre-trained model

model = tf.keras.models.load_model('/kaggle/working/pneumonia_detection_model.h5')

def predict_pneumonia(img):

    pil_img = Image.fromarray(np.uint8(img)).convert("RGB")# Convert the input image to a PIL image

    pil_img = pil_img.resize(IMG_SIZE) # Resize the image

    img_array = np.array(pil_img) # Convert the image to a NumPy array

    img_array = img_array / 255.0 # Normalize the pixel values

    img_array = np.expand_dims(img_array, axis=0)# Add a batch dimension

    prediction = model.predict(img_array)[0]# Make a prediction

    if prediction < 0.5: # Determine the predicted class

        label = "Normal"

    else:

        label = "Pneumonia"

    return label

# Create a Gradio interface

iface = gr.Interface(fn=predict_pneumonia, inputs="image", outputs="text")

iface.launch()

In [None]:
https://21a57a2c3b6ac19a67.gradio.live/