In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report , f1_score ,recall_score , precision_score,accuracy_score ,confusion_matrix ,roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
# Load the CSV and expose its contents as a single 2-D array.
dataset2 = pd.read_csv("dataset2.csv", delimiter=",")
array = dataset2.values

# Columns 1..8 form the feature matrix and column 9 holds the class labels;
# column 0 is not used.
X, y = array[:, 1:9], array[:, 9]

# Cell output: shapes of the feature matrix and of the label vector.
(X.shape, y.shape)

((768, 8), (768,))

In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Baseline: linear-kernel SVM trained on every feature column of X_train.
svc = SVC(kernel='linear')
# `model` is whatever fit() returns; predictions below go through `svc`.
model = svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Compute the confusion matrix once and keep it under a name so it can be
# inspected; it is intentionally not printed, so the cell output is unchanged.
svc_confusion = confusion_matrix(y_test, y_pred)

# Per-class precision/recall/F1 followed by the overall accuracy.
classi = classification_report(y_test, y_pred)
print(classi)
print('Accuracy: ', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.84      0.82        99
         1.0       0.68      0.62      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.76      0.76       154

Accuracy:  0.7597402597402597


In [10]:
# Forward (non-floating) sequential feature selection down to 5 features,
# scored by 4-fold cross-validated accuracy on the training split.
# NOTE(review): SVC() here uses its default kernel, while the baseline SVC cell
# uses kernel='linear' — confirm the mismatch is intended.
sfs = SFS(
    estimator=SVC(),
    k_features=5,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    3.4s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.4s finished

[2020-02-02 10:29:12] Features: 1/5 -- score: 0.7002951718321737[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    0.2s finished

[2020-02-02 10:29:12] Features: 2/5 -- score: 0.7116909129243094[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished

[2020-02-02 10:29:13] Features: 3/5 -- score: 0.7036474804975754[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | 

In [15]:
# Baseline: multilayer perceptron with three hidden layers of 8 units each.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Predictions for both splits. `y_pred` and `predict_test` both come from
# predicting on X_test; only `y_pred` is used for the report below.
predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)
y_pred = mlp.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy.
classi = classification_report(y_test, y_pred)
print(classi)
print('Accuracy: ', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.88      0.81        99
         1.0       0.69      0.49      0.57        55

    accuracy                           0.74       154
   macro avg       0.72      0.68      0.69       154
weighted avg       0.73      0.74      0.73       154

Accuracy:  0.7402597402597403


In [13]:
# Forward (non-floating) sequential feature selection up to 7 features, using
# an MLP configured like the baseline MLP and 4-fold cross-validated accuracy.
sfs = SFS(
    estimator=MLPClassifier(
        hidden_layer_sizes=(8, 8, 8),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    ),
    k_features=7,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    3.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.0s finished

[2020-02-02 10:33:07] Features: 1/7 -- score: 0.7394897744043855[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    5.6s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    8.7s finished

[2020-02-02 10:33:16] Features: 2/7 -- score: 0.7362639679527725[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    7.3s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   12.0s finished

[2020-02-02 10:33:28] Features: 3/7 -- score: 0.7492304448661185[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | 

In [14]:
# Baseline: Gaussian naive Bayes fitted on the training split.
naive = GaussianNB()
naive.fit(X_train, y_train)
y_pred = naive.predict(X_test)

# Evaluated but not shown: this is not the cell's last expression and the
# result is not stored.
confusion_matrix(y_test, y_pred)

# Per-class precision/recall/F1 followed by the overall accuracy.
classi = classification_report(y_test, y_pred)
print(classi)
print('Accuracy: ', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.79      0.81        99
         1.0       0.65      0.71      0.68        55

    accuracy                           0.76       154
   macro avg       0.74      0.75      0.74       154
weighted avg       0.77      0.76      0.76       154

Accuracy:  0.7597402597402597


In [17]:
# Forward (non-floating) sequential feature selection up to 7 features, using
# Gaussian naive Bayes and 4-fold cross-validated accuracy.
sfs = SFS(
    estimator=GaussianNB(),
    k_features=7,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.0s finished

[2020-02-02 10:33:59] Features: 1/7 -- score: 0.7459624710099093[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    0.0s finished

[2020-02-02 10:33:59] Features: 2/7 -- score: 0.7703879401222854[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished

[2020-02-02 10:33:59] Features: 3/7 -- score: 0.7686907020872865[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | 

In [18]:
# Baseline: k-nearest-neighbours classifier with k = 9.
clf = KNeighborsClassifier(n_neighbors=9)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluated but not shown: this is not the cell's last expression and the
# result is not stored.
confusion_matrix(y_test, y_pred)

# Per-class precision/recall/F1 followed by the overall accuracy.
classi = classification_report(y_test, y_pred)
print(classi)
print('Accuracy: ', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.73      0.76        99
         1.0       0.58      0.67      0.62        55

    accuracy                           0.71       154
   macro avg       0.69      0.70      0.69       154
weighted avg       0.72      0.71      0.71       154

Accuracy:  0.7077922077922078


In [27]:
# Forward (non-floating) sequential feature selection up to 7 features, using
# k-nearest-neighbours and 4-fold cross-validated accuracy.
# NOTE(review): KNeighborsClassifier() here uses its default n_neighbors, while
# the baseline KNN cell uses n_neighbors=9 — confirm the mismatch is intended.
sfs = SFS(
    estimator=KNeighborsClassifier(),
    k_features=7,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.1s finished

[2020-02-02 10:36:36] Features: 1/7 -- score: 0.6971115327851571[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    0.0s finished

[2020-02-02 10:36:36] Features: 2/7 -- score: 0.7329116592873709[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.0s finished

[2020-02-02 10:36:36] Features: 3/7 -- score: 0.7395108581066836[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | 

In [20]:
def run_DecisionTree(X_train, X_test, y_train, y_test):
    """Fit a decision tree and print its evaluation on the test split.

    Args:
        X_train: Feature matrix the tree is fitted on.
        X_test: Feature matrix the tree predicts on.
        y_train: Labels used for fitting.
        y_test: Labels the predictions are compared against.

    Returns:
        None. The classification report and the accuracy are printed.
    """
    # Fixed seed so repeated calls on the same data build the same tree.
    tree = DecisionTreeClassifier(random_state=1)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    # Evaluated but neither printed nor returned.
    confusion_matrix(y_test, predictions)

    report = classification_report(y_test, predictions)
    print(report)
    print('Accuracy: ', accuracy_score(y_test, predictions))

In [21]:
%%time
# Baseline run: decision tree on the full, unselected feature set.
print("The result of Decision Tree before features selection:")
run_DecisionTree(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

The result of Decision Tree before features selection:
              precision    recall  f1-score   support

         0.0       0.83      0.75      0.79        99
         1.0       0.62      0.73      0.67        55

    accuracy                           0.74       154
   macro avg       0.72      0.74      0.73       154
weighted avg       0.75      0.74      0.74       154

Accuracy:  0.7402597402597403
Wall time: 121 ms


In [22]:
# Recursive feature elimination with a decision tree, evaluated for every
# feature count from 1 up to the number of columns actually present in X_train.
# Requesting more features than exist cannot select anything and would only
# repeat the full-feature run under a misleading label.
for index in range(1, X_train.shape[1] + 1):
    # Seed the ranking tree (same seed as run_DecisionTree) so the selected
    # subset is the same on every run.
    sel = RFE(DecisionTreeClassifier(random_state=1), n_features_to_select=index)
    sel.fit(X_train, y_train)

    # Keep only the selected columns in both splits.
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)

    print('Selected Feature: ', index)
    run_DecisionTree(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected Feature:  1
              precision    recall  f1-score   support

         0.0       0.71      0.82      0.76        99
         1.0       0.55      0.40      0.46        55

    accuracy                           0.67       154
   macro avg       0.63      0.61      0.61       154
weighted avg       0.65      0.67      0.65       154

Accuracy:  0.6688311688311688

Selected Feature:  2
              precision    recall  f1-score   support

         0.0       0.74      0.71      0.73        99
         1.0       0.52      0.56      0.54        55

    accuracy                           0.66       154
   macro avg       0.63      0.64      0.63       154
weighted avg       0.66      0.66      0.66       154

Accuracy:  0.6558441558441559

Selected Feature:  3
              precision    recall  f1-score   support

         0.0       0.77      0.81      0.79        99
         1.0       0.62      0.56      0.59        55

    accuracy                           0.72       154
   

In [23]:
def run_randomForest(X_train, X_test, y_train, y_test):
    """Fit a random forest and print its evaluation on the test split.

    Args:
        X_train: Feature matrix the forest is fitted on.
        X_test: Feature matrix the forest predicts on.
        y_train: Labels used for fitting.
        y_test: Labels the predictions are compared against.

    Returns:
        None. The classification report and the accuracy are printed.
    """
    # 100 trees, fixed seed, and n_jobs=-1 as the parallelism setting.
    forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Evaluated but neither printed nor returned.
    confusion_matrix(y_test, predictions)

    report = classification_report(y_test, predictions)
    print(report)
    print('Accuracy: ', accuracy_score(y_test, predictions))

In [24]:
%%time
# Baseline run: random forest on the full, unselected feature set.
print("The result of Ranodm forest before features selection:")
run_randomForest(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

The result of Ranodm forest before features selection:
              precision    recall  f1-score   support

         0.0       0.81      0.82      0.81        99
         1.0       0.67      0.65      0.66        55

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154

Accuracy:  0.7597402597402597
Wall time: 591 ms


In [25]:
# Recursive feature elimination with a random forest, evaluated for every
# feature count from 1 up to the number of columns actually present in X_train.
# Requesting more features than exist cannot select anything and would only
# repeat the full-feature run under a misleading label.
for index in range(1, X_train.shape[1] + 1):
    # The ranking forest is seeded, so the selected subset is repeatable.
    sel = RFE(
        RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=index,
    ).fit(X_train, y_train)

    # Keep only the selected columns in both splits.
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)

    print('Selected Feature: ', index)
    run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected Feature:  1
              precision    recall  f1-score   support

         0.0       0.73      0.75      0.74        99
         1.0       0.53      0.51      0.52        55

    accuracy                           0.66       154
   macro avg       0.63      0.63      0.63       154
weighted avg       0.66      0.66      0.66       154

Accuracy:  0.6623376623376623

Selected Feature:  2
              precision    recall  f1-score   support

         0.0       0.77      0.80      0.79        99
         1.0       0.62      0.58      0.60        55

    accuracy                           0.72       154
   macro avg       0.69      0.69      0.69       154
weighted avg       0.72      0.72      0.72       154

Accuracy:  0.7207792207792207

Selected Feature:  3
              precision    recall  f1-score   support

         0.0       0.81      0.78      0.79        99
         1.0       0.63      0.67      0.65        55

    accuracy                           0.74       154
   