In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_curve, average_precision_score
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
data = pd.read_csv('data.csv')
data.drop('Unnamed', axis=1, inplace=True)
replace_class = {1:1, 2:0, 3:0, 4:0, 5:0}
data['y'] = data['y'].replace(replace_class)
X = data.drop('y', axis=1)
y = data['y']
X = X.values
y = y.values

# fft
X = np.fft.rfft(X, axis=1)
X = np.abs(X)

# normalize
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [3]:
# KFold cross validation
import warnings

clf_list = {"xgb": xgb.XGBClassifier(learning_rate=0.3, n_estimators=300), 
            "rfc": RandomForestClassifier(criterion='gini', n_estimators=300), 
            "dtc": DecisionTreeClassifier(criterion='entropy', max_features='log2'), 
            "lr": LogisticRegression(max_iter=1000, C=3.0, penalty='l1', solver='liblinear'), 
            "svc": SVC(C=3.0, probability=True)}

# Create a KFold object
kf = KFold(n_splits=5, shuffle=True)

warnings.filterwarnings('ignore')

for clf_name, clf in clf_list.items():
    # For storing scores
    scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy * 100.0)

    print(clf.__class__.__name__, clf_name, "Average accuracy: %.2f%%" % np.mean(scores))

warnings.filterwarnings('default')

XGBClassifier xgb Average accuracy: 98.51%
RandomForestClassifier rfc Average accuracy: 98.11%
DecisionTreeClassifier dtc Average accuracy: 95.19%
LogisticRegression lr Average accuracy: 97.63%
SVC svc Average accuracy: 98.53%


In [4]:
import torch
from torch import nn

# Convert your data into torch tensors
data = TensorDataset(torch.Tensor(X), torch.Tensor(y))

# Create DataLoaders
batch_size = 10

# Define the network architecture
class BinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_dim, 64)
        self.layer_2 = nn.Linear(64, 16)
        self.layer_out = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(16)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        # x = self.dropout(x)
        x = self.layer_out(x)
        
        return torch.sigmoid(x)

# Initialize the model, optimizer and loss function
input_dim = X.shape[1]
model = BinaryClassifier(input_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss()

# Define K-Fold Cross Validation
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

avg_acc = []

# Start print
print('--------------------------------')

# K-Fold Cross Validation model evaluation
for fold, (train_ids, test_ids) in enumerate(kfold.split(data)):
    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')
  
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
  
    # Define data loaders for training and testing data in this fold
    trainloader = DataLoader(
                      data, 
                      batch_size=batch_size, sampler=train_subsampler)
    testloader = DataLoader(
                      data,
                      batch_size=batch_size, sampler=test_subsampler)
  
    # Train the model
    model.train()
    for epoch in range(50):  # number of epochs
        for inputs, labels in trainloader:
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward propagation
            outputs = model(inputs)
            
            # Compute loss
            loss = criterion(outputs, labels.view(-1, 1))
            
            # Backward propagation and optimization
            loss.backward()
            optimizer.step()
    
    # Evaluate the model
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels.view(-1, 1)).sum().item()

    avg_acc.append(100.0 * correct / total)
    print('Accuracy for fold %d: %.2f%%' % (fold, 100.0 * correct / total))
    print('--------------------------------')

print('Finished K-Fold Cross Validation')
print('Average accuracy: %.2f%%' % np.mean(avg_acc))

--------------------------------
FOLD 0
--------------------------------
Accuracy for fold 0: 98.52%
--------------------------------
FOLD 1
--------------------------------
Accuracy for fold 1: 98.78%
--------------------------------
FOLD 2
--------------------------------
Accuracy for fold 2: 99.35%
--------------------------------
FOLD 3
--------------------------------
Accuracy for fold 3: 98.00%
--------------------------------
FOLD 4
--------------------------------
Accuracy for fold 4: 99.57%
--------------------------------
Finished K-Fold Cross Validation
Average accuracy: 98.84%
