In [12]:
#Taken from Scikit

import matplotlib.pyplot as plt
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np


In [13]:

# import some data from a predefined datatset
iris = datasets.load_iris()

train = pandas.read_csv('TrainingDataBinary.csv')
test = pandas.read_csv('TestingDataBinary.csv')

features = 128
y = train['marker']
x = train.iloc[:, :features]

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 70% training and 30% test


In [16]:
logreg = LogisticRegression(max_iter=100000)
pca = PCA()
# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("logistic", logreg)])

param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60, 80, 100],
    "logistic__C": np.logspace(-1, 1, 10),
}

search = GridSearchCV(pipe, param_grid, n_jobs=2,cv=5)
search.fit(X_train, y_train)

search_pred = search.predict(X_test)

print("Best parameters: ",search.best_params_)

print("Accuracy: ",accuracy_score(y_test, search_pred))

# Calculate precision
precision = precision_score(y_test, search_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, search_pred)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, search_pred)
print("F1 score:", f1)

Best parameters:  {'logistic__C': 10.0, 'pca__n_components': 100}
Accuracy:  0.9005555555555556
Precision: 0.922543352601156
Recall: 0.8769230769230769
F1 score: 0.8991549295774649


In [18]:
# logreg = LogisticRegression(max_iter=100000, C=search.best_params_['logistic__C'])
# pca = PCA(n_components=search.best_params_['pca__n_components'])
# # Define a Standard Scaler to normalize inputs
# scaler = StandardScaler()
# # Define a pipeline to search for the best combination of PCA truncation
# # and classifier regularization.
# pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("logistic", logreg)])
# pipe.fit(X_train, y_train)

test_pred = search.predict(test)
print(test_pred)
print(test.shape, test_pred.shape)


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
(100, 128) (100,)


In [57]:
# Append predicted labels to test dataset
op = test.assign(marker=test_pred)

# Output csv file
op.to_csv('TestingResultsBinary.csv', index=False)