In [1]:
#Taken from Scikit

import pandas
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np


In [2]:
# import some data from a predefined datatset
iris = datasets.load_iris()

train = pandas.read_csv('TrainingDataBinary.csv')
test = pandas.read_csv('TestingDataBinary.csv')

features = 128

# Labels vector 
y = train['marker']

#Features vector
x = train.iloc[:, :features]

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 70% training and 30% test


In [3]:
# Define a Classifier
logreg = LogisticRegression(max_iter=100000)

# Define a Principal component analysis
pca = PCA()

# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("logistic", logreg)])

# Define set of parameters to GridSearch
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60, 80, 100],
    "logistic__C": np.logspace(-1, 1, 10),
}

# Do GridSearch to find most optimal hyperparameters
search = GridSearchCV(pipe, param_grid, n_jobs=2,cv=5)

# Fit the model in train dataset
search.fit(X_train, y_train)

# Predict train labels
search_pred = search.predict(X_test)

# Print best hyperparameters found
print("Best parameters: ",search.best_params_)

# Evaluate the performance of the model
print("Accuracy: ",accuracy_score(y_test, search_pred))

# Calculate precision
precision = precision_score(y_test, search_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, search_pred)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, search_pred)
print("F1 score:", f1)

Best parameters:  {'logistic__C': 10.0, 'pca__n_components': 100}
Accuracy:  0.8961111111111111
Precision: 0.9233449477351916
Recall: 0.8679039301310044
F1 score: 0.8947664603263927


In [4]:
# Use the model to predict unlabelled data.
test_pred = search.predict(test)
print(test_pred)
print(test.shape, test_pred.shape)


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 1 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
(100, 128) (100,)


In [5]:
# Append predicted labels to test dataset
op = test.assign(marker=test_pred)

# Output csv file
op.to_csv('TestingResultsBinary.csv', index=False)