In [1]:
# Taken from scikit-learn

import matplotlib.pyplot as plt
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, svm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [14]:

# NOTE(review): `iris` is not referenced by any cell visible in this notebook;
# it is kept only so the set of names defined by this cell stays unchanged.
iris = datasets.load_iris()

# Load the labelled training rows and the rows that are scored later on.
train = pandas.read_csv('TrainingDataMulti.csv')
test = pandas.read_csv('TestingDataMulti.csv')

# The first `features` columns are the model inputs; 'marker' is the class label.
features = 128
y = train['marker']
x = train.iloc[:, :features]

# Hold out 30% of the labelled rows for evaluation (70% train / 30% test).
#  - random_state pins the shuffle so Restart & Run All reproduces the same
#    partition and therefore the same reported metrics.
#  - stratify=y keeps the class proportions equal in both partitions, which
#    matters because the classes are not evenly represented.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)


In [15]:
# Both stochastic steps are seeded so the grid-search ranking and the final
# report are reproducible across kernel restarts:
#  - the forest bootstraps rows and samples candidate features at each split
#  - PCA may select a randomized SVD solver depending on data shape / version
forest = RandomForestClassifier(random_state=42)
pca = PCA(random_state=42)
# Standardize inputs so PCA is not dominated by large-scale features.
scaler = StandardScaler()
# Pipeline searched over: scale -> PCA truncation -> random forest.
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("forest", forest)])

# Candidate values: number of retained principal components and forest size.
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60, 80, 100],
    "forest__n_estimators": [50, 100, 150, 200],
}

# Exhaustive search with 5-fold cross-validation on the training partition.
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5)
search.fit(X_train, y_train)

# Predictions on the held-out partition from the selected configuration.
search_pred = search.predict(X_test)

print("Best parameters: ",search.best_params_)

# Per-class precision / recall / F1 of search_pred against y_test.
report = classification_report(y_test, search_pred)
print("Classification Report:")
print(report)

Best parameters:  {'forest__n_estimators': 200, 'pca__n_components': 30}
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       894
           1       0.83      0.85      0.84       421
           2       0.88      0.81      0.84       485

    accuracy                           0.90      1800
   macro avg       0.88      0.88      0.88      1800
weighted avg       0.90      0.90      0.90      1800



In [16]:
# Overall hold-out accuracy, computed from the predictions already made
# for X_test by the fitted grid search.
holdout_accuracy = accuracy_score(y_test, search_pred)
print("Accuracy: ", holdout_accuracy)

# Score the separate testing file with the same fitted search object
# (GridSearchCV refits the best pipeline on the training data by default).
test_pred = search.predict(test)
print(test_pred)

# Sanity check: one prediction per input row.
input_shape = test.shape
output_shape = test_pred.shape
print(input_shape, output_shape)


Accuracy:  0.9005555555555556
[2 1 1 2 2 2 1 1 2 2 2 1 1 2 2 2 2 1 2 2 2 1 0 0 0 0 0 2 2 0 0 0 0 1 1 1 1
 1 1 1 1 2 2 2 1 0 0 1 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 2 2 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
(100, 128) (100,)


In [17]:
# Work on a copy of the test frame and attach the predicted class to every
# row as a 'marker' column; `test` itself is left untouched.
op = test.copy()
op['marker'] = test_pred

# Write the labelled rows to disk, leaving the DataFrame index out of the file.
results_path = 'TestingResultsMulti.csv'
op.to_csv(results_path, index=False)

In [None]:
# Macro-averaged hold-out metrics: each score is the unweighted mean of the
# per-class values, so every class counts equally regardless of its support.
# y_test holds the true labels and search_pred the grid-search predictions.
precision = precision_score(y_test, search_pred, average='macro')
recall = recall_score(y_test, search_pred, average='macro')
f1 = f1_score(y_test, search_pred, average='macro')

# Show the results; previously they were computed but never displayed.
print(f"Macro precision: {precision:.4f}")
print(f"Macro recall:    {recall:.4f}")
print(f"Macro F1:        {f1:.4f}")