In [3]:
# Import
import pandas
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [4]:

# Load the labelled training set and the unlabelled set to be predicted
train = pandas.read_csv('TrainingDataMulti.csv')
test = pandas.read_csv('TestingDataMulti.csv')

# Number of leading columns used as model inputs
features = 128

# Fixed seed so the train/validation split is identical on every run
SPLIT_SEED = 42

# Labels vector
y = train['marker']

# Features vector: the first `features` columns, selected by position
x = train.iloc[:, :features]

# Guard against label leakage: the target must not be among the inputs
assert 'marker' not in x.columns, "label column 'marker' leaked into the feature matrix"

# Split dataset into training set and test set (70% training, 30% test).
# Stratifying on the label keeps the class proportions equal in both parts,
# and the seed makes the reported metrics reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)


In [6]:
# Fixed seed so the stochastic estimators give repeatable results
MODEL_SEED = 42

# Define a Classifier (seeded: bootstrap samples and feature sub-sampling are random)
forest = RandomForestClassifier(random_state=MODEL_SEED)

# Define a Principal component analysis (seeded in case a randomized SVD solver is selected)
pca = PCA(random_state=MODEL_SEED)

# Define a Standard Scaler to normalize inputs
scaler = StandardScaler()

# Chain scaling -> PCA -> forest in one estimator so that, during the search,
# the scaler and PCA are fitted only on the training part of each CV fold.
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("forest", forest)])

# Grid to search: number of principal components kept and number of trees
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60, 80, 100],
    "forest__n_estimators": [50, 100, 150, 200],
}

# Find best hyperparameters with 5-fold cross-validation on the training split
search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=5)
search.fit(X_train, y_train)

# Predict labels of the held-out split (not seen during the search)
search_pred = search.predict(X_test)

# Print hyperparameters
print("Best parameters: ", search.best_params_)

# Evaluate the model; macro averaging weights every class equally
accuracy = accuracy_score(y_test, search_pred)
precision = precision_score(y_test, search_pred, average='macro')
recall = recall_score(y_test, search_pred, average='macro')
f1 = f1_score(y_test, search_pred, average='macro')
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

Best parameters:  {'forest__n_estimators': 100, 'pca__n_components': 30}
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       906
           1       0.83      0.81      0.82       478
           2       0.79      0.81      0.80       416

    accuracy                           0.89      1800
   macro avg       0.86      0.86      0.86      1800
weighted avg       0.89      0.89      0.89      1800



In [10]:
# Use model to predict unlabelled dataset.
# Select the inputs the same way the training features were selected (the
# first `features` columns) so an extra trailing column cannot break predict.
test_pred = search.predict(test.iloc[:, :features])

# Exactly one predicted label per input row
assert len(test_pred) == len(test), "prediction count does not match number of test rows"

print(test_pred)
print(test.shape, test_pred.shape)


Accuracy:  0.885
[2 2 2 1 1 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 0 0 0 0 0 0 0 1 1 1 1
 1 1 2 2 2 2 2 1 1 1 2 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 2 2 1 1 1 1 0 0 0 0 0 0 0 0 0]
(100, 128) (100,)


In [11]:
# Work on a copy so the loaded test frame itself stays unmodified,
# then attach the predicted labels as the 'marker' column
op = test.copy()
op['marker'] = test_pred

# Write the labelled rows to disk without the row index
op.to_csv('TestingResultsMulti.csv', index=False)