In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Read the feature table and the label table from the local data directory.
data, labels = (
    pd.read_csv(csv_path)
    for csv_path in ("data/CATSnDOGS.csv", "data/labels.csv")
)

Some pre-processing, e.g. finding the number of principal components (PCs) to use for kNN and similar methods.

In [None]:
# Hold out 20% of the samples with a fixed seed; the scaler and PCA below
# are fitted on the training portion only.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Standardize the training features, fit a PCA with all components on them,
# and project the training data onto those components.
trainX_pp = StandardScaler().fit_transform(train_data)
train_data_pca = PCA().fit(trainX_pp)
principal_components_pca = train_data_pca.transform(trainX_pp)

# Scree plot: explained-variance ratio of the leading components (at most 100).
fig, ax = plt.subplots(figsize=(6, 4))

num_components_to_plot = min(100, len(train_data_pca.explained_variance_ratio_))
ax.plot(
    np.arange(1, num_components_to_plot + 1),
    train_data_pca.explained_variance_ratio_[:num_components_to_plot],
    'ok',
    markersize=2,
)
# Dashed red guide line at component 30.
ax.axvline(30, linestyle="dashed", color="red", linewidth=1)
ax.set(
    title='Scree plot',
    xlabel="Principal Component",
    ylabel="Explained Variance",
)
plt.show()

In [None]:
# Repeated hold-out comparison of several classifiers.
# For each of `num_tests` random 60/40 splits, every classifier is fitted on
# the training part and its accuracy on the held-out part is recorded; the
# mean accuracy per classifier is printed at the end.

# Top 30 PCs look reasonable (see the dashed line in the scree plot).
num_principal_components = 30
num_tests = 50
# Base seed so the reported means are reproducible across kernel restarts.
random_seed = 42

# random_state only matters when PCA uses a randomized/arpack solver; it is
# harmless otherwise.
pca = PCA(n_components=num_principal_components, random_state=random_seed)
scaler = StandardScaler()

# Initialize classifiers
knn_classifier_flexible = KNeighborsClassifier(n_neighbors=3)
knn_classifier_rigid = KNeighborsClassifier(n_neighbors=100)
random_forest = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=random_seed)
LDA = LinearDiscriminantAnalysis()
log_reg = LogisticRegression()
svm_classifier = SVC(kernel='rbf')  # RBF (non-linear) kernel

knn_flexible_scores = []
knn_rigid_scores = []
random_forest_scores = []
LDA_scores = []
log_reg_scores = []
svm_scores = []

# One row per classifier:
# (label printed in the summary, estimator, score list, fit on PCA features?)
# Only the two kNN models use the scaled + PCA-reduced features; the others
# are fitted on the raw features, exactly as before.
# NOTE(review): logistic regression and the RBF SVM are usually sensitive to
# feature scale -- confirm that feeding them unscaled data is intentional.
evaluations = [
    ('KNN flexible scores:', knn_classifier_flexible, knn_flexible_scores, True),
    ('KNN rigid scores:', knn_classifier_rigid, knn_rigid_scores, True),
    ('Random forest scores:', random_forest, random_forest_scores, False),
    ('LDA scores:', LDA, LDA_scores, False),
    ('Logistic regression scores:', log_reg, log_reg_scores, False),
    ('SVM scores:', svm_classifier, svm_scores, False),
]

for i in range(num_tests):
    # A different but reproducible split for every repetition.
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.4, random_state=random_seed + i
    )

    # Flatten the label frames once per split instead of once per call.
    y_train = train_labels.values.ravel()
    y_test = test_labels.values.ravel()

    # Fit the scaler on the training data only, then apply it to both parts
    # so no information from the test part leaks into the preprocessing.
    train_data_scaled = scaler.fit_transform(train_data)
    test_data_scaled = scaler.transform(test_data)

    # Same rule for PCA: fit on the training part, reuse on the test part.
    X_train_pca = pca.fit_transform(train_data_scaled)
    X_test_pca = pca.transform(test_data_scaled)

    for _, estimator, score_list, use_pca_features in evaluations:
        if use_pca_features:
            X_fit, X_eval = X_train_pca, X_test_pca
        else:
            X_fit, X_eval = train_data, test_data
        estimator.fit(X_fit, y_train)
        score_list.append(accuracy_score(y_test, estimator.predict(X_eval)))

print("Test scores for the test set size 40% ")
for label, _, score_list, _ in evaluations:
    print(label, np.mean(score_list))