In [1]:
from util import load_numpy_data_from_split
X_tr, y_tr, X_val, y_val, X_te, y_te = load_numpy_data_from_split(resize_images=False)
num_features_in_first_data_point = X_tr[0].shape[0]
print("Number of features in the first data point:", num_features_in_first_data_point)
print(X_tr.shape, X_val.shape, X_te.shape)



Number of features in the first data point: 73030
(799, 73030) (100, 73030) (100, 73030)


In [2]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# normalize by feature
scaler.fit(X_tr)

# apply to datasets
X_tr_scaled = scaler.transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_te_scaled = scaler.transform(X_te)

pca = PCA()
principalComponents = pca.fit_transform(X_tr_scaled)

print("Explained variance ratio:\n", pca.explained_variance_ratio_)
print("Cumulative explained variance:\n", np.cumsum(pca.explained_variance_ratio_))

n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.999) + 1
print("Number of components explaining 99.9% variance: ", n_components)

pca = PCA(n_components=n_components)
X_tr_p = pca.fit_transform(X_tr_scaled)
X_val_p = pca.transform(X_val_scaled)
X_te_p = pca.transform(X_te_scaled)

Explained variance ratio:
 [2.05311984e-01 4.17890027e-02 2.47591455e-02 1.37391975e-02
 1.05562033e-02 8.82426184e-03 7.28303427e-03 5.90902101e-03
 5.42792445e-03 5.03229350e-03 4.79841791e-03 4.59604943e-03
 4.24889009e-03 4.09477344e-03 3.99180967e-03 3.65215773e-03
 3.61169269e-03 3.52519331e-03 3.41946958e-03 3.33110359e-03
 3.25680850e-03 3.13063036e-03 3.06395674e-03 2.96723121e-03
 2.95070210e-03 2.92282458e-03 2.89649726e-03 2.83088814e-03
 2.77206488e-03 2.71362788e-03 2.65356433e-03 2.64333980e-03
 2.63070664e-03 2.57565523e-03 2.55641225e-03 2.53908359e-03
 2.47714343e-03 2.46588560e-03 2.45357724e-03 2.43100803e-03
 2.41284468e-03 2.37898785e-03 2.34946096e-03 2.33810814e-03
 2.31383601e-03 2.30114651e-03 2.29048287e-03 2.27103638e-03
 2.24809023e-03 2.22032773e-03 2.17491342e-03 2.15789001e-03
 2.14207382e-03 2.13187397e-03 2.11567921e-03 2.10133102e-03
 2.09160196e-03 2.07458483e-03 2.05421285e-03 2.03801575e-03
 2.02954444e-03 2.00637337e-03 1.98919978e-03 1.97551609e-

In [17]:

from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from statistics import mode

def do_knn(X_tr, y_tr, X_val, y_val, k, metric):
    # regular knn for comparison
    y_pred = np.zeros((k, len(y_val)))
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric, n_jobs=-1).fit(X_tr)
    distances, neighbors_list = nbrs.kneighbors(X_val)
    for (index, neighbors) in enumerate(neighbors_list):
        for j in range(k):
            predicted = mode(y_tr[neighbors[0:j+1]])
            y_pred[j, index] = predicted
                
    return y_pred

# KNN grid search
metrics = ["euclidean", "cityblock", "cosine", "correlation"]
accuracy_scores = np.zeros((4, 9))
for (metric_idx, j) in enumerate(metrics):
    y_pred = do_knn(X_tr_scaled, y_tr, X_val_scaled, y_val, 9, metric=j)
    for (k_idx, pred) in enumerate(y_pred):
        accuracy_scores[metric_idx, k_idx] = accuracy_score(y_val, pred)

max_idx = np.unravel_index(np.argmax(accuracy_scores, axis=None), accuracy_scores.shape)
max_metric = metrics[max_idx[0]]
max_k = max_idx[1] + 1  # Adding 1 because k starts from 1, not 0
max_accuracy_score = accuracy_scores[max_idx]

print(accuracy_scores)
print("Maximum validation accuracy:", max_accuracy_score)
print("Optimal k:", max_k)
print("Optimal Distance Metric:", max_metric)

[[0.35 0.35 0.36 0.33 0.32 0.3  0.31 0.3  0.3 ]
 [0.34 0.34 0.32 0.31 0.28 0.26 0.27 0.26 0.26]
 [0.43 0.43 0.41 0.36 0.37 0.36 0.35 0.35 0.33]
 [0.46 0.46 0.46 0.46 0.41 0.41 0.42 0.41 0.42]]
Maximum validation accuracy: 0.46
Optimal k: 1
Optimal Distance Metric: correlation


In [18]:
knbrs = KNeighborsClassifier(n_neighbors=max_k, metric=max_metric)
knbrs.fit(X_tr_scaled, y_tr)
accuracy = knbrs.score(X_te_scaled, y_te)
print("knn test accuracy:", accuracy)

knn test accuracy: 0.41


In [20]:

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

C = [np.float_power(2,i) for i in range(-5,5)]
kernels = ['poly', 'rbf', 'sigmoid'] 
kernel_score_dict = {"poly": [], "rbf": [], "sigmoid": []}

for kern in kernels:
    print(f'validation for kernel {kern}')
    for reg in C: 
        print(f'C={reg}')
        clf = SVC(C = reg, kernel = kern, max_iter = 1000).fit(X_tr_p, y_tr)
        preds = clf.predict(X_val_p)
        score = accuracy_score(y_val, preds) 
        kernel_score_dict[kern].append(score)

print(kernel_score_dict)

## SVM LINEAR

svc_scores = []
print(f'validation for linear')
for rg in C: 
    print(f'C={rg}')
    clf = LinearSVC(C = rg, max_iter=1000).fit(X_tr_p, y_tr)
    preds = clf.predict(X_val_p)
    score = accuracy_score(y_val, preds)
    svc_scores.append(score)

print(svc_scores)

validation for kernel poly
C=0.03125
C=0.0625
C=0.125
C=0.25
C=0.5
C=1.0
C=2.0
C=4.0
C=8.0
C=16.0
validation for kernel rbf
C=0.03125
C=0.0625
C=0.125
C=0.25
C=0.5
C=1.0
C=2.0
C=4.0
C=8.0
C=16.0
validation for kernel sigmoid
C=0.03125
C=0.0625
C=0.125
C=0.25
C=0.5
C=1.0
C=2.0




C=4.0
C=8.0
C=16.0
{'poly': [0.1, 0.1, 0.12, 0.12, 0.18, 0.28, 0.32, 0.31, 0.29, 0.34], 'rbf': [0.32, 0.32, 0.32, 0.32, 0.35, 0.44, 0.49, 0.49, 0.49, 0.49], 'sigmoid': [0.21, 0.21, 0.26, 0.34, 0.4, 0.45, 0.47, 0.47, 0.47, 0.48]}
validation for linear
C=0.03125




C=0.0625




C=0.125




C=0.25




C=0.5




C=1.0




C=2.0




C=4.0




C=8.0




C=16.0
[0.48, 0.48, 0.48, 0.48, 0.47, 0.47, 0.47, 0.47, 0.48, 0.47]




In [22]:
# test kernel
clf = SVC(C = 2.0, kernel = 'rbf', max_iter = 1000).fit(X_tr_p, y_tr)
preds = clf.predict(X_te_p)
print('kernel accuracy', accuracy_score(y_te, preds))

# test linear
clf = LinearSVC(C = 0.03125, max_iter=1000).fit(X_tr_p, y_tr)
preds = clf.predict(X_te_p)
print('linear accuracy', accuracy_score(y_te, preds))

kernel accuracy 0.47
linear accuracy 0.46




In [16]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.multiclass import OneVsOneClassifier

# RF test
Ts = range(100, 901, 100)
rf_scores = []
bag_scores = []
grad_scores = []
for T in Ts:
    print(f"T = {T}")
    best_rf = RandomForestClassifier(n_estimators = T, n_jobs = -1).fit(X_tr_p, y_tr) 
    best_rf_preds = best_rf.predict(X_val_p)
    rf_score = accuracy_score(y_val, best_rf_preds)
    rf_scores.append(rf_score)

    bag = BaggingClassifier(n_estimators = T, n_jobs = -1).fit(X_tr_p, y_tr) 
    bag_preds = bag.predict(X_val_p)
    bag_score = accuracy_score(y_val, bag_preds)
    bag_scores.append(bag_score)

    gb_clf = GradientBoostingClassifier(n_estimators=T, random_state=10)
    ovo_clf = OneVsOneClassifier(gb_clf, n_jobs=-1)
    ovo_clf.fit(X_tr_p, y_tr)
    grad_preds = ovo_clf.predict(X_te_p) 
    grad_score = accuracy_score(y_te, grad_preds)
    grad_scores.append(grad_score)

print(rf_scores)
print(bag_scores)
print(grad_scores)

T = 100
T = 200
T = 300
T = 400
T = 500
T = 600
T = 700
T = 800
T = 900
[0.3, 0.35, 0.35, 0.35, 0.31, 0.31, 0.36, 0.34, 0.35]
[0.34, 0.39, 0.36, 0.38, 0.35, 0.39, 0.37, 0.37, 0.39]
[0.55, 0.53, 0.5, 0.49, 0.49, 0.49, 0.49, 0.49, 0.49]


In [19]:
best_rf = RandomForestClassifier(n_estimators = 700, n_jobs = -1).fit(X_tr_p, y_tr)
best_rf_preds = best_rf.predict(X_te_p)
print('rf accuracy', accuracy_score(y_te, best_rf_preds))

bag = BaggingClassifier(n_estimators = 200, n_jobs = -1).fit(X_tr_p, y_tr) 
bag_preds = bag.predict(X_te_p)
print('bagging accuracy', accuracy_score(y_te, bag_preds))

gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=10)
ovo_clf = OneVsOneClassifier(gb_clf, n_jobs=-1)
ovo_clf.fit(X_tr_p, y_tr)
grad_preds = ovo_clf.predict(X_te_p) 
grad_score = accuracy_score(y_te, grad_preds)
print('grad accuracy', grad_score)

gb_clf = HistGradientBoostingClassifier(learning_rate=1, random_state=10)
ovo_clf = OneVsOneClassifier(gb_clf, n_jobs=-1)
ovo_clf.fit(X_tr_p, y_tr)
hist_grad_preds = ovo_clf.predict(X_te_p) 
hist_grad_score = accuracy_score(y_te, hist_grad_preds)
print('hist grad accuracy', hist_grad_score)

rf accuracy 0.42
bagging accuracy 0.49
grad accuracy 0.55
hist grad accuracy 0.44


In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
T = 200
adaboost_stats = []
for i in list(range(10,15)):
    print(f'max_depth={i}')
    tree = DecisionTreeClassifier(max_depth=i)
    a_clf = AdaBoostClassifier(
        tree, n_estimators=T, learning_rate=1, random_state=10
    )
    a_clf.fit(X_tr_p, y_tr)
    preds = a_clf.predict(X_val_p)
    score = accuracy_score(y_val, preds)
    adaboost_stats.append(score)
print(adaboost_stats)

max_depth=10
max_depth=11
max_depth=12
max_depth=13
max_depth=14
[0.35, 0.32, 0.31, 0.31, 0.28]


In [14]:
a_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10), n_estimators=200, learning_rate=1, random_state=10
)
a_clf.fit(X_tr_p, y_tr)
preds = a_clf.predict(X_te_p)
print('adaboost accuracy', accuracy_score(y_te, preds))

adaboost accuracy 0.33
