In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

# Path to the CSV file
# csv_file_path = './data/features_30_sec.csv'
csv_file_path = './data/features_3_sec.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

labels, unique_values = pd.factorize(df["label"])
y = np.array(labels)
columns_to_exclude = ["filename", "length", "label"]
X = df.drop(columns=columns_to_exclude)
X = X.to_numpy()

In [5]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    random_state=10, 
    test_size=0.2, 
    shuffle=True
)
np.shape(X)

(9990, 57)

In [41]:
from sklearn.neighbors import NearestNeighbors
from statistics import mode

def do_knn(X_tr, y_tr, X_te, y_te, k, metric):
    # regular knn for comparison
    y_pred = np.zeros((k, len(y_te)))
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric, n_jobs=-1).fit(X_tr)
    distances, neighbors_list = nbrs.kneighbors(X_te)
    for (index, neighbors) in enumerate(neighbors_list):
        for j in range(k):
            predicted = mode(y_tr[neighbors[0:j+1]])
            y_pred[j, index] = predicted
                
    return y_pred

# KNN grid search
metrics = ["euclidean", "cityblock", "cosine", "correlation"]
f1_scores = np.zeros((4, 9))
for (metric_idx, j) in enumerate(metrics):
    y_pred = do_knn(X_tr, y_tr, X_te, y_te, 9, metric=j)
    for (k_idx, pred) in enumerate(y_pred):
        f1_scores[metric_idx, k_idx] = f1_score(y_te, pred, average = 'weighted')
print(f1_scores)

[[0.25832865 0.25832865 0.27689747 0.27253836 0.28424416 0.27806402
  0.28771651 0.30122999 0.30161326]
 [0.28711491 0.28711491 0.29035182 0.30718813 0.30897587 0.30819699
  0.31150259 0.31509948 0.31639468]
 [0.26527937 0.26527937 0.27049472 0.27068322 0.2725737  0.27693049
  0.27748912 0.28051229 0.27476306]
 [0.26551487 0.26551487 0.27085829 0.27021867 0.27136074 0.2757249
  0.28062028 0.27961643 0.27603489]]


In [6]:

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

C = [np.float_power(2,i) for i in range(-5,5)]
kernels = ['poly', 'rbf', 'sigmoid'] 
kernel_score_dict = {"poly": [], "rbf": [], "sigmoid": []}

for kern in kernels:
    print(f'validation for kernel {kern}')
    for reg in C: 
        print(f'C={reg}')
        clf = SVC(C = reg, kernel = kern, max_iter = 1000).fit(X_tr, y_tr)
        preds = clf.predict(X_te)
        score = f1_score(y_te, preds, average = 'weighted') 
        kernel_score_dict[kern].append(score)

print(kernel_score_dict)

## SVM LINEAR

svc_scores = []
print(f'validation for linear')
for rg in C: 
    print(f'C={rg}')
    clf = LinearSVC(C = rg, max_iter=1000).fit(X_tr, y_tr)
    preds = clf.predict(X_te)
    score = f1_score(y_te, preds, average = "weighted")
    svc_scores.append(score)

print(svc_scores)

{'poly': [], 'rbf': [], 'sigmoid': []}
validation for linear
C=0.03125




C=0.0625




C=0.125




C=0.25




C=0.5




C=1.0




C=2.0




C=4.0




C=8.0




C=16.0
[0.16909785216988718, 0.16484832970999275, 0.1488683005430971, 0.09959933573786914, 0.2139331898211335, 0.11450931204769171, 0.1735159366563997, 0.1504620910561721, 0.12008064293173254, 0.11670651861300392]




In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

# RF test
Ts = range(100, 901, 100)
rf_scores = []
bag_scores = []
for T in Ts:
    print(f"T = {T}")
    best_rf = RandomForestClassifier(n_estimators = T, n_jobs = -1).fit(X_tr, y_tr) 
    best_rf_preds = best_rf.predict(X_te)
    rf_score = f1_score(y_te, best_rf_preds, average = 'weighted')
    rf_scores.append(rf_score)

    bag = BaggingClassifier(n_estimators = T, n_jobs = -1).fit(X_tr, y_tr) 
    bag_preds = bag.predict(X_te)
    bag_score = f1_score(y_te, bag_preds, average = 'weighted')
    bag_scores.append(bag_score)

print(rf_scores)
print(bag_scores)

T = 100
T = 200
T = 300
T = 400
T = 500
T = 600
T = 700
T = 800
T = 900
[0.8754799868015698, 0.8870865308831211, 0.8821555312165804, 0.8858112876110087, 0.8818824369379256, 0.8822791062794109, 0.8878741994453422, 0.8849957583650253, 0.8880210415803891]
[0.8537496300430408, 0.862370301590455, 0.861995319632887, 0.867951645868158, 0.8585754965917217, 0.8667703462539036, 0.8661935633088172, 0.8659254819375487, 0.8634885387570647]


In [8]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
T = 200
adaboost_stats = []
for i in list(range(10,15)):
    print(f'max_depth={i}')
    tree = DecisionTreeClassifier(max_depth=i)
    a_clf = AdaBoostClassifier(
        tree, n_estimators=T, learning_rate=1, random_state=10
    )
    a_clf.fit(X_tr, y_tr)
    preds = a_clf.predict(X_te)
    score = f1_score(y_te, preds, average = 'weighted')
    adaboost_stats.append(score)
print(adaboost_stats) # optimal max_depth = 13 on 3 sec

max_depth=10
max_depth=11
max_depth=12
max_depth=13
max_depth=14
[0.8685885236088535, 0.8789342196136323, 0.8836291141276624, 0.8939792083821787, 0.8862795171236887]


In [11]:
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.multiclass import OneVsOneClassifier

gb_clf = HistGradientBoostingClassifier(learning_rate=1, random_state=10)
ovo_clf = OneVsOneClassifier(gb_clf, n_jobs=-1)
ovo_clf.fit(X_tr, y_tr)
hist_grad_preds = ovo_clf.predict(X_te) 
hist_grad_score = f1_score(y_te, hist_grad_preds, average = "weighted")
print(hist_grad_score) # 0.8900480050852958 - 3 sec

gb_clf = GradientBoostingClassifier(n_estimators=200, random_state=10)
ovo_clf = OneVsOneClassifier(gb_clf, n_jobs=-1)
ovo_clf.fit(X_tr, y_tr)
grad_preds = ovo_clf.predict(X_te) 
grad_score = f1_score(y_te, grad_preds, average = "weighted")
print(grad_score) # 0.8775735978500271 - 3 sec

0.8900480050852958
0.8862316085701015
