# 1. Imports

In [1]:
from sklearn.decomposition import PCA
import matplotlib as mpl 
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler


from utils import stratified_train_test_group_kfold
from utils import model_test_classification

from classification_codes import knc_gridsearch



0.10.1


# 2. Data

In [2]:

# Load the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
with open("Data/MoS2_Analysis_Cropped_trained", "rb") as fp:   # Unpickling
    MoS2_Proj1_Class_Data = pickle.load(fp)

#features = MoS2_ImageNet_100_data[0]
df = pd.DataFrame(MoS2_Proj1_Class_Data)
#T_dict= {900:0, 950:1, 1000:2}

# Classification target: the 'T' column (presumably a temperature label, judging
# by the commented-out T_dict above — confirm against the data source).
T_target = np.array(list(df['T']))

# One feature vector per row, stacked into a 2-D array; sample ids are kept so
# that the splitter can group rows by sample.
features = np.array(list(df['ImageNet']))
sampleId = list(df['sampleId'])

X = features
Y = np.array(T_target)
groups = np.array(sampleId)


# Hold-out split: fold 0 of a 10-way stratified group split is the test set.
# The split is taken BEFORE scaling, so these arrays hold the unscaled features.
train_val_groups, train_val_X, train_val_Y, test_X, test_Y = stratified_train_test_group_kfold(X, Y, groups, n_splits=10, test_fold=0)

# NOTE(review): the scaler is fit on ALL of X, including rows that later serve as
# validation folds in the cross-validation cells, so scaling statistics leak
# from validation data into training. Kept as-is to preserve reported numbers.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
#val_X = scaler.transform(val_X)

Y = Y.flatten()


# Report the shapes of the hold-out split. The original cell printed train_X,
# train_Y, val_X and val_Y, which are not defined at this point and raise
# NameError on a fresh kernel.
print(train_val_X.shape)
print(train_val_Y.shape)
print(test_X.shape)
print(test_Y.shape)


# 3. Running Cross-Validation

In [5]:
# Hyper-parameter grids handed to knc_gridsearch for every fold.
N_neighbors = list(range(3, 15))   # 3 .. 14 inclusive
Weights = ['uniform', 'distance']
Algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
P = list(range(1, 6))              # 1 .. 5 inclusive


def cross_10_folds_knn(X, Y, n_splits=10):
    """Run ``knc_gridsearch`` once per cross-validation fold and collect the results.

    For each fold the data are split with ``stratified_train_test_group_kfold``
    using the module-level ``groups`` array, then ``knc_gridsearch`` is called
    with the module-level ``N_neighbors``, ``Weights``, ``Algorithm`` and ``P``
    grids and a per-fold model path.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to the splitter.
    Y : array-like
        Target labels handed to the splitter.
    n_splits : int, default 10
        Number of folds. Every fold index in ``range(n_splits)`` is used once as
        the validation fold. The default reproduces the original 10-fold run.

    Returns
    -------
    best_train : list
        ``performance_best['accuracy_train']`` for each fold.
    best_val : list
        ``performance_best['accuracy_val']`` for each fold.
    best_variables : list
        First return value of ``knc_gridsearch`` for each fold.
    """
    best_train = []
    best_val = []
    best_variables = []

    for fold in range(n_splits):
        model_path = f'classification/ImageNet/aug1/KNC_{fold}_model.sav'

        # The first element returned by the splitter (group info) is not needed here.
        _, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(
            X, Y, groups, n_splits=n_splits, test_fold=fold
        )

        #train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        variables_best, performance_best = knc_gridsearch(
            train_X, train_Y, val_X, val_Y,
            N_neighbors, Weights, Algorithm, P,
            fold, model_path,
        )

        best_train.append(performance_best['accuracy_train'])
        best_val.append(performance_best['accuracy_val'])
        best_variables.append(variables_best)
        print(f'fold: {fold} done!')

    return best_train, best_val, best_variables


# Run the per-fold grid search, then show the per-fold accuracies followed by
# their mean and standard deviation.
best_train, best_val, best_variables = cross_10_folds_knn(X, Y)
for per_fold_accuracy in (best_train, best_val):
    print(per_fold_accuracy)
print(f'train_acc_mean: {np.mean(best_train)}, std: {np.std(best_train)}, val_acc_mean: {np.mean(best_val)}, std: {np.std(best_val)}')

<class 'generator'>
best_train_acc: 1.000, best_val_acc: 0.432
best_train_acc: 1.000, best_val_acc: 0.459
best_train_acc: 1.000, best_val_acc: 0.541
best_train_acc: 1.000, best_val_acc: 0.568
best_train_acc: 0.910, best_val_acc: 0.595
fold: 0 done!
<class 'generator'>
best_train_acc: 1.000, best_val_acc: 0.149
best_train_acc: 1.000, best_val_acc: 0.270
best_train_acc: 1.000, best_val_acc: 0.297
best_train_acc: 0.898, best_val_acc: 0.392
best_train_acc: 0.821, best_val_acc: 0.500
best_train_acc: 0.742, best_val_acc: 0.514
fold: 1 done!
<class 'generator'>
best_train_acc: 1.000, best_val_acc: 0.347
best_train_acc: 0.901, best_val_acc: 0.431
best_train_acc: 0.737, best_val_acc: 0.444
best_train_acc: 0.716, best_val_acc: 0.514
best_train_acc: 0.721, best_val_acc: 0.528
best_train_acc: 1.000, best_val_acc: 0.569
fold: 2 done!
<class 'generator'>
best_train_acc: 1.000, best_val_acc: 0.403
best_train_acc: 1.000, best_val_acc: 0.528
best_train_acc: 0.895, best_val_acc: 0.556
best_train_acc: 0.

In [6]:
import numpy as np
from sklearn.metrics import f1_score


def f1score_fn(X, Y, n_splits=10):
    """Compute macro-averaged F1 on every fold using the models saved on disk.

    For each fold the data are split with ``stratified_train_test_group_kfold``
    using the module-level ``groups`` array, the pickled model for that fold is
    loaded from ``Model/classification/ImageNet/aug1/KNC_{fold}_model.sav``, and
    ``f1_score(..., average='macro')`` is evaluated on the training and
    validation parts.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to the splitter.
    Y : array-like
        Target labels handed to the splitter.
    n_splits : int, default 10
        Number of folds. A saved model must exist for every fold index in
        ``range(n_splits)``. The default reproduces the original 10-fold run.

    Returns
    -------
    F1_score_train : list of float
        Macro F1 on the training part of each fold.
    F1_score_val : list of float
        Macro F1 on the validation part of each fold.

    Raises
    ------
    FileNotFoundError
        Raised by ``open`` when a fold's model file is missing.
    """
    F1_score_train = []
    F1_score_val = []

    for fold in range(n_splits):
        # The first element returned by the splitter (group info) is not needed here.
        _, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(
            X, Y, groups, n_splits=n_splits, test_fold=fold
        )
        #train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        model_path = f'classification/ImageNet/aug1/KNC_{fold}_model.sav'

        PATH = os.path.join('Model', model_path)

        # Use a context manager so the file handle is closed after each fold;
        # the original passed a bare open() to pickle.load and never closed it.
        # NOTE(review): pickle.load can execute arbitrary code; only load model
        # files that this project produced itself.
        with open(PATH, 'rb') as model_file:
            loaded_model = pickle.load(model_file)

        pred_val_Y = loaded_model.predict(val_X)
        pred_train_Y = loaded_model.predict(train_X)

        f_score_train = f1_score(train_Y, pred_train_Y, average='macro')#'weighted')
        f_score_val = f1_score(val_Y, pred_val_Y, average='macro')#'weighted')

        F1_score_train.append(f_score_train)
        F1_score_val.append(f_score_val)

    return F1_score_train, F1_score_val



# Evaluate the saved per-fold models on the scaled features X and labels Y.
F1_score_train, F1_score_val = f1score_fn(X, Y)


# Summarise the per-fold macro F1 scores as mean and standard deviation.
print(f'f1_train: {np.mean(F1_score_train)}, std: {np.std(F1_score_train)}, f1_val: {np.mean(F1_score_val)}, std: {np.std(F1_score_val)}')


<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
f1_train: 0.8753405583079473, std: 0.1375301828154821, f1_val: 0.47929096130042026, std: 0.1250943793043801
