# 1. Imports

In [1]:
from sklearn.decomposition import PCA
import matplotlib as mpl 
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


from utils import stratified_train_test_group_kfold
from utils import model_test_classification

from classification_codes import svc_gridsearch



# 2. Data

In [2]:
with open("Data/MoS2_Analysis_Data_trained2", "rb") as fp:   # Unpickling
    MoS2_Proj1_Class_Data = pickle.load(fp)

#features = MoS2_ImageNet_100_data[0]
df = pd.DataFrame(MoS2_Proj1_Class_Data)
print(df.keys())
T_dict= {900:0, 950:1, 1000:2}
T_target = np.array([T_dict[item] for item in df['T']])

features = np.array(list(df['ImageNet']))
sampleId = np.array(list(df['sampleId']))

X = features
Y = np.array(T_target)
groups = np.array(sampleId)


train_val_groups, train_val_X, train_val_Y, test_X, test_Y = stratified_train_test_group_kfold(X, Y, groups, n_splits=10, test_fold=0)

train_val_Y = train_val_Y.flatten()
test_Y = test_Y.flatten()

print(train_val_X.shape)
print(train_val_Y.shape)
print(test_X.shape)
print(test_Y.shape)


oversample = RandomOverSampler(sampling_strategy='not majority')

scaler = StandardScaler().fit(train_val_X)
train_val_X = scaler.transform(train_val_X)
test_X = scaler.transform(test_X)

Index(['sampleId', 'sampleLabel', 'image', 'T', 'ImageNet', 'MicroNet'], dtype='object')
<class 'generator'>
(235, 100)
(235,)
(27, 100)
(27,)


# 3. Runing Cross-Validation

In [3]:
Ce = [1.0, 0.8, 0.2]
Kernel= ['poly']#, 'rbf', 'sigmoid']#, 'precomputed']
Degree = [i for i in range(2, 8)]
Gamma = ['scale', 'auto', 0.1, 0.2, 0.3, 0.34]# or float, default=’scale’ 0.34 is the best
Coef0 = [0.8, 0.2, 0.05, 0.0]

Max_iter = [90, 100, 120, -1]
variables = [Ce, Kernel, Degree, Gamma, Coef0, Max_iter]


def cross_10_folds_svc(train_val_X, train_val_Y):
    
    best_train = []
    best_val = []
    best_variables = []
    for fold in range(10):
        
        model_path = f'classification/ImageNet/aug3/SVC_{fold}_model.sav'
        
        group, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(train_val_X, train_val_Y, train_val_groups, n_splits=10, test_fold=fold)
        
        train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        
        variables_best, performance_best = svc_gridsearch(train_X, train_Y, val_X, val_Y, Ce, Kernel, Degree, Gamma, Coef0, Max_iter, fold, model_path)
        best_train.append(performance_best['accuracy_train'])
        best_val.append(performance_best['accuracy_val'])
        best_variables.append(variables_best)
        print(f'fold: {fold} done!')
    return best_train, best_val, best_variables


best_train, best_val, best_variables = cross_10_folds_svc(train_val_X, train_val_Y)
print(best_train)
print(best_val)
print(f'train_acc_mean: {np.mean(best_train) :.3f}, std: {np.std(best_train) :.3f}, val_acc_mean: {np.mean(best_val):.3f}, std: {np.std(best_val):.3f}')


<class 'generator'>
best_train_acc: 0.9365079365079365, best_val_acc: 0.6666666666666666
best_train_acc: 0.9428571428571428, best_val_acc: 0.7083333333333334




best_train_acc: 0.8920634920634921, best_val_acc: 0.75




best_train_acc: 0.7206349206349206, best_val_acc: 0.7916666666666666




fold: 0 done!
<class 'generator'>
best_train_acc: 0.9371069182389937, best_val_acc: 0.8333333333333334




best_train_acc: 0.9371069182389937, best_val_acc: 0.875




fold: 1 done!
<class 'generator'>
best_train_acc: 0.9591194968553459, best_val_acc: 0.6666666666666666
best_train_acc: 0.9559748427672956, best_val_acc: 0.7083333333333334
best_train_acc: 0.9182389937106918, best_val_acc: 0.75




best_train_acc: 0.940251572327044, best_val_acc: 0.7916666666666666




fold: 2 done!
<class 'generator'>
best_train_acc: 0.946031746031746, best_val_acc: 0.75
best_train_acc: 0.946031746031746, best_val_acc: 0.7916666666666666




fold: 3 done!
<class 'generator'>
best_train_acc: 0.9396825396825397, best_val_acc: 0.75
best_train_acc: 0.9396825396825397, best_val_acc: 0.7916666666666666




fold: 4 done!
<class 'generator'>
best_train_acc: 0.9333333333333333, best_val_acc: 0.782608695652174
best_train_acc: 0.9365079365079365, best_val_acc: 0.8260869565217391




best_train_acc: 0.9111111111111111, best_val_acc: 0.8695652173913043




best_train_acc: 0.9301587301587302, best_val_acc: 0.9130434782608695




fold: 5 done!
<class 'generator'>
best_train_acc: 0.9301587301587302, best_val_acc: 0.6086956521739131




best_train_acc: 0.9333333333333333, best_val_acc: 0.6521739130434783
best_train_acc: 1.0, best_val_acc: 0.6956521739130435




best_train_acc: 0.9841269841269841, best_val_acc: 0.7391304347826086




best_train_acc: 1.0, best_val_acc: 0.782608695652174




fold: 6 done!
<class 'generator'>
best_train_acc: 0.9365079365079365, best_val_acc: 0.7391304347826086




best_train_acc: 0.9936507936507937, best_val_acc: 0.782608695652174




best_train_acc: 0.9714285714285714, best_val_acc: 0.8260869565217391




fold: 7 done!
<class 'generator'>
best_train_acc: 0.9396825396825397, best_val_acc: 0.5217391304347826




best_train_acc: 0.8984126984126984, best_val_acc: 0.5652173913043478




best_train_acc: 0.8095238095238095, best_val_acc: 0.6956521739130435




fold: 8 done!
<class 'generator'>
best_train_acc: 0.9276729559748428, best_val_acc: 0.6521739130434783
best_train_acc: 0.9245283018867925, best_val_acc: 0.7391304347826086




fold: 9 done!
[0.7206349206349206, 0.9371069182389937, 0.940251572327044, 0.946031746031746, 0.9396825396825397, 0.9301587301587302, 1.0, 0.9714285714285714, 0.8095238095238095, 0.9245283018867925]
[0.7916666666666666, 0.875, 0.7916666666666666, 0.7916666666666666, 0.7916666666666666, 0.9130434782608695, 0.782608695652174, 0.8260869565217391, 0.6956521739130435, 0.7391304347826086]
train_acc_mean: 0.912, std: 0.079, val_acc_mean: 0.800, std: 0.059




In [4]:
import numpy as np
from sklearn.metrics import f1_score

X, Y = train_val_X, train_val_Y
groups = train_val_groups
def f1score_fn(X, Y):

    F1_score_train = []
    F1_score_val = []
    for fold in range(10):
        
        group, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(X, Y, groups, n_splits=10, test_fold=fold)
        #train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        model_path = f'classification/ImageNet/aug3/SVC_{fold}_model.sav'
        
        PATH = os.path.join('Models', model_path)
        
        loaded_model = pickle.load(open(PATH, 'rb'))

        pred_val_Y = loaded_model.predict(val_X)
        pred_train_Y = loaded_model.predict(train_X)

        f_score_train = f1_score(train_Y, pred_train_Y, average='macro')#'weighted')
        f_score_val = f1_score(val_Y, pred_val_Y, average='macro')#'weighted')

        F1_score_train.append(f_score_train)
        F1_score_val.append(f_score_val)
    
    return F1_score_train, F1_score_val



F1_score_train, F1_score_val = f1score_fn(X, Y)


print(f'f1_train: {np.mean(F1_score_train)}, std: {np.std(F1_score_train)}, f1_val: {np.mean(F1_score_val)}, std: {np.std(F1_score_val)}')


<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
f1_train: 0.9040781612855133, std: 0.07184542528004038, f1_val: 0.708744575922776, std: 0.08798108567845109


# 4. Model Testing

In [4]:
import numpy as np



best_fold =1

model_path = f'classification/ImageNet/aug3/SVC_{best_fold}_model.sav'


# Fitting different train in the same hyperparameters

best_test, confusion_matrix_test = model_test_classification(train_val_X, train_val_Y, train_val_groups,test_X, test_Y, best_fold, model_path)



print(best_test)
print(f'test_acc_mean: {np.mean(best_test) :.3f}, std: {np.std(best_test) :.3f}')

print('5th confusion matrix: ', [list(item) for item in confusion_matrix_test[4]])


<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>
[0.5555555555555556, 0.6296296296296297, 0.6666666666666666, 0.7037037037037037, 0.6296296296296297, 0.6666666666666666, 0.6296296296296297, 0.6296296296296297, 0.5925925925925926, 0.7407407407407407]
test_acc_mean: 0.644, std: 0.050
5th confusion matrix:  [[1, 2, 0], [2, 8, 3], [0, 3, 8]]




In [5]:
cm_list = []
for cm in confusion_matrix_test:
    cm_list.append([list(item) for item in cm])
#print(cm_list)

print(len(cm_list))
print([list(item) for item in np.mean(cm_list, axis=0)])

10
[[0.9, 2.1, 0.0], [2.2, 8.5, 2.3], [0.3, 2.5, 8.2]]


In [6]:
best_fold =1
filename = f'Models/classification/ImageNet/aug3/SVC_{best_fold}_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.8,
 'decision_function_shape': 'ovr',
 'degree': 2,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': 90,
 'probability': False,
 'random_state': 0,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}