# 1.  Imports and Data

In [1]:
from sklearn.decomposition import PCA
import matplotlib as mpl 
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


from codes.utils import stratified_train_test_group_kfold
from codes.utils import pred2class
from codes.utils import model_test_regression
from codes.regression_codes import svr_gridsearch



# Read the pickled MoS2 analysis data from disk.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
with open("Data/MoS2_Analysis_Data_trained2", "rb") as pickle_file:
    MoS2_Proj1_Class_Data = pickle.load(pickle_file)

# Build a DataFrame from the unpickled object and list its columns.
df = pd.DataFrame(MoS2_Proj1_Class_Data)
print(df.keys())

# Extract the target, feature and sample-id columns as NumPy arrays.
T_target, features, sampleId = (
    np.array(list(df[column])) for column in ('T', 'ImageNet', 'sampleId')
)

# 90/10 split of the sample ids, stratified on the target, with a fixed seed.
# NOTE(review): train_val_Y and test_Y are reassigned by the grouped split in
# the next cell — confirm whether this split is still needed.
train_val_sampleId, test_sampleId, train_val_Y, test_Y = train_test_split(
    sampleId,
    T_target,
    test_size=0.1,
    stratify=T_target,
    random_state=40,
)

Index(['sampleId', 'sampleLabel', 'image', 'T', 'ImageNet', 'MicroNet'], dtype='object')


# 2. Data Splitting and Scaling

In [2]:


# Inputs for the grouped split: features, targets and one group label per row.
X = features
Y = np.array(T_target)
groups = np.array(sampleId)

# Hold out fold 0 of a 10-way split as the test set; the remaining rows form
# the train/validation pool. The helper is expected to stratify on Y while
# keeping each group on one side of the split — confirm in codes.utils.
train_val_groups, train_val_X, train_val_Y, test_X, test_Y = stratified_train_test_group_kfold(
    X, Y, groups, n_splits=10, test_fold=0
)

# Targets are used as flat 1-D vectors from here on.
train_val_Y = train_val_Y.flatten()
test_Y = test_Y.flatten()

print(train_val_X.shape)
print(train_val_Y.shape)
print(test_X.shape)
print(test_Y.shape)

# Seeded so the randomly duplicated minority rows are identical on every run.
# NOTE(review): the cross-validation cell re-creates `oversample`; keep the
# two definitions in sync.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=40)

# Fit the scaler on the train/validation pool only, then apply the same
# transform to the held-out test features.
scaler = StandardScaler().fit(train_val_X)
train_val_X = scaler.transform(train_val_X)
test_X = scaler.transform(test_X)

<class 'generator'>
(235, 100)
(235,)
(27, 100)
(27,)


# 3. Running Cross-Validation

In [3]:
# Oversampler applied to each training fold; 'not majority' resamples every
# class except the largest one. Seeded so the randomly duplicated rows — and
# therefore the cross-validation scores — are reproducible across runs.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=40)

# Hyperparameter grids handed to the grid search.
# Only the linear kernel is searched; 'poly', 'rbf', 'sigmoid' and
# 'precomputed' are currently switched off.
Kernel = ['linear']
Degree = list(range(1, 11))              # 1 .. 10
Gamma = ['scale', 'auto']
Coef0 = [*np.arange(0, 0.5, 0.05)]       # 0.0 up to (not including) 0.5
Ce = [*np.arange(0.1, 1.1, 0.1)]         # 0.1 up to (not including) 1.1
Epsilon = [*np.arange(0.1, 1.1, 0.1)]    # same grid as Ce
Shrinking = [False, True]


def cross_10_folds_svc(train_val_X, train_val_Y, groups=None, n_splits=10):
    """Run the grid search once per cross-validation fold and collect the results.

    For each fold the data are split with ``stratified_train_test_group_kfold``,
    the training part is resampled with the module-level ``oversample`` object,
    and ``svr_gridsearch`` is called with the module-level grids ``Kernel``,
    ``Degree``, ``Gamma``, ``Coef0``, ``Ce``, ``Epsilon`` and ``Shrinking``.

    Parameters
    ----------
    train_val_X : sequence or array
        Features of the train/validation pool, one row per observation.
    train_val_Y : sequence or array
        Targets aligned with ``train_val_X``.
    groups : sequence or array, optional
        One group label per row of ``train_val_X``. When omitted, the
        module-level ``train_val_groups`` is used (the original behaviour).
    n_splits : int, default 10
        Number of folds; each fold is used once as the validation part.

    Returns
    -------
    tuple of (list, list, list)
        ``best_train`` and ``best_val`` hold ``performance_best['accuracy_train']``
        and ``performance_best['accuracy_val']`` for each fold, and
        ``best_variables`` holds the ``variables_best`` object returned by
        ``svr_gridsearch`` for each fold.

    Raises
    ------
    ValueError
        If ``groups`` and ``train_val_X`` have different lengths.
    """
    if groups is None:
        # Backward-compatible default: group labels produced by the split cell.
        groups = train_val_groups
    if len(groups) != len(train_val_X):
        # Misaligned labels would otherwise split on the wrong rows.
        raise ValueError(
            f'groups has {len(groups)} entries but train_val_X has {len(train_val_X)} rows'
        )

    best_train, best_val, best_variables = [], [], []
    for fold in range(n_splits):
        # Fold `fold` is held out for validation; the first return value
        # (group labels of the remaining rows) is not needed here.
        _, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(
            train_val_X, train_val_Y, groups, n_splits=n_splits, test_fold=fold
        )

        # Passed to svr_gridsearch; presumably where the fold's best model is
        # saved — confirm in codes.regression_codes.
        model_path = f'regression/ImageNet/aug3/SVR_{fold}_model.sav'

        # Resample the training part only; the validation part stays untouched.
        train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        variables_best, performance_best = svr_gridsearch(
            train_X, train_Y, val_X, val_Y,
            Kernel, Degree, Gamma, Coef0, Ce, Epsilon, Shrinking,
            fold, model_path,
        )

        best_train.append(performance_best['accuracy_train'])
        best_val.append(performance_best['accuracy_val'])
        best_variables.append(variables_best)
        print(f'fold: {fold} done!')
    return best_train, best_val, best_variables

# Run the cross-validation and keep the per-fold results.
best_train, best_val, best_variables = cross_10_folds_svc(train_val_X, train_val_Y)

# Per-fold scores first, then their mean and standard deviation.
print(best_train, best_val, sep='\n')
print(
    f'train_acc_mean: {np.mean(best_train) :.3f}, std: {np.std(best_train) :.3f}, '
    f'val_acc_mean: {np.mean(best_val):.3f}, std: {np.std(best_val):.3f}'
)

<class 'generator'>
best_train_acc: 0.6888888888888889, best_val_acc: 0.6666666666666666
best_train_acc: 0.765079365079365, best_val_acc: 0.7083333333333334
fold: 0 done!
<class 'generator'>
best_train_acc: 0.660377358490566, best_val_acc: 0.625
best_train_acc: 0.7484276729559748, best_val_acc: 0.6666666666666666
best_train_acc: 0.7452830188679245, best_val_acc: 0.7083333333333334
fold: 1 done!
<class 'generator'>
best_train_acc: 0.6540880503144654, best_val_acc: 0.6666666666666666
fold: 2 done!
<class 'generator'>
best_train_acc: 0.653968253968254, best_val_acc: 0.8333333333333334
fold: 3 done!
<class 'generator'>
best_train_acc: 0.6349206349206349, best_val_acc: 0.7916666666666666
fold: 4 done!
<class 'generator'>
best_train_acc: 0.653968253968254, best_val_acc: 0.8695652173913043
fold: 5 done!
<class 'generator'>
best_train_acc: 0.653968253968254, best_val_acc: 0.6086956521739131
fold: 6 done!
<class 'generator'>
best_train_acc: 0.6793650793650794, best_val_acc: 0.6086956521739131
b

# 4. Model Testing

In [4]:
# Fold whose saved model path is passed to the test helper.
best_fold = 0
model_path = f'regression/ImageNet/aug3/SVR_{best_fold}_model.sav'

# Evaluate on the held-out test data. Original note: the helper refits on
# different training subsets with the same hyperparameters — its behaviour
# lives in codes.utils.model_test_regression and is not visible here.
best_test, root_mean_squared_error, confusion_matrix_test = model_test_regression(
    train_val_X, train_val_Y, train_val_groups,
    test_X, test_Y,
    best_fold, model_path,
)

print('best_test: ', best_test)
print('rmse: ', root_mean_squared_error)
print(
    f'test_acc_mean: {np.mean(best_test) :.3f}, std: {np.std(best_test) :.3f}, '
    f'rmse_mean: {np.mean(root_mean_squared_error)}, rmse_std: {np.std(root_mean_squared_error)}'
)

print("......")

# Show the confusion matrix at index 4 (the fifth one) as plain nested lists.
print('5th confusion matrix: ', list(map(list, confusion_matrix_test[4])))

<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
<class 'generator'>
best_test:  [0.5185185185185185, 0.5555555555555556, 0.48148148148148145, 0.48148148148148145, 0.48148148148148145, 0.5555555555555556, 0.6666666666666666, 0.5555555555555556, 0.5185185185185185, 0.48148148148148145]
rmse:  [28.719602685512527, 32.204876041117444, 29.012500521160845, 32.9964765405641, 33.41696481398423, 27.876175317115788, 26.733961553065598, 31.473341634041777, 30.8989021016559, 28.931445779424433]
test_acc_mean: 0.530, std: 0.055, rmse_mean: 30.226424698764266, rmse_std: 2.166012389580321
......
5th confusion matrix:  [[1, 2, 0], [3, 8, 2], [0, 7, 4]]


In [5]:
# Fold whose saved model is inspected.
best_fold = 0
# Same location as 'Models/' joined with the relative model path used for
# this fold in the earlier cells.
PATH = f'Models/regression/ImageNet/aug3/SVR_{best_fold}_model.sav'

# Open the file in a context manager so the handle is closed after loading
# (the bare open() call previously left it open).
# NOTE(review): unpickling can execute arbitrary code — only load model files
# produced by this project.
with open(PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Last expression of the cell: display the stored hyperparameters.
loaded_model.get_params()

{'C': 0.5,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 1,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'shrinking': False,
 'tol': 0.001,
 'verbose': False}