# 1.  Imports and Data

In [1]:
from sklearn.decomposition import PCA
import matplotlib as mpl 
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler


from codes.utils import stratified_train_test_group_kfold
from codes.utils import pred2class
from codes.utils import model_test_regression
from codes.regression_codes import rnr_gridsearch



# Unpickle the stored MoS2 records. pickle can execute arbitrary code while
# loading, so this file must come from a trusted source.
with open("Data/MoS2_Analysis_Data_trained2", "rb") as pickled_file:
    MoS2_Proj1_Class_Data = pickle.load(pickled_file)

df = pd.DataFrame(MoS2_Proj1_Class_Data)
print(df.keys())

# Pull the target ('T'), the feature vectors ('ImageNet') and the sample ids
# out of the frame as NumPy arrays.
T_target, features, sampleId = (
    np.array(list(df[column])) for column in ('T', 'ImageNet', 'sampleId')
)

# Stratified 90/10 hold-out of the sample ids.
# Note: train_val_Y and test_Y are reassigned later by the group k-fold split.
train_val_sampleId, test_sampleId, train_val_Y, test_Y = train_test_split(
    sampleId,
    T_target,
    test_size=0.1,
    stratify=T_target,
    random_state=40,
)

Index(['sampleId', 'sampleLabel', 'image', 'T', 'ImageNet', 'MicroNet'], dtype='object')


# 2. Data Splitting and Scaling

In [2]:


# Inputs for the split: feature matrix, target values and the per-row group
# labels (sample ids).
X = features
Y = np.array(T_target)
groups = np.array(sampleId)

# Use fold 0 of a 10-way split as the held-out test set; the remaining rows
# form the train/validation pool.
# (presumably stratified on Y and grouped by sample id, going by the helper's
# name -- confirm in codes.utils)
train_val_groups, train_val_X, train_val_Y, test_X, test_Y = stratified_train_test_group_kfold(
    X, Y, groups, n_splits=10, test_fold=0
)
train_val_Y = train_val_Y.flatten()
test_Y = test_Y.flatten()

print(train_val_X.shape)
print(train_val_Y.shape)
print(test_X.shape)
print(test_Y.shape)

# Seed the oversampler so the resampled training folds (and therefore every
# downstream score) are reproducible across kernel restarts.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=40)

# Standardise features with statistics from the train/validation pool only,
# then apply the same transform to the test set.
# NOTE(review): the scaler sees the validation folds used later in
# cross-validation; fit it per fold if strict fold isolation is required.
scaler = StandardScaler().fit(train_val_X)
train_val_X = scaler.transform(train_val_X)
test_X = scaler.transform(test_X)

<class 'generator'>
(235, 100)
(235,)
(27, 100)
(27,)


In [4]:
def pred2class(predicted, low_cut=925, high_cut=975, classes=(900, 950, 1000)):
    """Snap continuous predictions onto three discrete class labels.

    Note: defining this in the notebook shadows the ``pred2class`` imported
    from ``codes.utils`` at the top of the file.

    Parameters
    ----------
    predicted : array-like
        Predicted values. Any array-like is accepted (NumPy array, list,
        pandas Series, scalar); multi-dimensional input is flattened.
    low_cut, high_cut : float, optional
        Decision boundaries. Values ``<= low_cut`` map to the first class,
        values ``<= high_cut`` to the second, everything else to the third.
    classes : sequence of three labels, optional
        Labels for the low, middle and high bins.

    Returns
    -------
    list
        One class label per input value, in input order.

    Notes
    -----
    NaN fails both comparisons and therefore falls into the highest class,
    which is what the original implementation did through its ``else`` branch.
    """
    low_label, mid_label, high_label = classes
    values = np.asarray(predicted).ravel().tolist()

    pred_class = []
    for value in values:
        if value <= low_cut:
            pred_class.append(low_label)
        elif value <= high_cut:
            pred_class.append(mid_label)
        else:
            # Covers value > high_cut as well as NaN.
            pred_class.append(high_label)
    return pred_class



# 3. Running Cross-Validation

In [6]:
# Hyper-parameter grid passed to rnr_gridsearch.
Radius = list(range(3, 7))          # 3, 4, 5, 6
Weights = ['uniform', 'distance']
Algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
P = list(range(1, 6))               # 1 .. 5


def cross_10_folds_rnn(train_val_X, train_val_Y, groups=None, n_splits=10):
    """Run a grid search on every fold of a group k-fold split.

    For each fold the pool is split into training and validation parts, the
    training part is oversampled with the module-level ``oversample`` object,
    and ``rnr_gridsearch`` is called over the module-level ``Radius``,
    ``Weights``, ``Algorithm`` and ``P`` grids.

    Parameters
    ----------
    train_val_X : array-like
        Features of the train/validation pool.
    train_val_Y : array-like
        Targets of the train/validation pool.
    groups : array-like, optional
        Group label per row. Defaults to the module-level
        ``train_val_groups`` so existing two-argument calls keep working.
    n_splits : int, optional
        Number of folds to iterate over (default 10).

    Returns
    -------
    tuple of (list, list, list)
        Per-fold ``accuracy_train`` values, per-fold ``accuracy_val`` values,
        and the per-fold best hyper-parameters returned by ``rnr_gridsearch``.
    """
    if groups is None:
        groups = train_val_groups

    best_train = []
    best_val = []
    best_variables = []
    for fold in range(n_splits):
        # The first return value (the training groups) is not needed here.
        _, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(
            train_val_X, train_val_Y, groups, n_splits=n_splits, test_fold=fold
        )

        model_path = f'regression/ImageNet/aug3/RNR_{fold}_model.sav'

        # Oversample the training part only; the validation part keeps its
        # original class distribution.
        train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        variables_best, performance_best = rnr_gridsearch(
            train_X, train_Y, val_X, val_Y, Radius, Weights, Algorithm, P, fold, model_path
        )
        best_train.append(performance_best['accuracy_train'])
        best_val.append(performance_best['accuracy_val'])
        best_variables.append(variables_best)
        print(f'fold: {fold} done!')
    return best_train, best_val, best_variables


# Run the per-fold grid search, then report the per-fold scores and their
# mean / standard deviation.
cv_results = cross_10_folds_rnn(train_val_X, train_val_Y)
best_train, best_val, best_variables = cv_results
for fold_scores in (best_train, best_val):
    print(fold_scores)
print(f'train_acc_mean: {np.mean(best_train) :.3f}, std: {np.std(best_train) :.3f}, val_acc_mean: {np.mean(best_val):.3f}, std: {np.std(best_val):.3f}')

<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.375




best_train_acc: 0.9650793650793651, best_val_acc: 0.4166666666666667




best_train_acc: 0.8158730158730159, best_val_acc: 0.5416666666666666




best_train_acc: 0.7142857142857143, best_val_acc: 0.7083333333333334




best_train_acc: 0.4888888888888889, best_val_acc: 0.75




best_train_acc: 0.4984126984126984, best_val_acc: 0.7916666666666666




fold: 0 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.4166666666666667




best_train_acc: 0.9339622641509434, best_val_acc: 0.4583333333333333




best_train_acc: 0.7735849056603774, best_val_acc: 0.5416666666666666




best_train_acc: 0.6698113207547169, best_val_acc: 0.7083333333333334




best_train_acc: 0.610062893081761, best_val_acc: 0.75




fold: 1 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.4166666666666667




best_train_acc: 0.9622641509433962, best_val_acc: 0.4583333333333333




best_train_acc: 0.8584905660377359, best_val_acc: 0.6666666666666666




best_train_acc: 0.6320754716981132, best_val_acc: 0.7083333333333334




best_train_acc: 0.6698113207547169, best_val_acc: 0.75




fold: 2 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.375




best_train_acc: 0.9619047619047619, best_val_acc: 0.4583333333333333




best_train_acc: 0.8285714285714286, best_val_acc: 0.5




best_train_acc: 0.7365079365079366, best_val_acc: 0.7083333333333334




best_train_acc: 1.0, best_val_acc: 0.75




fold: 3 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.375




best_train_acc: 0.9619047619047619, best_val_acc: 0.5416666666666666




best_train_acc: 0.8444444444444444, best_val_acc: 0.7916666666666666




best_train_acc: 0.8412698412698413, best_val_acc: 0.8333333333333334




fold: 4 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.391304347826087




best_train_acc: 0.9619047619047619, best_val_acc: 0.5217391304347826




best_train_acc: 0.7238095238095238, best_val_acc: 0.5652173913043478




best_train_acc: 0.6063492063492063, best_val_acc: 0.7391304347826086




best_train_acc: 0.5142857142857142, best_val_acc: 0.782608695652174




fold: 5 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.391304347826087




best_train_acc: 0.9619047619047619, best_val_acc: 0.4782608695652174




best_train_acc: 0.7015873015873015, best_val_acc: 0.5217391304347826




best_train_acc: 0.7968253968253968, best_val_acc: 0.5652173913043478




best_train_acc: 0.47619047619047616, best_val_acc: 0.6086956521739131




fold: 6 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.391304347826087




best_train_acc: 0.9555555555555556, best_val_acc: 0.43478260869565216




best_train_acc: 0.8063492063492064, best_val_acc: 0.4782608695652174




best_train_acc: 0.7111111111111111, best_val_acc: 0.5652173913043478




fold: 7 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.391304347826087




best_train_acc: 0.9523809523809523, best_val_acc: 0.43478260869565216




best_train_acc: 0.7936507936507936, best_val_acc: 0.4782608695652174




best_train_acc: 0.692063492063492, best_val_acc: 0.6521739130434783




fold: 8 done!
<class 'generator'>




best_train_acc: 1.0, best_val_acc: 0.391304347826087




best_train_acc: 0.9591194968553459, best_val_acc: 0.4782608695652174




best_train_acc: 1.0, best_val_acc: 0.5217391304347826




best_train_acc: 0.6635220125786163, best_val_acc: 0.6956521739130435




fold: 9 done!
[0.4984126984126984, 0.610062893081761, 0.6698113207547169, 1.0, 0.8412698412698413, 0.5142857142857142, 0.47619047619047616, 0.7111111111111111, 0.692063492063492, 0.6635220125786163]
[0.7916666666666666, 0.75, 0.75, 0.75, 0.8333333333333334, 0.782608695652174, 0.6086956521739131, 0.5652173913043478, 0.6521739130434783, 0.6956521739130435]
train_acc_mean: 0.668, std: 0.153, val_acc_mean: 0.718, std: 0.081


# 4. Model Testing

In [5]:
best_fold = 4

# NOTE(review): the cross-validation cell writes models as
# 'regression/ImageNet/aug3/RNR_{fold}_model.sav', but this path uses the
# 'KRR_' prefix -- confirm that this is the intended model file.
model_path = f'regression/ImageNet/aug3/KRR_{best_fold}_model.sav'

# Evaluate on the held-out test set using the selected fold's model path.
best_test, root_mean_squared_error, confusion_matrix_test = model_test_regression(
    train_val_X, train_val_Y, train_val_groups, test_X, test_Y, best_fold, model_path
)

print('best_test: ', best_test)
print('rmse: ', root_mean_squared_error)

# np.mean / np.std on an empty sequence return nan and emit RuntimeWarnings;
# handle the empty case explicitly so the summary line prints cleanly.
if len(root_mean_squared_error) > 0:
    rmse_mean = np.mean(root_mean_squared_error)
    rmse_std = np.std(root_mean_squared_error)
else:
    rmse_mean = rmse_std = float('nan')
print(f'test_acc_mean: {np.mean(best_test) :.3f}, std: {np.std(best_test) :.3f}, rmse_mean: {rmse_mean}, rmse_std: {rmse_std}')

print("......")

# Show the fifth confusion matrix (index 4) as plain nested lists.
print('5th confusion matrix: ', [list(item) for item in confusion_matrix_test[4]])

<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>
<class 'generator'>
best_test:  [0.6296296296296297, 0.6666666666666666, 0.6666666666666666, 0.6296296296296297, 0.5925925925925926, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6296296296296297]
rmse:  []
test_acc_mean: 0.648, std: 0.025, rmse_mean: nan, rmse_std: nan
......
5th confusion matrix:  [[1, 1, 1], [2, 6, 5], [0, 2, 9]]


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [3]:
best_fold = 4
PATH = f'Model/Reg/TrainedImageNet/Up/RNN_{best_fold}_model.sav'

# Open the file in a context manager so the handle is closed after loading.
# pickle can execute arbitrary code while loading: only load trusted files.
with open(PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Display the hyper-parameters of the stored estimator.
loaded_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'p': 3,
 'radius': 4,
 'weights': 'uniform'}