# 1. Imports

In [1]:
from sklearn.decomposition import PCA
import matplotlib as mpl 
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler


from codes.utils import stratified_train_test_group_kfold
from codes.utils import model_test_classification

from codes.classification_codes import dtc_gridsearch



# 2. Data

In [2]:
with open("Data/MoS2_Analysis_Data_trained2", "rb") as fp:   # Unpickling
    MoS2_Proj1_Class_Data = pickle.load(fp)

#features = MoS2_ImageNet_100_data[0]
df = pd.DataFrame(MoS2_Proj1_Class_Data)
print(df.keys())
T_dict= {900:0, 950:1, 1000:2}
T_target = np.array([T_dict[item] for item in df['T']])

features = np.array(list(df['ImageNet']))
sampleId = np.array(list(df['sampleId']))

X = features
Y = np.array(T_target)
groups = np.array(sampleId)


train_val_groups, train_val_X, train_val_Y, test_X, test_Y = stratified_train_test_group_kfold(X, Y, groups, n_splits=10, test_fold=0)

train_val_Y = train_val_Y.flatten()
test_Y = test_Y.flatten()

print(train_val_X.shape)
print(train_val_Y.shape)
print(test_X.shape)
print(test_Y.shape)


oversample = RandomOverSampler(sampling_strategy='not majority')

scaler = StandardScaler().fit(train_val_X)
train_val_X = scaler.transform(train_val_X)
test_X = scaler.transform(test_X)

Index(['sampleId', 'sampleLabel', 'image', 'T', 'ImageNet', 'MicroNet'], dtype='object')


# 3. Runing Cross-Validation

In [5]:

Criterion = ['gini', 'entropy', 'log_loss']
Min_samples_split = [i*0.1 for i in range(1, 10, 1)]
Max_features = ['auto', 'sqrt', 'log2', None]

def cross_10_folds_gpc(train_val_X, train_val_Y):
    
    best_train = []
    best_val = []
    best_variables = []
    for fold in range(10):
        #39, 42, 34 is best
        group, train_X, train_Y, val_X, val_Y = stratified_train_test_group_kfold(train_val_X, train_val_Y, train_val_groups, n_splits=10, test_fold=fold)
        
        model_path = f'classification/ImageNet/aug3/DTC_{fold}_model.sav'
        
        train_X, train_Y = oversample.fit_resample(train_X, train_Y)
        variables_best, performance_best = dtc_gridsearch(train_X, train_Y, val_X, val_Y, Criterion, Min_samples_split, Max_features, fold, model_path)
        best_train.append(performance_best['accuracy_train'])
        best_val.append(performance_best['accuracy_val'])
        best_variables.append(variables_best)
        print(f'fold: {fold} done!')
    return best_train, best_val, best_variables


best_train, best_val, best_variables = cross_10_folds_gpc(train_val_X, train_val_Y)
print(best_train)
print(best_val)
print(f'train_acc_mean: {np.mean(best_train) :.3f}, std: {np.std(best_train) :.3f}, val_acc_mean: {np.mean(best_val):.3f}, std: {np.std(best_val):.3f}')

<class 'generator'>




best_train_acc: 0.8634920634920635, best_val_acc: 0.5833333333333334
best_train_acc: 0.8539682539682539, best_val_acc: 0.75
best_train_acc: 0.5904761904761905, best_val_acc: 0.7916666666666666




fold: 0 done!
<class 'generator'>




best_train_acc: 0.8647798742138365, best_val_acc: 0.4583333333333333
best_train_acc: 0.8616352201257862, best_val_acc: 0.5
best_train_acc: 0.8301886792452831, best_val_acc: 0.5833333333333334
best_train_acc: 0.6792452830188679, best_val_acc: 0.7083333333333334
best_train_acc: 0.8459119496855346, best_val_acc: 0.7916666666666666




fold: 1 done!
<class 'generator'>




best_train_acc: 0.7955974842767296, best_val_acc: 0.5833333333333334
best_train_acc: 0.8805031446540881, best_val_acc: 0.625
best_train_acc: 0.6792452830188679, best_val_acc: 0.6666666666666666
best_train_acc: 0.6918238993710691, best_val_acc: 0.7083333333333334




best_train_acc: 0.6761006289308176, best_val_acc: 0.75
best_train_acc: 0.7987421383647799, best_val_acc: 0.7916666666666666




fold: 2 done!
<class 'generator'>




best_train_acc: 0.8761904761904762, best_val_acc: 0.75




best_train_acc: 0.8222222222222222, best_val_acc: 0.7916666666666666




fold: 3 done!
<class 'generator'>




best_train_acc: 0.8793650793650793, best_val_acc: 0.6666666666666666
best_train_acc: 0.9365079365079365, best_val_acc: 0.7083333333333334
best_train_acc: 0.6666666666666666, best_val_acc: 0.75




best_train_acc: 0.6412698412698413, best_val_acc: 0.7916666666666666
best_train_acc: 0.6507936507936508, best_val_acc: 0.8333333333333334




fold: 4 done!
<class 'generator'>




best_train_acc: 0.8698412698412699, best_val_acc: 0.6521739130434783
best_train_acc: 0.8539682539682539, best_val_acc: 0.7391304347826086




best_train_acc: 0.8603174603174604, best_val_acc: 0.782608695652174




fold: 5 done!
<class 'generator'>




best_train_acc: 0.8031746031746032, best_val_acc: 0.43478260869565216
best_train_acc: 0.8158730158730159, best_val_acc: 0.4782608695652174
best_train_acc: 0.8571428571428571, best_val_acc: 0.5217391304347826
best_train_acc: 0.7777777777777778, best_val_acc: 0.5652173913043478
best_train_acc: 0.7682539682539683, best_val_acc: 0.6086956521739131




best_train_acc: 0.726984126984127, best_val_acc: 0.6521739130434783
best_train_acc: 0.8253968253968254, best_val_acc: 0.7391304347826086




fold: 6 done!
<class 'generator'>




best_train_acc: 0.8793650793650793, best_val_acc: 0.4782608695652174
best_train_acc: 0.8571428571428571, best_val_acc: 0.5652173913043478
best_train_acc: 0.7873015873015873, best_val_acc: 0.6956521739130435
best_train_acc: 0.6603174603174603, best_val_acc: 0.7391304347826086




fold: 7 done!
<class 'generator'>




best_train_acc: 0.8793650793650793, best_val_acc: 0.30434782608695654
best_train_acc: 0.7968253968253968, best_val_acc: 0.4782608695652174
best_train_acc: 0.8095238095238095, best_val_acc: 0.5652173913043478
best_train_acc: 0.7206349206349206, best_val_acc: 0.6086956521739131
best_train_acc: 0.6888888888888889, best_val_acc: 0.6956521739130435




fold: 8 done!
<class 'generator'>




best_train_acc: 0.8427672955974843, best_val_acc: 0.5652173913043478
best_train_acc: 0.8553459119496856, best_val_acc: 0.6956521739130435
best_train_acc: 0.9245283018867925, best_val_acc: 0.7391304347826086
best_train_acc: 0.7955974842767296, best_val_acc: 0.8260869565217391




fold: 9 done!
[0.5904761904761905, 0.8459119496855346, 0.7987421383647799, 0.8222222222222222, 0.6507936507936508, 0.8603174603174604, 0.8253968253968254, 0.6603174603174603, 0.6888888888888889, 0.7955974842767296]
[0.7916666666666666, 0.7916666666666666, 0.7916666666666666, 0.7916666666666666, 0.8333333333333334, 0.782608695652174, 0.7391304347826086, 0.7391304347826086, 0.6956521739130435, 0.8260869565217391]
train_acc_mean: 0.754, std: 0.091, val_acc_mean: 0.778, std: 0.040




# 4. Model Testing

In [6]:
best_fold = 1

model_path = f'classification/ImageNet/aug3/DTC_{best_fold}_model.sav'
# Fitting different train in the same hyperparameters
best_test, confusion_matrix_test = model_test_classification(train_val_X, train_val_Y, train_val_groups,test_X, test_Y, best_fold, model_path)



print(best_test)
print(f'test_acc_mean: {np.mean(best_test) :.3f}, std: {np.std(best_test) :.3f}')

print('5th confusion matrix: ', [list(item) for item in confusion_matrix_test[4]])

<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>




<class 'generator'>
<class 'generator'>
[0.5925925925925926, 0.6666666666666666, 0.48148148148148145, 0.6296296296296297, 0.7037037037037037, 0.5555555555555556, 0.5555555555555556, 0.4444444444444444, 0.5555555555555556, 0.5185185185185185]
test_acc_mean: 0.570, std: 0.076
5th confusion matrix:  [[1, 2, 0], [0, 9, 4], [0, 2, 9]]




In [7]:
cm_list = []
for cm in confusion_matrix_test:
    cm_list.append([list(item) for item in cm])
#print(cm_list)

print(len(cm_list))
print([list(item) for item in np.mean(cm_list, axis=0)])

10
[[0.7, 2.1, 0.2], [2.1, 7.6, 3.3], [0.6, 3.3, 7.1]]


In [3]:
best_fold = 1
PATH = f'Model/Class/TrainedImageNet/Up/DTC_{best_fold}_model.sav'

#PATH = os.path.join('Models', model_path)
loaded_model = pickle.load(open(PATH, 'rb'))
loaded_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 0.1,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}