In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer

In [3]:
# Load the worksheet and keep only the rows flagged ESCLUDERE == 0 whose
# 'COD ISTO' is not 6, restricted to the four predictors plus the target.
# NOTE(review): absolute local path -- consider a configurable data directory.
raw_df = pd.read_excel('/home/sam/tirocinio/DB PAROTIDE DEF_corretto.xlsx', sheet_name='Sheet7')

keep_rows = (raw_df['ESCLUDERE'] == 0) & (raw_df['COD ISTO'] != 6)
kept_columns = ['Segni macro malignità', 'ADC', 'TIC type', 'T2', 'COD ISTO']

# A single .loc selection followed by reset_index(drop=True) yields a fresh
# frame with a clean 0..n-1 index: no leftover 'index'/'level_0' columns and
# no SettingWithCopyWarning when 'ADC' is overwritten below.
df = raw_df.loc[keep_rows, kept_columns].reset_index(drop=True)

# 'ADC' may contain decimal commas; normalise them to dots before casting to float.
df['ADC'] = df['ADC'].astype(str).str.replace(',', '.', regex=False).astype(float)
df.head(10)

Unnamed: 0,Segni macro malignità,ADC,TIC type,T2,COD ISTO
0,0,1.6,A,1,3
1,0,1.0,B,0,4
2,0,1.2,C,0,3
3,0,0.77,B,1,4
4,0,0.8,C,1,1
5,1,0.7,C,1,1
6,0,0.9,B,1,4
7,0,0.7,B,1,4
8,0,0.8,B,1,4
9,0,1.4,A,0,3


In [4]:
# Show the dtype and non-null count of every column of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Segni macro malignità  103 non-null    int64  
 1   ADC                    103 non-null    float64
 2   TIC type               103 non-null    object 
 3   T2                     103 non-null    int64  
 4   COD ISTO               103 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.1+ KB


In [5]:
# Number of rows per 'COD ISTO' value (the column used as target `y` below).
df['COD ISTO'].value_counts()

3    53
4    27
1    12
5     6
2     5
Name: COD ISTO, dtype: int64

In [6]:
# Target vector: the 'COD ISTO' column, copied so later edits cannot touch df.
y = df.loc[:, 'COD ISTO'].copy()

# Predictors before encoding; 'TIC type' is still a string column here.
X_not_encoded = df.loc[:, ['Segni macro malignità', 'ADC', 'TIC type', 'T2']].copy()

# One-hot encode 'TIC type' into one indicator column per category.
X = pd.get_dummies(data=X_not_encoded, columns=['TIC type'])

In [31]:
# Preview the first rows of the feature matrix after one-hot encoding 'TIC type'.
X.head(10)

Unnamed: 0,Segni macro malignità,ADC,T2,TIC type_A,TIC type_B,TIC type_C
0,0,1.6,1,1,0,0
1,0,1.0,0,0,1,0
2,0,1.2,0,0,0,1
3,0,0.77,1,0,1,0
4,0,0.8,1,0,0,1
5,1,0.7,1,0,0,1
6,0,0.9,1,0,1,0
7,0,0.7,1,0,1,0
8,0,0.8,1,0,1,0
9,0,1.4,0,1,0,0


## Funzione dell'esperimento

In [66]:
def _positive_class_proba(clf, features):
    """Return P(class == 1) for every row of ``features`` from a fitted binary tree.

    ``predict_proba`` has one column per class seen during ``fit``. When the
    tree was fitted on a single class, ``clf.classes_`` tells which one it was:
    the result is all zeros if only class 0 was seen, and the single
    probability column (all ones) if only class 1 was seen.
    """
    proba = clf.predict_proba(features)
    positive_columns = np.flatnonzero(clf.classes_ == 1)
    if len(positive_columns) == 0:
        return np.zeros(len(proba))
    return proba[:, positive_columns[0]]


def _confident_assignments(probs, tree_labels, relaxed):
    """Pick, per row, the label of the most confident tree and flag confident rows.

    ``probs`` has one row per sample and one column per tree in ``tree_labels``.
    A row is confident when its highest probability is >= 0.8, or > 0.5 when
    ``relaxed`` is true. Returns ``(confident_mask, best_labels)``.
    """
    best_prob = probs.max(axis=1)
    confident = best_prob >= 0.8
    if relaxed:
        confident = confident | (best_prob > 0.5)
    best_labels = np.asarray(tree_labels)[probs.argmax(axis=1)]
    return confident, best_labels


def esperimento(X,y, test_size = 0.25, ordered_labels = (1,2,4,3,5),
                random_state_tree = None):
    """Label samples with a growing cascade of one-vs-all depth-1 decision trees.

    The data is split once (stratified on ``y``). At iteration ``k`` (0-based)
    one depth-1 tree is fitted per label in ``ordered_labels[:k + 1]`` on the
    still-unlabelled training rows. Every training or test row whose highest
    positive-class probability is >= 0.8 (or > 0.5 from the fifth iteration on)
    receives the label of the corresponding tree and is removed from further
    iterations. Progress and the final classification reports are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, row-aligned with ``y``.
    y : pandas.Series
        Target labels, row-aligned with ``X``.
    test_size : float
        Fraction forwarded to ``train_test_split``.
    ordered_labels : sequence
        Labels in the order in which their trees are added to the cascade.
    random_state_tree : int or None
        Seed used for both the train/test split and every tree.

    Returns
    -------
    None
        Results are printed. Rows that never reach the confidence threshold
        keep the placeholder prediction 0.

    Raises
    ------
    ValueError
        If ``ordered_labels`` is empty.
    """
    ordered_labels = tuple(ordered_labels)
    if not ordered_labels:
        raise ValueError('ordered_labels must contain at least one label')

    # Work on positionally indexed copies so that index labels double as
    # positions into y_pred, whatever index the caller's objects carry.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size = test_size,
                                                   shuffle = True,
                                                   random_state = random_state_tree,
                                                   stratify = y)

    # Remember which rows were held out before the loop starts removing them.
    held_out_positions = X_test.index.to_numpy()

    # Array holding the produced labels (0 = not assigned yet).
    y_pred = np.zeros(len(X))

    n_it = 0
    while (len(X_train) > 0 and len(X_test) > 0):

        print('### ITERATION {} ###\n'.format(n_it+1))

        labels_trees_to_train = list(ordered_labels[:n_it + 1])

        print('Trees trained in current iteration: \n')
        print(labels_trees_to_train)
        print()
        print()

        # Train one "label vs all" tree per active label.
        probs_train = []
        probs_test = []
        for label in labels_trees_to_train:
            clf = tree.DecisionTreeClassifier(random_state = random_state_tree, max_depth = 1)
            bin_y_train = (y_train == label).astype(int)
            clf.fit(X_train, bin_y_train)
            print('Tree {} vs All\n'.format(label))
            print('Feature importances: \n')
            print(pd.Series(clf.feature_importances_, index = X.columns))
            print()
            print('Treshold:')
            print(clf.tree_.threshold[0])
            print()

            probs_train.append(_positive_class_proba(clf, X_train))
            probs_test.append(_positive_class_proba(clf, X_test))

        # Shape (n_samples, n_trees): one column per active tree.
        probs_train = np.column_stack(probs_train)
        probs_test = np.column_stack(probs_test)

        # From the fifth iteration on, a bare majority is accepted.
        relaxed = n_it >= 4

        # Assign labels to the confident training and test rows.
        train_hit, train_labels = _confident_assignments(probs_train, labels_trees_to_train, relaxed)
        test_hit, test_labels = _confident_assignments(probs_test, labels_trees_to_train, relaxed)
        y_pred[X_train.index.to_numpy()[train_hit]] = train_labels[train_hit]
        y_pred[X_test.index.to_numpy()[test_hit]] = test_labels[test_hit]

        # Remove the rows that have just been labelled.
        X_train = X_train.loc[~train_hit]
        y_train = y_train.loc[~train_hit]
        X_test = X_test.loc[~test_hit]

        print('Labels assigned:\n')
        print(pd.Series(y_pred).value_counts())
        print()
        print('Remaining training set: {}\n'.format(len(X_train)))

        print('Remaining test set: {}\n'.format(len(X_test)))

        # Once every tree is active and the relaxed threshold applies, the next
        # iteration runs in the same configuration on the same rows. If nothing
        # was assigned now, stop instead of looping forever.
        all_trees_active = n_it >= len(ordered_labels) - 1
        if relaxed and all_trees_active and not (train_hit.any() or test_hit.any()):
            print('No remaining sample reaches the confidence threshold; stopping.\n')
            break

        n_it += 1

    # This report pools rows the trees were fitted on with held-out rows.
    print('### Performance evaluation ###\n')
    print(classification_report(y, y_pred, zero_division = 0))

    # Same metrics restricted to the rows that were never used for fitting.
    print('### Performance evaluation (held-out test rows only) ###\n')
    print(classification_report(y.iloc[held_out_positions],
                                y_pred[held_out_positions],
                                zero_division = 0))

In [67]:
# Fixed seed; esperimento forwards it to both train_test_split and every tree.
seed = 42

esperimento(X, y, random_state_tree = seed)

### ITERATION 1 ###

Trees trained in current iteration: 

[1]


Tree 1 vs All

Feature importances: 

Segni macro malignità    1.0
ADC                      0.0
T2                       0.0
TIC type_A               0.0
TIC type_B               0.0
TIC type_C               0.0
dtype: float64

Treshold:
0.5

Labels assigned:

0.0    95
1.0     8
dtype: int64

Remaining training set: 70

Remaining test set: 25

### ITERATION 2 ###

Trees trained in current iteration: 

[1, 2]


Tree 1 vs All

Feature importances: 

Segni macro malignità    0.0
ADC                      0.0
T2                       0.0
TIC type_A               0.0
TIC type_B               0.0
TIC type_C               1.0
dtype: float64

Treshold:
0.5

Tree 2 vs All

Feature importances: 

Segni macro malignità    0.0
ADC                      1.0
T2                       0.0
TIC type_A               0.0
TIC type_B               0.0
TIC type_C               0.0
dtype: float64

Treshold:
0.5999999940395355

Labels assigned:

0