In [None]:
from google.colab import drive
drive.mount('/content/drive')
import zipfile, os, urllib.request, glob, math
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight

# Seed every RNG this notebook draws from: `random` drives the imputation
# helpers, while pandas `.sample()` and scikit-learn estimators created without
# a `random_state` fall back to NumPy's global generator.
random.seed(9)
np.random.seed(9)

Mounted at /content/drive


In [None]:
# Read the processed metadata table (path is relative to the working directory).
METADATA_CSV = 'drive/MyDrive/metadata_processed.csv'
metadata = pd.read_csv(METADATA_CSV)
metadata.head()

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,UNK,UNK,UNK,UNK,8,UNK,UNK,UNK,...,-5.0,3,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,1,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,UNK,UNK,UNK,UNK,77,UNK,UNK,UNK,...,-5.0,0,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,UNK,UNK,UNK,UNK,75,UNK,UNK,UNK,...,-5.0,0,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,1,True,True,False,False,True,True,PAT_684_1302_588.png,True


Προσδιορισμός του τρόπου με τον οποίο θα γίνει η επεξεργασία και η ανάλυση των μεταδεδομένων.

In [None]:
# --- Missing-value strategy -------------------------------------------------
# Set the desired flags here; the block below keeps exactly one of them active.
PREDICT_MISSING_VALUES = False  # Fill unknown values with predictions based on the remaining valid values.
RANDOM_IMPUTATION = True
ZERO_ENCODING = False

# Priority: RANDOM_IMPUTATION, then PREDICT_MISSING_VALUES, otherwise ZERO_ENCODING.
if RANDOM_IMPUTATION == True:
  PREDICT_MISSING_VALUES, ZERO_ENCODING = False, False
elif PREDICT_MISSING_VALUES == True:
  RANDOM_IMPUTATION, ZERO_ENCODING = False, False
else:
  PREDICT_MISSING_VALUES, RANDOM_IMPUTATION, ZERO_ENCODING = False, False, True

# --- Train / test split strategy --------------------------------------------
LESION_SPLIT = True    # All images of a lesion go to the same fold, so no information leaks between the train / test split.
PATIENT_SPLIT = False  # All images of a patient go to the same fold, so no information leaks between the train / test split.
RANDOM_SPLIT = False   # Images are split across the folds without grouping.

# Priority: LESION_SPLIT, then PATIENT_SPLIT, otherwise RANDOM_SPLIT.
if LESION_SPLIT == True:
  PATIENT_SPLIT, RANDOM_SPLIT = False, False
elif PATIENT_SPLIT == True:
  LESION_SPLIT, RANDOM_SPLIT = False, False
else:
  LESION_SPLIT, PATIENT_SPLIT, RANDOM_SPLIT = False, False, True

Πειραματισμός με διαφορετικά πεδία για είσοδο κάθε φορά.

In [None]:
# Candidate exclusion lists tried during experimentation; keep exactly one active.
#exclude_features = []
exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink', 'has_piped_water', 'has_sewage_system']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink', 'has_piped_water', 'has_sewage_system', 'skin_cancer_history', 'cancer_history']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink', 'has_piped_water', 'has_sewage_system', 'gender', 'skin_cancer_history', 'cancer_history']

# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so this cell can be re-run without a KeyError from a previous run.
metadata = metadata.drop(columns = exclude_features, errors = 'ignore')

# Identifier / bookkeeping columns that must never be used as model inputs.
non_features = ['patient_id','img_id', 'lesion_id','biopsed']
target = ['diagnostic']

# Numerical candidates that were not excluded above.
numerical_vars = [col for col in ['age','diameter_1','diameter_2'] if col not in exclude_features]

# Every remaining column is treated as categorical.
categorical_vars = sorted(set(metadata.columns) - set(numerical_vars) - set(non_features) - set(target))

print("Numerical variables: ",numerical_vars)
print("Categorical variables: ",categorical_vars)

total_features = categorical_vars + numerical_vars

Numerical variables:  ['age']
Categorical variables:  ['bleed', 'cancer_history', 'changed', 'drink', 'elevation', 'fitspatrick', 'gender', 'grew', 'has_piped_water', 'has_sewage_system', 'hurt', 'itch', 'pesticide', 'region', 'skin_cancer_history', 'smoke']


In [None]:
# Build the table on which folds are assigned: one row per lesion, one row per
# patient, or every image, depending on the active split strategy.
# `.sample(frac = 1)` shuffles the rows; without a random_state it draws from
# NumPy's global RNG.
if LESION_SPLIT:
  metadata_wdp = metadata.drop_duplicates(subset = ['lesion_id']).copy(deep = True)
  metadata_wdp = metadata_wdp.sample(frac = 1).reset_index(drop = True)
elif PATIENT_SPLIT:
  metadata_wdp = metadata.drop_duplicates(subset = ['patient_id']).copy(deep = True)
  metadata_wdp = metadata_wdp.sample(frac = 1).reset_index(drop = True)
else:
  metadata_wdp = metadata.copy(deep = True)

# Stratify on the diagnostic label and store, per fold, a 0/1 column that marks
# the rows belonging to that fold's test set.
skf = StratifiedKFold(n_splits=5)
for i, (_, test_idx) in enumerate(skf.split(metadata_wdp['img_id'].tolist(), metadata_wdp['diagnostic'].tolist())):

  print("Fold %d , length of test set: %d " %(i,len(test_idx)))

  # test_idx holds positional row indices, so flag them directly instead of
  # checking every row for membership (which was quadratic in the row count).
  fold_flags = np.zeros(len(metadata_wdp), dtype = int)
  fold_flags[test_idx] = 1

  metadata_wdp['Fold_' + str(i)] = fold_flags

metadata_wdp.head()

Fold 0 , length of test set: 329 
Fold 1 , length of test set: 328 
Fold 2 , length of test set: 328 
Fold 3 , length of test set: 328 
Fold 4 , length of test set: 328 


Unnamed: 0,patient_id,lesion_id,smoke,drink,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,changed,bleed,elevation,img_id,biopsed,Fold_0,Fold_1,Fold_2,Fold_3,Fold_4
0,PAT_637,1434,True,True,69,False,MALE,False,True,True,...,False,False,True,PAT_637_1434_570.png,True,1,0,0,0,0
1,PAT_2065,4413,UNK,UNK,88,UNK,UNK,UNK,UNK,UNK,...,False,False,True,PAT_2065_4413_298.png,False,1,0,0,0,0
2,PAT_919,1744,False,False,65,False,FEMALE,False,False,True,...,True,True,True,PAT_919_1744_318.png,True,1,0,0,0,0
3,PAT_27,38,False,False,70,False,FEMALE,True,True,True,...,False,False,True,PAT_27_38_240.png,True,1,0,0,0,0
4,PAT_361,1568,False,False,71,True,MALE,True,False,False,...,False,False,False,PAT_361_1568_456.png,True,1,0,0,0,0


Ορισμός διάφορων συναρτήσεων για προετοιμασία των εισόδων στο Random Forest.

In [None]:
def create_random_list(choices, probabilities, no_randoms):
  """Return `no_randoms` independent weighted draws from `choices`.

  Each element is obtained with its own `random.choices` call, using
  `probabilities` as the weights.
  """
  return [random.choices(choices, probabilities)[0] for _ in range(no_randoms)]

def frequency_imputation_1(train_dataset, test_dataset, column):
  """Fill 'UNK' entries of `column` in place, drawing one value per patient.

  The sampling weights are the frequencies of the known values of `column`
  among the training patients (first row of each `patient_id`). One value is
  drawn for every patient of the training frame and then for every patient of
  the test frame; all 'UNK' rows of that patient receive the drawn value.
  The raw counts are printed before sampling.
  """
  per_patient = train_dataset.drop_duplicates(['patient_id'])
  observed = per_patient.loc[per_patient[column] != 'UNK', column]

  value_counts = observed.value_counts().to_dict()
  print(value_counts)

  total = sum(value_counts.values())
  replacement_choices = list(value_counts)
  replacement_probabilities = [count / total for count in value_counts.values()]

  # Training patients are processed first so the order of draws is unchanged.
  for dataset in (train_dataset, test_dataset):
    for patient in dataset.patient_id.unique():
      drawn = random.choices(replacement_choices, replacement_probabilities)[0]
      to_fill = (dataset['patient_id'] == patient) & (dataset[column] == 'UNK')
      dataset.loc[to_fill, column] = drawn

def frequency_imputation(train_dataset, test_dataset, column):
  """Replace 'UNK' entries of `column` in place with values sampled by frequency.

  The frequencies of the known (non-'UNK') values are computed on
  `train_dataset` only and are used as sampling weights for both frames, so no
  statistic is taken from the test rows. Every unknown cell receives its own
  independent draw from Python's `random` module (training frame first).

  Args:
    train_dataset: DataFrame whose `column` supplies the frequencies; modified in place.
    test_dataset: DataFrame filled with the same frequencies; modified in place.
    column: name of the column to impute.

  Raises:
    ValueError: if there are 'UNK' cells to fill but `train_dataset[column]`
      holds no known value to sample from.
  """
  known_counts = train_dataset[column].value_counts().to_dict()
  known_counts.pop('UNK', None)

  total = sum(known_counts.values())
  replacement_choices = list(known_counts)
  replacement_probabilities = [count / total for count in known_counts.values()] if total else []

  for dataset in (train_dataset, test_dataset):
    unknown_rows = dataset[column] == 'UNK'
    n_unknown = int(unknown_rows.sum())
    if n_unknown == 0:
      continue

    # Fail with an explicit message instead of the IndexError that
    # random.choices raises for an empty population.
    if not replacement_probabilities:
      raise ValueError("Cannot impute column %r: the training data has no known values." % (column,))

    # k draws consume the generator exactly like k single draws would.
    dataset.loc[unknown_rows, column] = random.choices(replacement_choices, replacement_probabilities, k = n_unknown)

#Assumes that none of the use_features contains 'UNK' values. The categorical use_features must not be encoded yet.
def predict_missing_cat_values(fit_dataset, use_features ,target_feature, fill_datasets):
  """Fill 'UNK' entries of `target_feature` in place with Random Forest predictions.

  A classifier is trained on the rows of `fit_dataset` whose `target_feature`
  is known, using `use_features` as inputs (those listed in the module-level
  `categorical_vars` are one-hot encoded). It then predicts the unknown
  entries of every frame in `fill_datasets`.

  Args:
    fit_dataset: DataFrame supplying the training rows; it is not modified
      unless it is also listed in `fill_datasets`.
    use_features: list of input column names.
    target_feature: name of the column to complete.
    fill_datasets: iterable of DataFrames whose 'UNK' entries are replaced in place.
  """
  train_df = fit_dataset.loc[fit_dataset[target_feature] != 'UNK'][use_features + [target_feature]].copy(deep = True)

  cat_vars = [cat_f for cat_f in use_features if cat_f in categorical_vars]

  # Encode the target labels as integers for the classifier.
  le_target = LabelEncoder()
  train_df[target_feature] = le_target.fit_transform(train_df[target_feature])

  cat_encoders = one_hot_encode_fit(train_df, cat_vars)
  train_df = one_hot_encode_transform(train_df, cat_vars, cat_encoders)

  # NOTE(review): class_weight='balanced' and a balanced sample_weight are both
  # applied; scikit-learn multiplies the two, so the class re-weighting is
  # effectively applied twice - confirm this is intended.
  model = RandomForestClassifier(class_weight='balanced')
  sample_weight = compute_sample_weight('balanced', train_df[target_feature])
  model.fit(train_df.drop(columns = [target_feature]), train_df[target_feature], sample_weight = sample_weight)

  for current_dataset in fill_datasets:

    unknown_indeces = (current_dataset[target_feature] == 'UNK')

    # Nothing to fill: skip, because encoding / predicting on a zero-row frame fails.
    if not unknown_indeces.any():
      continue

    fill_dataset = one_hot_encode_transform(current_dataset.loc[unknown_indeces][use_features], cat_vars, cat_encoders)

    predictions = model.predict(fill_dataset)

    current_dataset.loc[unknown_indeces, [target_feature]] = le_target.inverse_transform(predictions)

def split_numpy_to_cols(dataset, array_encodings, column):
  """Replace `column` of `dataset` (in place) by one column per encoding position.

  Column k of the 2-D `array_encodings` is stored as '<column>_<k>'; the
  original `column` is dropped afterwards.
  """
  n_positions = array_encodings.shape[1]
  for position in range(n_positions):
    dataset['_'.join((column, str(position)))] = array_encodings[:, position]

  dataset.drop(columns = [column], inplace = True)

def one_hot_encode_fit(fit_dataset, columns):
  """Return, for each name in `columns`, the sorted list of distinct values found in `fit_dataset`."""
  return [sorted(fit_dataset[name].unique().tolist()) for name in columns]

def one_hot_encode_transform(transform_dataset, columns, encoders):
  """Return a copy of `transform_dataset` with `columns` one-hot encoded.

  For every column, `encoders` holds (at the same position) the list of known
  categories. Category k of column 'c' becomes the 0/1 column 'c_<k>' and the
  original column is dropped. Values that are not among the known categories
  produce an all-zero row. A frame with zero rows is handled as well.

  Args:
    transform_dataset: DataFrame to encode; it is left untouched.
    columns: names of the columns to encode.
    encoders: one list of categories per entry of `columns`.

  Returns:
    The encoded copy of the frame.
  """
  temp_dataset = transform_dataset.copy(deep = True)

  for cat_var, categories in zip(columns, encoders):

    values = temp_dataset[cat_var]

    # One vectorised comparison per category instead of a Python loop per row.
    for position, category in enumerate(categories):
      temp_dataset[cat_var + '_' + str(position)] = (values == category).astype(int)

    temp_dataset.drop(columns = [cat_var], inplace = True)

  return temp_dataset

#Gives higher scores, but it is not a correct approach, as mentioned in the data processing.
def zero_encoding_unknowns(train_dataset, test_dataset, columns):
  """One-hot encode `columns`, mapping 'UNK' to an all-zero vector.

  The categories of each column are the distinct training values except
  'UNK'. Test values that do not occur among them are encoded as all zeros
  too. Both inputs are left untouched; encoded copies are returned.

  Returns:
    (train_encoded_dataset, test_encoded_dataset)
  """
  # Reuse the shared fit / transform helpers instead of repeating the encoding
  # loop; 'UNK' is removed from the categories so it never gets its own column.
  encoders = [[value for value in categories if value != 'UNK']
              for categories in one_hot_encode_fit(train_dataset, columns)]

  train_encoded_dataset = one_hot_encode_transform(train_dataset, columns, encoders)
  test_encoded_dataset = one_hot_encode_transform(test_dataset, columns, encoders)

  return train_encoded_dataset, test_encoded_dataset

def one_hot_encode_fit_transform(train_dataset, test_dataset, columns):
  """One-hot encode `columns` of both frames with categories learned on the training frame.

  Test values that do not occur in the training frame are encoded as an
  all-zero vector. Both inputs are left untouched; encoded copies are returned.

  Returns:
    (encoded_train_dataset, encoded_test_dataset)
  """
  # Delegate to the shared helpers so the encoding rules live in one place.
  encoders = one_hot_encode_fit(train_dataset, columns)

  encoded_train_dataset = one_hot_encode_transform(train_dataset, columns, encoders)
  encoded_test_dataset = one_hot_encode_transform(test_dataset, columns, encoders)

  return (encoded_train_dataset, encoded_test_dataset)

def process_numerical_features(train_dataset,test_dataset, columns, missing_value = -5):
  """Impute and standardise the numerical `columns` of both frames.

  For every column a StandardScaler is fitted on the training rows whose value
  differs from `missing_value`. Entries equal to `missing_value` are replaced,
  in both frames, by the mean learned on the training rows, and both columns
  are then transformed with that scaler. The inputs are left untouched.

  Args:
    train_dataset: DataFrame used to fit the scaler.
    test_dataset: DataFrame transformed with the training statistics.
    columns: names of the numerical columns to process.
    missing_value: sentinel that marks a missing entry (default -5).

  Returns:
    (processed_train_dataset, processed_test_dataset)
  """
  processed_train_dataset = train_dataset.copy(deep = True)
  processed_test_dataset = test_dataset.copy(deep = True)

  for j in columns:

    # Cast first: the imputed mean is a float, and writing a float into an
    # integer column is a deprecated silent upcast in recent pandas versions.
    processed_train_dataset[j] = processed_train_dataset[j].astype(float)
    processed_test_dataset[j] = processed_test_dataset[j].astype(float)

    train_missing = processed_train_dataset[j] == missing_value
    test_missing = processed_test_dataset[j] == missing_value

    scaler = StandardScaler()
    scaler.fit(processed_train_dataset.loc[~train_missing, [j]].to_numpy())

    train_mean = scaler.mean_[0]
    processed_train_dataset.loc[train_missing, j] = train_mean
    processed_test_dataset.loc[test_missing, j] = train_mean

    processed_train_dataset[j] = scaler.transform(processed_train_dataset[[j]].to_numpy()).ravel()
    processed_test_dataset[j] = scaler.transform(processed_test_dataset[[j]].to_numpy()).ravel()

  return (processed_train_dataset, processed_test_dataset)

Διαχωρισμός των δεδομένων.

In [None]:
# Per-fold train / test frames, ready for encoding and model fitting.
train_dfs = []
test_dfs = []

# Fold indicator columns present on metadata_wdp (1 = row is in that fold's test set).
fold_columns = [col for col in metadata_wdp.columns if str(col).startswith('Fold_')]

# Inputs used to predict the missing 'grew' / 'changed' values.
prediction_inputs = ['age', 'bleed', 'itch', 'hurt', 'elevation', 'region']

for i in fold_columns:

  train_rows = metadata_wdp.loc[metadata_wdp[i] == 0]
  test_rows = metadata_wdp.loc[metadata_wdp[i] == 1]

  if LESION_SPLIT or PATIENT_SPLIT:
    # Folds were assigned per lesion / patient: pull every image of the
    # selected lesions / patients from the full metadata table.
    group_key = 'lesion_id' if LESION_SPLIT else 'patient_id'
    final_train_dataset = metadata[metadata[group_key].isin(train_rows[group_key])].reset_index(drop = True)
    final_test_dataset = metadata[metadata[group_key].isin(test_rows[group_key])].reset_index(drop = True)

  else:
    final_train_dataset = train_rows.drop(columns = fold_columns)
    final_test_dataset = test_rows.drop(columns = fold_columns)

  print("Fold number: ", i)
  print("Length of final train dataset:", len(final_train_dataset))
  print("Length of final test dataset:", len(final_test_dataset))

  final_train_dataset, final_test_dataset = process_numerical_features(final_train_dataset, final_test_dataset, numerical_vars)

  if PREDICT_MISSING_VALUES:
    # Iterate in the (sorted) order of categorical_vars rather than over a set:
    # set order depends on the interpreter's hash seed, which would change the
    # sequence of random draws from one kernel to the next.
    for j in [col for col in categorical_vars if col not in ('grew', 'changed')]:
      frequency_imputation(final_train_dataset, final_test_dataset, j)

    predict_missing_cat_values(final_train_dataset, prediction_inputs ,'grew', [final_train_dataset, final_test_dataset])
    predict_missing_cat_values(final_train_dataset, prediction_inputs ,'changed', [final_train_dataset, final_test_dataset])

  elif RANDOM_IMPUTATION:
    for j in categorical_vars:
      frequency_imputation(final_train_dataset, final_test_dataset, j)

  final_train_dataset.drop(columns = non_features, inplace=True)
  final_test_dataset.drop(columns = non_features, inplace=True)

  train_dfs.append(final_train_dataset)
  test_dfs.append(final_test_dataset)

train_dfs[0].head()

Fold number:  Fold_0
Length of final train dataset: 1854
Length of final test dataset: 444
Fold number:  Fold_1
Length of final train dataset: 1839
Length of final test dataset: 459
Fold number:  Fold_2
Length of final train dataset: 1840
Length of final test dataset: 458
Fold number:  Fold_3
Length of final train dataset: 1831
Length of final test dataset: 467
Fold number:  Fold_4
Length of final train dataset: 1828
Length of final test dataset: 470


Unnamed: 0,smoke,drink,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,fitspatrick,region,diagnostic,itch,grew,hurt,changed,bleed,elevation
0,False,True,-3.324303,False,FEMALE,False,False,False,True,2.0,ARM,3,False,False,False,False,False,False
1,False,False,-0.348169,False,FEMALE,True,True,True,True,3.0,NECK,1,True,True,False,True,True,True
2,False,True,1.044916,True,FEMALE,False,True,True,True,2.0,FACE,0,True,False,False,False,False,False
3,False,False,0.918272,True,FEMALE,True,True,False,True,2.0,HAND,0,True,False,False,False,False,False
4,False,True,1.17156,False,MALE,True,False,False,False,1.0,FOREARM,1,True,True,False,False,True,True


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def fit_model(model, Xs, Ys, use_sample_weight = True):
  """Fit `model` on the training split and return the macro F1 on the test split.

  Args:
    model: estimator exposing `fit(X, y, sample_weight=...)` and `predict(X)`.
    Xs: pair (X_train, X_test).
    Ys: pair (y_train, y_test).
    use_sample_weight: when True, 'balanced' sample weights computed from
      y_train are passed to `fit`; otherwise no weights are passed.
  """
  (X_train, X_test), (y_train, y_test) = Xs, Ys

  weights = compute_sample_weight('balanced', y_train) if use_sample_weight else None

  model.fit(X_train, y_train, sample_weight = weights)
  return f1_score(y_test, model.predict(X_test), average = 'macro')

def evaluate_with_features(categorical_features, numerical_features):
  """Cross-validate a Random Forest on the given feature subset.

  For every prepared fold in `train_dfs` / `test_dfs` the selected columns are
  encoded, a fresh 500-tree forest is fitted and the macro F1 on the test
  split is recorded.

  Args:
    categorical_features: categorical column names to use (one-hot encoded here).
    numerical_features: numerical column names to use.

  Returns:
    (mean, std) of the macro F1 over the folds.
  """
  selected_columns = categorical_features + numerical_features + target

  # The encoding of 'UNK' is governed by the ZERO_ENCODING flag, not by the
  # split strategy: zero vectors when it is set, plain one-hot otherwise.
  # (Once the unknowns have been imputed both encoders give the same result.)
  encode = zero_encoding_unknowns if ZERO_ENCODING else one_hot_encode_fit_transform

  f1_scores = []
  for fold_train, fold_test in zip(train_dfs, test_dfs):
    # NOTE(review): class_weight='balanced' is combined with the balanced
    # sample weights that fit_model passes by default; scikit-learn multiplies
    # the two - confirm this is intended.
    temp_model = RandomForestClassifier(n_estimators=500, class_weight='balanced')

    train_dataset, test_dataset = encode(fold_train[selected_columns], fold_test[selected_columns], categorical_features)

    f1_scores.append(fit_model(temp_model,(train_dataset.drop(columns = target), test_dataset.drop(columns = target)), (train_dataset['diagnostic'], test_dataset['diagnostic'])))

  f1_scores = np.array(f1_scores)
  return (f1_scores.mean(), f1_scores.std())

Χωρίς διαχωρισμό των lesions σε ξεχωριστά folds και με τις άγνωστες τιμές να κωδικοποιούνται ως διάνυσμα με μηδενικά. (Βλέπουμε ότι η επίδοση γίνεται πολύ υψηλή, αλλά είναι λάθος να γίνεται έτσι ο διαχωρισμός.)


In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.7291058713490853, 0.014586838613854042)
(0.6791975343875397, 0.026770037000468313)
(0.7212825466166743, 0.02348122099739338)
(0.7151657433972158, 0.027482409509981896)
(0.7205538179644315, 0.020110299589583127)
(0.5939848885275751, 0.025746724277723974)
(0.7265205949855262, 0.0208270767727135)
(0.7261463210340066, 0.027308320424035238)
(0.7335807865510071, 0.022315995645720354)


Με διαχωρισμό των lesions σε ξεχωριστά folds και με τις άγνωστες τιμές να κωδικοποιούνται ως διάνυσμα με μηδενικά. (Πάλι βλέπουμε ότι η επίδοση είναι υψηλή, χωρίς ωστόσο να αντιπροσωπεύει την πραγματικότητα.)


In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.5612431896023227, 0.045643914507917036)
(0.5309265853247742, 0.04022198177859248)
(0.585032463826557, 0.03364948336466123)
(0.5662158487542924, 0.04238978005171056)
(0.5689100306322886, 0.024873379316096254)
(0.43452760532868184, 0.013328157189705665)
(0.5582218654843311, 0.019680249445448765)
(0.5398576271490151, 0.036100944243989304)
(0.5699529465635294, 0.025561501279526312)




Χωρίς διαχωρισμό των lesions σε ξεχωριστά folds, αλλά με τις άγνωστες τιμές να αντικαθίστανται από τυχαίες τιμές. Φαίνεται ξανά πως αυξάνεται η επίδοση.


In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.6500619946859075, 0.031630948946165464)
(0.6384002154051235, 0.034900192319020966)
(0.6344785102880377, 0.03444513485402514)
(0.6422771654826789, 0.02639181415611654)
(0.638308856114178, 0.022318892025559593)
(0.5960647883851657, 0.02343507522103264)
(0.652181130481433, 0.03393624361459005)
(0.6588536867703935, 0.025281558739384914)
(0.6704112758826071, 0.026795698397553098)


Με διαχωρισμό των lesions σε ξεχωριστά folds και με τις άγνωστες τιμές να αντικαθίστανται από τυχαίες τιμές.

In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.4936024882191302, 0.03142051234613527)
(0.5018955224210444, 0.030726253113677073)
(0.49968987940440657, 0.04235403221296108)
(0.4931150932773491, 0.04053342191747695)
(0.4912829359720427, 0.02291079686551815)
(0.4393157724467116, 0.013774990047930034)
(0.47725885810390684, 0.032836712398175705)
(0.4907127413165987, 0.02844846965634371)
(0.4956602959616, 0.032262255032112955)


Με διαχωρισμό των patients σε ξεχωριστά folds και με τις άγνωστες τιμές να αντικαθίστανται από τυχαίες τιμές.

In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.48123844805508664, 0.04019755116803785)
(0.4875280141833498, 0.0265343652707443)
(0.49300991865067684, 0.034062353789074504)
(0.4966993680867396, 0.027152388962120046)
(0.4627687143419633, 0.03456716574439033)
(0.42455257918402695, 0.022320790472602048)
(0.4640392996346092, 0.04560531932142606)
(0.46478133230841934, 0.03137415315960216)
(0.4983125801671645, 0.03280759933363214)


Με διαχωρισμό των lesions σε ξεχωριστά folds και με τις άγνωστες τιμές των grew και changed να συμπληρώνονται βάσει προβλέψεων.

In [None]:
# Feature subsets to compare; 'age' is the only numerical input in every run.
base_features = ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt']
feature_sets = [
  base_features + ['has_sewage_system', 'has_piped_water'],
  base_features,
  base_features + ['fitspatrick'],
  base_features + ['gender'],
  base_features + ['skin_cancer_history'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history'],
  base_features + ['gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  base_features + ['has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for cat_features in feature_sets:
  print(evaluate_with_features(cat_features, ['age']))

(0.47219241181643057, 0.02341002861832171)
(0.49140062441023125, 0.038642948585914494)
(0.46425278412621723, 0.025078599091805456)
(0.48879746572195587, 0.03858239769969429)
(0.4683939366369467, 0.028720066431523725)
(0.4318451097749577, 0.020609482962847054)
(0.46513615039323025, 0.038138719840708046)
(0.4677269134552794, 0.040918544719845386)
(0.49653842824860134, 0.026506200481844454)


Αναλυτική δοκιμή των δεδομένων, ώστε να προσδιοριστεί ο συνδυασμός δεδομένων που θα επιφέρει το μέγιστο αποτέλεσμα. Διαχωρισμός βάσει lesions και random imputation.

In [None]:
# Greedy forward selection: in every round each remaining feature is tried on
# top of the ones chosen so far, and the feature that yields the highest mean
# macro F1 seen so far is added. The search stops when no candidate improves
# on the best score.
chosen_features = []

best_score = 0
while True:
  best_feature = None

  for feature in [f for f in total_features if f not in chosen_features]:

    candidate_features = chosen_features + [feature]

    categorical_features = [cat_f for cat_f in candidate_features if cat_f in categorical_vars]
    numerical_features = [num_f for num_f in candidate_features if num_f in numerical_vars]

    # evaluate_with_features returns (mean, std); only the mean can be compared
    # with best_score (comparing the tuple with a number raises TypeError).
    f_score, _ = evaluate_with_features(categorical_features, numerical_features)

    if f_score > best_score:
        best_score = f_score
        best_feature = feature

  if best_feature is not None:
      chosen_features.append(best_feature)
      print("Added feature: ", best_feature)
      print("F-score: ", best_score)
  else:
      break

print("Chosen features:", chosen_features)

Added feature:  age
F-score:  0.15064018299321497
Added feature:  region
F-score:  0.30587665792684743
Added feature:  bleed
F-score:  0.35710286343937037
Added feature:  elevation
F-score:  0.38847545776257364
Added feature:  itch
F-score:  0.4279287032063427
Added feature:  changed
F-score:  0.4617073971105622
Added feature:  grew
F-score:  0.4779630862194811
Added feature:  hurt
F-score:  0.4849260781449619
Chosen features: ['age', 'region', 'bleed', 'elevation', 'itch', 'changed', 'grew', 'hurt']
