In [None]:
from google.colab import drive
drive.mount('/content/drive')
import zipfile, os, urllib.request, glob, math
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight

# Seed both generators: `random` drives the sampling in frequency_imputation,
# while scikit-learn estimators created without `random_state` draw from
# NumPy's global RNG. Seeding only `random` leaves the forests unseeded.
random.seed(9)
np.random.seed(9)

Mounted at /content/drive


In [None]:
# Load the pre-processed metadata table from the mounted Drive folder.
# The path is relative, so it resolves against the current working directory.
metadata = pd.read_csv('drive/MyDrive/metadata_processed.csv')
metadata.head()

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,UNK,UNK,UNK,UNK,8,UNK,UNK,UNK,...,-5.0,3,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,1,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,UNK,UNK,UNK,UNK,77,UNK,UNK,UNK,...,-5.0,0,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,UNK,UNK,UNK,UNK,75,UNK,UNK,UNK,...,-5.0,0,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,1,True,True,False,False,True,True,PAT_684_1302_588.png,True


Πειραματισμός με διαφορετικά πεδία για είσοδο κάθε φορά.

In [None]:
# Columns left out of the current experiment. Swap in one of the commented
# alternatives to try a different input set.
#exclude_features = []
exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink', 'has_piped_water', 'has_sewage_system']
#exclude_features = ['background_father','background_mother','diameter_1', 'diameter_2', 'fitspatrick', 'pesticide', 'smoke', 'drink', 'has_piped_water', 'has_sewage_system', 'gender', 'skin_cancer_history', 'cancer_history']

# Drop only the columns that are still present, so the cell can be re-run on
# an already-trimmed `metadata` without raising KeyError.
# NOTE: columns dropped by an earlier run are not restored; reload the CSV
# before switching to a smaller exclusion list.
metadata.drop(columns = [col for col in exclude_features if col in metadata.columns], inplace = True)

# Identifier / bookkeeping columns that are never used as model inputs.
non_features = ['patient_id','img_id', 'lesion_id','biopsed']
target = ['diagnostic']

# Numerical candidates that survived the exclusion list.
numerical_vars = [col for col in ['age','diameter_1','diameter_2'] if col not in exclude_features]

# Everything that is not numerical, an identifier or the target is treated as categorical.
categorical_vars = sorted(list(set(metadata.columns.tolist()) - set(numerical_vars) - set(non_features) - set(target)))

print("Numerical variables: ",numerical_vars)
print("Categorical variables: ",categorical_vars)

total_features = categorical_vars + numerical_vars

Numerical variables:  ['age']
Categorical variables:  ['bleed', 'cancer_history', 'changed', 'drink', 'elevation', 'gender', 'grew', 'has_piped_water', 'has_sewage_system', 'hurt', 'itch', 'pesticide', 'region', 'skin_cancer_history', 'smoke']


In [None]:
# Keep one row per lesion_id so that folds are assigned per lesion rather than per image.
metadata_wdp = metadata.drop_duplicates(subset = ['lesion_id']).copy(deep = True)

skf = StratifiedKFold(n_splits=5)
for i, (_, test_idx) in enumerate(skf.split(metadata_wdp['img_id'].tolist(), metadata_wdp['diagnostic'].tolist())):

  print("Fold %d , length of test set: %d " %(i,len(test_idx)))

  # test_idx holds positional row indices, so the test rows can be flagged
  # directly instead of scanning every row for membership.
  fold_flags = np.zeros(len(metadata_wdp), dtype = int)
  fold_flags[test_idx] = 1

  # Column 'Fold_<i>' is 1 for rows in the test split of fold i, 0 otherwise.
  column_name = 'Fold_' + str(i)
  metadata_wdp[column_name] = fold_flags

metadata_wdp.head()

Fold 0 , length of test set: 329 
Fold 1 , length of test set: 328 
Fold 2 , length of test set: 328 
Fold 3 , length of test set: 328 
Fold 4 , length of test set: 328 


Unnamed: 0,patient_id,lesion_id,smoke,drink,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,...,changed,bleed,elevation,img_id,biopsed,Fold_0,Fold_1,Fold_2,Fold_3,Fold_4
0,PAT_1516,1765,UNK,UNK,8,UNK,UNK,UNK,UNK,UNK,...,False,False,False,PAT_1516_1765_530.png,False,1,0,0,0,0
1,PAT_46,881,False,False,55,False,FEMALE,True,True,True,...,True,True,True,PAT_46_881_939.png,True,1,0,0,0,0
2,PAT_1545,1867,UNK,UNK,77,UNK,UNK,UNK,UNK,UNK,...,False,False,False,PAT_1545_1867_547.png,False,1,0,0,0,0
3,PAT_1989,4061,UNK,UNK,75,UNK,UNK,UNK,UNK,UNK,...,False,False,False,PAT_1989_4061_934.png,False,1,0,0,0,0
4,PAT_684,1302,False,True,79,False,MALE,True,False,False,...,False,True,True,PAT_684_1302_588.png,True,1,0,0,0,0


Ορισμός διάφορων συναρτήσεων για προετοιμασία των εισόδων στο Random Forest.

In [None]:
def frequency_imputation(train_dataset, test_dataset, column):
  """Replace 'UNK' entries of `column` with values sampled from the training distribution.

  The relative frequencies of the known (non-'UNK') values are computed on
  `train_dataset` only and used as sampling weights for both datasets. One
  value is drawn per lesion_id, so every 'UNK' row of the same lesion receives
  the same replacement. Both DataFrames are modified in place; nothing is
  returned.

  If the training column contains no known value there is nothing to sample
  from, and both datasets are left untouched.

  Args:
    train_dataset: DataFrame containing 'lesion_id' and `column`.
    test_dataset: DataFrame containing 'lesion_id' and `column`.
    column: name of the column to impute.
  """

  value_counts = train_dataset[column].value_counts().to_dict()
  value_counts.pop('UNK', None)

  total = sum(value_counts.values())
  if total == 0:
    # No known values: random.choices would fail on an empty population.
    return

  replacement_choices = list(value_counts.keys())
  replacement_probabilities = [count / total for count in value_counts.values()]

  # Train first, then test, so the sequence of random draws is stable.
  for dataset in (train_dataset, test_dataset):

    # Computed once per dataset: a row is only ever rewritten during the
    # iteration of its own lesion, so the mask stays valid throughout.
    unknown_rows = dataset[column] == 'UNK'

    for lesion in dataset.lesion_id.unique():
      # A value is drawn for every lesion, including those without 'UNK' rows.
      random_value = random.choices(replacement_choices, replacement_probabilities)[0]
      dataset.loc[(dataset['lesion_id'] == lesion) & unknown_rows, column] = random_value

# Assumes that none of the use_features contain 'UNK' values. Categorical use_features must not be encoded yet.
def predict_missing_cat_values(fit_dataset, use_features ,target_feature, fill_datasets):
  """Fill 'UNK' entries of `target_feature` with Random Forest predictions.

  A classifier is trained on the rows of `fit_dataset` whose `target_feature`
  is known, using `use_features` as inputs (those listed in the module-level
  `categorical_vars` are one-hot encoded). Every DataFrame in `fill_datasets`
  then has its 'UNK' entries of `target_feature` replaced in place by the
  predicted label. Datasets without any 'UNK' entry are skipped.

  Args:
    fit_dataset: DataFrame used to train the imputation model.
    use_features: list of input column names.
    target_feature: name of the column to impute.
    fill_datasets: list of DataFrames to modify in place.
  """

  train_df = fit_dataset.loc[fit_dataset[target_feature] != 'UNK'][use_features + [target_feature]].copy(deep = True)

  cat_vars = [cat_f for cat_f in use_features if cat_f in categorical_vars]

  # Encode the target labels as integers for the classifier.
  le_target = LabelEncoder()
  train_df[target_feature] = le_target.fit_transform(train_df[target_feature])

  cat_encoders = one_hot_encode_fit(train_df, cat_vars)
  train_df = one_hot_encode_transform(train_df, cat_vars, cat_encoders)

  # NOTE(review): 'balanced' class_weight and balanced sample_weight are both
  # applied; scikit-learn multiplies the two, so the re-weighting is applied
  # twice. Kept as in the original; confirm this is intended.
  model = RandomForestClassifier(class_weight='balanced')
  sample_weight = compute_sample_weight('balanced', train_df[target_feature])
  model.fit(train_df.drop(columns = [target_feature]), train_df[target_feature], sample_weight = sample_weight)

  for current_dataset in fill_datasets:

    unknown_indeces = (current_dataset[target_feature] == 'UNK')

    # Nothing to fill: predicting on an empty frame would raise.
    if not unknown_indeces.any():
      continue

    fill_dataset = one_hot_encode_transform(current_dataset.loc[unknown_indeces][use_features], cat_vars, cat_encoders)

    predictions = model.predict(fill_dataset)

    current_dataset.loc[unknown_indeces, target_feature] = le_target.inverse_transform(predictions)

def split_numpy_to_cols(dataset, array_encodings, column):
  """Expand a 2-D encoding array into '<column>_<k>' columns and drop `column`.

  `dataset` is modified in place: one new column is appended per column of
  `array_encodings`, then the original `column` is removed.
  """

  n_encoded = array_encodings.shape[1]

  for position in range(n_encoded):
    new_name = column + '_' + str(position)
    dataset[new_name] = array_encodings[:, position]

  dataset.drop(columns = [column], inplace = True)

def one_hot_encode_fit(fit_dataset, columns):
  """Return, for each column in `columns`, the sorted list of its unique values.

  The resulting list is positionally aligned with `columns` and serves as the
  category vocabulary for one_hot_encode_transform.
  """

  return [sorted(fit_dataset[cat_var].unique().tolist()) for cat_var in columns]

def one_hot_encode_transform(transform_dataset, columns, encoders):
  """One-hot encode `columns` of a copy of `transform_dataset`.

  Args:
    transform_dataset: DataFrame to encode; it is not modified.
    columns: list of column names to encode.
    encoders: list aligned with `columns`; each entry is the ordered list of
      known values for that column (as returned by one_hot_encode_fit).

  Returns:
    A new DataFrame in which each encoded column is replaced by columns
    '<column>_0' ... '<column>_<k-1>'. A value that is not in the encoder
    yields an all-zero row. An empty input frame yields empty encoded columns.
  """

  temp_dataset = transform_dataset.copy(deep = True)

  for index in range(len(columns)):

    cat_var = columns[index]
    encoder = encoders[index]

    # Preallocated as (rows, categories) so the array stays 2-D even when the
    # frame has no rows.
    array_encodings = np.zeros((len(temp_dataset), len(encoder)), dtype = int)

    for row, value in enumerate(temp_dataset[cat_var]):
      if value in encoder:
        array_encodings[row, encoder.index(value)] = 1

    split_numpy_to_cols(temp_dataset, array_encodings, cat_var)

  return temp_dataset

def one_hot_encode_fit_transform(train_dataset, test_dataset, columns):
  """One-hot encode `columns` in both datasets using the training vocabulary.

  The categories of each column are taken from `train_dataset` only
  (one_hot_encode_fit) and then applied to both frames
  (one_hot_encode_transform), so a test value that was never seen in training
  becomes an all-zero row. Neither input is modified.

  Args:
    train_dataset: DataFrame the categories are learned from.
    test_dataset: DataFrame encoded with the same categories.
    columns: list of column names to encode.

  Returns:
    Tuple (encoded_train_dataset, encoded_test_dataset).
  """

  encoders = one_hot_encode_fit(train_dataset, columns)

  encoded_train_dataset = one_hot_encode_transform(train_dataset, columns, encoders)
  encoded_test_dataset = one_hot_encode_transform(test_dataset, columns, encoders)

  return (encoded_train_dataset, encoded_test_dataset)

def process_numerical_features(train_dataset,test_dataset, columns, missing_value = -5):
  """Mean-impute and standardise numerical columns using training statistics.

  For each column, a StandardScaler is fitted on the training values that
  differ from `missing_value`. Entries equal to `missing_value` in either
  dataset are replaced by the fitted training mean, and both columns are then
  transformed with that scaler. The inputs are not modified.

  Args:
    train_dataset: DataFrame the scaler is fitted on.
    test_dataset: DataFrame transformed with the training statistics.
    columns: list of numerical column names to process.
    missing_value: sentinel marking a missing entry (defaults to -5).

  Returns:
    Tuple (processed_train_dataset, processed_test_dataset).

  Raises:
    ValueError: if a training column has no entry other than `missing_value`.
  """

  processed_train_dataset = train_dataset.copy(deep = True)
  processed_test_dataset = test_dataset.copy(deep = True)

  for j in columns:

    # Cast up front: an integer column cannot hold the fractional mean or the
    # scaled values without an implicit dtype change.
    processed_train_dataset[j] = processed_train_dataset[j].astype(float)
    processed_test_dataset[j] = processed_test_dataset[j].astype(float)

    train_missing = processed_train_dataset[j] == missing_value
    test_missing = processed_test_dataset[j] == missing_value

    fit_data = processed_train_dataset.loc[~train_missing, j].to_numpy().reshape(-1,1)
    if len(fit_data) == 0:
      raise ValueError("Column '%s' has no known training values to fit the scaler on" % j)

    scaler = StandardScaler()
    scaler.fit(fit_data)

    # Replace the sentinel with the training mean before scaling.
    column_mean = float(scaler.mean_[0])
    processed_train_dataset.loc[train_missing, j] = column_mean
    processed_test_dataset.loc[test_missing, j] = column_mean

    processed_train_dataset[j] = scaler.transform(processed_train_dataset[j].to_numpy().reshape(-1,1)).ravel()
    processed_test_dataset[j] = scaler.transform(processed_test_dataset[j].to_numpy().reshape(-1,1)).ravel()

  return (processed_train_dataset, processed_test_dataset)

Διαχωρισμός των δεδομένων.

In [None]:
# Set to True to impute the PREDICTED_TARGETS columns with a Random Forest
# trained on PREDICTION_FEATURES (predict_missing_cat_values) instead of
# frequency sampling. All other categorical columns always use frequency
# sampling.
USE_PREDICTED_IMPUTATION = False
PREDICTED_TARGETS = ['grew', 'changed']
PREDICTION_FEATURES = ['age', 'bleed', 'itch', 'hurt', 'elevation', 'region']

train_dfs = []
test_dfs = []

for i in ['Fold_0','Fold_1','Fold_2','Fold_3','Fold_4']:

  # Lesions flagged 0 in this fold column train, lesions flagged 1 test.
  temp_train_dataset = metadata_wdp.loc[metadata_wdp[i] == 0]
  temp_test_dataset = metadata_wdp.loc[metadata_wdp[i] == 1]

  # Go back to the full table so every image of a lesion lands in that lesion's split.
  final_train_dataset = metadata[metadata['lesion_id'].isin(temp_train_dataset['lesion_id'])].reset_index(drop = True)
  final_test_dataset = metadata[metadata['lesion_id'].isin(temp_test_dataset['lesion_id'])].reset_index(drop = True)

  print("Fold number: ", i)
  print("Length of final train dataset:", len(final_train_dataset))
  print("Length of final test dataset:", len(final_test_dataset))

  final_train_dataset, final_test_dataset = process_numerical_features(final_train_dataset, final_test_dataset, numerical_vars)

  # Built as an ordered list (not a set difference) so the sequence of random
  # draws does not depend on hash ordering.
  frequency_imputed_vars = [var for var in categorical_vars if not (USE_PREDICTED_IMPUTATION and var in PREDICTED_TARGETS)]
  for j in frequency_imputed_vars:
    frequency_imputation(final_train_dataset, final_test_dataset, j)

  if USE_PREDICTED_IMPUTATION:
    # The model is fitted on the training split only and fills both splits.
    for predicted_target in PREDICTED_TARGETS:
      predict_missing_cat_values(final_train_dataset, PREDICTION_FEATURES, predicted_target, [final_train_dataset, final_test_dataset])

  final_train_dataset.drop(columns = non_features, inplace=True)
  final_test_dataset.drop(columns = non_features, inplace=True)

  train_dfs.append(final_train_dataset)
  test_dfs.append(final_test_dataset)

train_dfs[0].head()

Fold number:  Fold_0
Length of final train dataset: 1751
Length of final test dataset: 547
Fold number:  Fold_1
Length of final train dataset: 1787
Length of final test dataset: 511
Fold number:  Fold_2
Length of final train dataset: 1866
Length of final test dataset: 432
Fold number:  Fold_3
Length of final train dataset: 1859
Length of final test dataset: 439
Fold number:  Fold_4
Length of final train dataset: 1929
Length of final test dataset: 369


Unnamed: 0,smoke,drink,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,region,diagnostic,itch,grew,hurt,changed,bleed,elevation
0,False,True,0.3295,True,MALE,False,True,True,False,FACE,1,False,True,False,False,False,True
1,False,True,0.140396,False,MALE,True,True,True,True,CHEST,1,True,False,True,False,True,True
2,False,False,-0.489948,False,FEMALE,False,True,True,True,NECK,1,True,True,False,True,False,True
3,False,False,-0.174776,True,FEMALE,True,True,False,True,FACE,1,False,False,False,False,True,True
4,False,False,1.716257,False,FEMALE,True,True,False,False,EAR,1,True,True,True,False,True,True


In [None]:
from sklearn.ensemble import RandomForestClassifier

def fit_model(model, Xs, Ys, use_sample_weight = True):
  """Fit `model` on the training split and return the macro-F1 on the test split.

  Args:
    model: estimator exposing fit(X, y, sample_weight=...) and predict(X).
    Xs: pair (X_train, X_test).
    Ys: pair (y_train, y_test).
    use_sample_weight: when True, 'balanced' sample weights computed from
      y_train are passed to fit; otherwise sample_weight is None.

  Returns:
    Macro-averaged F1 score of the predictions on X_test.
  """

  X_train, X_test = Xs
  y_train, y_test = Ys

  sample_weight = compute_sample_weight('balanced', y_train) if use_sample_weight else None

  model.fit(X_train, y_train, sample_weight = sample_weight)

  return f1_score(y_test, model.predict(X_test), average = 'macro')

def evaluate_with_features(categorical_features, numerical_features):
  """Return the mean macro-F1 of a Random Forest over all prepared folds.

  For every (train, test) pair in the module-level `train_dfs` / `test_dfs`,
  the selected columns plus the target are extracted, the categorical ones are
  one-hot encoded with the training vocabulary, and a fresh 500-tree
  RandomForestClassifier is fitted and scored through fit_model.

  Args:
    categorical_features: list of categorical column names to use.
    numerical_features: list of numerical column names to use.

  Returns:
    Mean of the per-fold macro-F1 scores.

  Raises:
    ValueError: if no folds have been prepared.
  """

  if len(train_dfs) == 0 or len(test_dfs) == 0:
    raise ValueError("train_dfs / test_dfs are empty; run the data split cell first")

  selected_columns = categorical_features + numerical_features + target
  target_column = target[0]

  f1_scores = []
  for fold_train, fold_test in zip(train_dfs, test_dfs):
    # NOTE(review): fit_model also passes balanced sample weights by default,
    # on top of class_weight='balanced'. Kept as in the original; confirm the
    # combined weighting is intended.
    temp_model = RandomForestClassifier(n_estimators=500, class_weight='balanced')

    train_dataset, test_dataset = one_hot_encode_fit_transform(fold_train[selected_columns], fold_test[selected_columns], categorical_features)

    f1_scores.append(fit_model(temp_model,(train_dataset.drop(columns = target), test_dataset.drop(columns = target)), (train_dataset[target_column], test_dataset[target_column])))

  return (np.array(f1_scores).mean())

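Σταδιακή (greedy forward) επιλογή χαρακτηριστικών: σε κάθε βήμα προστίθεται το χαρακτηριστικό που αυξάνει περισσότερο το μέσο macro-F1, ώστε να προσδιοριστεί ο συνδυασμός χαρακτηριστικών που θα επιφέρει το μέγιστο αποτέλεσμα.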

In [None]:
# Greedy forward selection: each pass tries every feature not chosen yet and
# keeps the one giving the highest score above the best seen so far; the
# search stops when no candidate improves on it.
chosen_features = []
best_score = 0

while True:
  best_feature = None
  remaining_features = [f for f in total_features if f not in chosen_features]

  for feature in remaining_features:

    candidate_features = chosen_features + [feature]

    f_score = evaluate_with_features(
        [cat_f for cat_f in candidate_features if cat_f in categorical_vars],
        [num_f for num_f in candidate_features if num_f in numerical_vars])

    if f_score > best_score:
      best_score, best_feature = f_score, feature

  # No candidate beat the current best: selection is finished.
  if best_feature is None:
    break

  chosen_features.append(best_feature)
  print("Added feature: ", best_feature)
  print("F-score: ", best_score)

print("Chosen features:", chosen_features)

Added feature:  age
F-score:  0.15088834947766178
Added feature:  region
F-score:  0.3033891457351232
Added feature:  bleed
F-score:  0.3564683811218198
Added feature:  elevation
F-score:  0.3869443068707181
Added feature:  itch
F-score:  0.42992543816912443
Added feature:  changed
F-score:  0.4529247637518747
Added feature:  grew
F-score:  0.4789934280655392
Added feature:  pesticide
F-score:  0.4823386531781173
Added feature:  cancer_history
F-score:  0.4916128477923678
Added feature:  has_sewage_system
F-score:  0.4972431130655693
Chosen features: ['age', 'region', 'bleed', 'elevation', 'itch', 'changed', 'grew', 'pesticide', 'cancer_history', 'has_sewage_system']


In [None]:
# Mean macro-F1 over the folds for bleed/itch/grew/changed/elevation/region/hurt plus the sewage and piped-water columns, with age.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'has_sewage_system', 'has_piped_water'], ['age'])

0.48204125500827055

In [None]:
# Mean macro-F1 over the folds for the base set: bleed/itch/grew/changed/elevation/region/hurt, with age.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt'], ['age'])

0.48045096358140105

In [None]:
# Base set (bleed/itch/grew/changed/elevation/region/hurt + age) with 'gender' added.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender'], ['age'])

0.4652055362494892

In [None]:
# Base set without 'grew' and 'changed': bleed/itch/elevation/region/hurt, with age.
evaluate_with_features(['bleed', 'itch',  'elevation', 'region', 'hurt'], ['age'])

0.43691590537456604

In [None]:
# Base set plus 'gender', 'skin_cancer_history' and 'cancer_history', with age.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history'], ['age'])

0.46731691596819847

In [None]:
# Base set plus 'gender', the two cancer-history columns, 'smoke' and 'drink', with age.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'], ['age'])

0.46944483700343864

In [None]:
# All fifteen categorical columns of the current configuration, with age.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt','has_sewage_system', 'has_piped_water',  'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'], ['age'])

0.47531019185031154

Με προβλέψεις (predictions) για τα grew και changed, χρησιμοποιώντας τα bleed, itch, hurt, elevation, region και age.

In [None]:
# Base set plus the sewage and piped-water columns, with age.
# NOTE(review): same call as an earlier cell; the different score presumably comes from
# re-running the data split with predict_missing_cat_values enabled for 'grew'/'changed' - confirm.
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'has_sewage_system', 'has_piped_water'], ['age'])

0.4837478728789786

In [None]:
# Base set: bleed/itch/grew/changed/elevation/region/hurt, with age (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt'], ['age'])

0.4762646519654063

In [None]:
# Base set with 'gender' added (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender'], ['age'])

0.4730438126320092

In [None]:
# Base set without 'grew' and 'changed' (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch',  'elevation', 'region', 'hurt'], ['age'])

0.4373973352668935

In [None]:
# Base set plus 'gender' and the two cancer-history columns (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history'], ['age'])

0.4839732017171835

In [None]:
# Base set plus 'gender', the cancer-history columns, 'smoke' and 'drink' (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'], ['age'])

0.4715021135260386

In [None]:
# All fifteen categorical columns, with age (repeat of an earlier call; presumably run on the prediction-imputed folds - confirm).
evaluate_with_features(['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt','has_sewage_system', 'has_piped_water',  'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'], ['age'])

0.49065984531346674

In [None]:
# Categorical feature subsets to compare; each is evaluated together with 'age'
# and its mean macro-F1 is printed on its own line, in this order.
categorical_feature_sets = [
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'has_sewage_system', 'has_piped_water'],
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt'],
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender'],
  ['bleed', 'itch', 'elevation', 'region', 'hurt'],
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history'],
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink'],
  ['bleed', 'itch', 'grew', 'changed', 'elevation', 'region', 'hurt', 'has_sewage_system', 'has_piped_water', 'gender', 'skin_cancer_history', 'cancer_history', 'smoke', 'drink', 'pesticide'],
]

for categorical_subset in categorical_feature_sets:
  print(evaluate_with_features(categorical_subset, ['age']))

0.4775518740775124
0.4887151131020139
0.47232225380019166
0.43849107979777313
0.4660221997359792
0.4605952830377994
0.48774676642210385
