In [82]:
import numpy as np
import pandas as pd

from glob import glob

from sklearn.ensemble import RandomForestClassifier as RF
from catboost import CatBoostClassifier as cat
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, matthews_corrcoef as MCC

import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
data_path = "/home/samsmu/Data/Lung/CT"

### 0. Loss functions

In [12]:
def logloss(actual, predicted, eps=1e-14):

    """
    :param actual:      The binary labels. Either 0 or 1.  
    :param predicted:   The predicted probabilities as fl oats between 0-1
    :param eps:         Log(0) is equal to infinity, so we need to offset our predicted values slightly by eps from 0 or 1
    :return:            The logarithmic loss between between the predicted probability assigned to the possible outcomes for item i, and the actual outcome.
    """

    predicted = np.clip(predicted, eps, 1-eps)
    loss = -1 * np.mean(actual * np.log(predicted) + (1 - actual) * np.log(1-predicted))

    return loss

### 1. Classify full series

In [20]:
X = np.load("data_series_X.npy")
Y = np.load("data_series_Y.npy")

kf = StratifiedKFold(n_splits=5)
y_pred = Y * 0
for train, test in kf.split(X, Y):
    X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
    clf = RF(n_estimators=100, n_jobs=3)
    clf.fit(X_train, y_train)
    y_pred[test] = clf.predict(X_test)
    
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

# All Cancer
print("Predicting all positive")
y_pred = np.ones(Y.shape)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

# No Cancer
print("Predicting all negative")
y_pred = Y*0
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

              precision    recall  f1-score   support

   No Cancer       0.74      0.94      0.83      1176
      Cancer       0.22      0.04      0.07       418

    accuracy                           0.71      1594
   macro avg       0.48      0.49      0.45      1594
weighted avg       0.60      0.71      0.63      1594

logloss 9.403940357104274
Predicting all positive
              precision    recall  f1-score   support

   No Cancer       0.00      0.00      0.00      1176
      Cancer       0.26      1.00      0.42       418

    accuracy                           0.26      1594
   macro avg       0.13      0.50      0.21      1594
weighted avg       0.07      0.26      0.11      1594

logloss 23.78337597091084
Predicting all negative
              precision    recall  f1-score   support

   No Cancer       0.74      1.00      0.85      1176
      Cancer       0.00      0.00      0.00       418

    accuracy                           0.74      1594
   macro avg       0.37     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
X = np.load("data_series_X.npy")
Y = np.load("data_series_Y.npy")

# try catboost
print ("catboost")
kf = KFold(n_splits=3)
Y = Y.astype(int)
y_pred = Y * 0
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
    clf = cat(num_trees = 2, verbose = False)
    clf.fit(X_train, y_train)
    y_pred[test] = clf.predict(X_test)
print (classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))


catboost
              precision    recall  f1-score   support

   No Cancer       0.74      1.00      0.85      1176
      Cancer       0.00      0.00      0.00       418

    accuracy                           0.74      1594
   macro avg       0.37      0.50      0.42      1594
weighted avg       0.54      0.74      0.63      1594

logloss 8.473629206462054


#### 1.1 Final Classification

In [50]:
clf = RF(n_estimators=100, max_depth =14,  n_jobs=3)
clf.fit(X, Y)

y_pred = clf.predict(X)

print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
print("Random Forest:", MCC(Y, y_pred))

              precision    recall  f1-score   support

   No Cancer       0.94      1.00      0.97      1176
      Cancer       1.00      0.83      0.91       418

    accuracy                           0.96      1594
   macro avg       0.97      0.92      0.94      1594
weighted avg       0.96      0.96      0.95      1594

logloss 1.4358654845897718
Random Forest: 0.8848039049091856


In [55]:
clf = cat(max_depth =9, verbose = False)
clf.fit(X, Y)

y_pred = clf.predict(X)

print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
print("Catboost:", MCC(Y, y_pred))

              precision    recall  f1-score   support

   No Cancer       0.94      1.00      0.97      1176
      Cancer       1.00      0.83      0.91       418

    accuracy                           0.95      1594
   macro avg       0.97      0.91      0.94      1594
weighted avg       0.96      0.95      0.95      1594

logloss 1.4560889421192051
Catboost: 0.8831740029405325


### 2. Classify nodules

In [59]:
X = np.load("data_nodules_X.npy")
Y = np.load("data_nodules_Y.npy")

In [None]:
kf = KFold(n_splits=3)
y_pred = Y * 0
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
    clf = RF(n_estimators=100, n_jobs=3)
    clf.fit(X_train, y_train)
    y_pred[test] = clf.predict(X_test)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

# All Cancer
print("Predicting all positive")
y_pred = np.ones(Y.shape)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

# No Cancer
print("Predicting all negative")
y_pred = Y*0
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))

In [56]:
#### 2.1 Final classification

In [100]:
%%time
clf = RF(n_estimators=100,  n_jobs=-1)
clf.fit(X, Y)

y_pred = clf.predict(X)

print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
print("Random Forest:", MCC(Y, y_pred))

              precision    recall  f1-score   support

   No Cancer       0.78      0.99      0.88   5548422
      Cancer       0.81      0.09      0.17   1672997

    accuracy                           0.78   7221419
   macro avg       0.80      0.54      0.52   7221419
weighted avg       0.79      0.78      0.71   7221419

logloss 6.932271229014301
Random Forest: 0.22758831532331514
CPU times: user 1h 7min 38s, sys: 25.4 s, total: 1h 8min 3s
Wall time: 8min 52s


In [101]:
y_prob = clf.predict_proba(X)

In [122]:
y_tresh = y_prob[:, 1] > 0.4
print("Random Forest treshold:", MCC(Y, y_tresh))
y_tresh.mean()

Random Forest treshold: 0.2243099570646304


0.05070707017554306

In [68]:
%%time
clf = cat(iterations = 20, learning_rate = 0.1, max_depth =9,  verbose = True)
clf.fit(X, Y)

y_pred = clf.predict(X)

print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
print("Catboost:", MCC(Y, y_pred))

0:	learn: 0.6654591	total: 567ms	remaining: 10.8s
1:	learn: 0.6429514	total: 1.18s	remaining: 10.7s
2:	learn: 0.6245880	total: 1.78s	remaining: 10.1s
3:	learn: 0.6094736	total: 2.42s	remaining: 9.69s
4:	learn: 0.5970176	total: 3.04s	remaining: 9.11s
5:	learn: 0.5868222	total: 3.63s	remaining: 8.48s
6:	learn: 0.5783541	total: 4.27s	remaining: 7.92s
7:	learn: 0.5713585	total: 4.86s	remaining: 7.28s
8:	learn: 0.5655641	total: 5.49s	remaining: 6.71s
9:	learn: 0.5608139	total: 6.13s	remaining: 6.13s
10:	learn: 0.5568654	total: 6.73s	remaining: 5.51s
11:	learn: 0.5535914	total: 7.38s	remaining: 4.92s
12:	learn: 0.5508881	total: 8s	remaining: 4.31s
13:	learn: 0.5486744	total: 8.6s	remaining: 3.68s
14:	learn: 0.5468474	total: 9.2s	remaining: 3.07s
15:	learn: 0.5453401	total: 9.8s	remaining: 2.45s
16:	learn: 0.5440816	total: 10.4s	remaining: 1.84s
17:	learn: 0.5430595	total: 11s	remaining: 1.23s
18:	learn: 0.5422194	total: 11.7s	remaining: 614ms
19:	learn: 0.5415155	total: 12.3s	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   No Cancer       0.77      1.00      0.87   5548422
      Cancer       0.00      0.00      0.00   1672997

    accuracy                           0.77   7221419
   macro avg       0.38      0.50      0.43   7221419
weighted avg       0.59      0.77      0.67   7221419

logloss 7.468206918824775
Catboost: 0.0
CPU times: user 2min 16s, sys: 790 ms, total: 2min 17s
Wall time: 53.7 s


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
