### Setup

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/My Drive/Cornell/2020-2021/Biomedical ML Final Project - Melted Paper/Datasets/archive/all-mias

/content/gdrive/.shortcut-targets-by-id/13OShW0589KdYJRWaN9GrhtF2vtwdrUUW/Biomedical ML Final Project - Melted Paper/Datasets/archive/all-mias


# Baseline Standard ML Classifiers
### k-NN, SVM, Random Forest, Logistic Regression, MIAS

#### Importing Necessary Libraries

In [4]:
import numpy as np
import cv2, os, sys, random, pickle, h5py
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

##### Importing Labels and Images

*Info.txt* lists the films in the MIAS database and provides specific information about the images. 
- 1st col: MIAS database reference number
- 2nd col: Character of background tissue
```
F - Fatty 
G - Fatty-glandular
D - Dense-glandular
```
- 3rd col: Class of abnormality present
```
CALC - Calcification
CIRC - Well-defined/circumscribed masses
SPIC - Spiculated masses
MISC - Other, ill-defined masses
ARCH - Architectural distortion
ASYM - Asymmetry
NORM - Normal
```
- 4th col: Severity of abnormality
```
B - Benign
M - Malignant
N - Normal
```
- 5th, 6th col: (x, y) image coordinates of center of abnormality
- 7th col: Approximate radius (in pixels) of a circle enclosing the abnormality
          

          

In [5]:
df = pd.read_table('Info.txt', delimiter=' ')
df.SEVERITY = df.SEVERITY.fillna('N')
df = df[df.columns[:-1]]
df.head()

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS
0,mdb001,G,CIRC,B,535.0,425.0,197.0
1,mdb002,G,CIRC,B,522.0,280.0,69.0
2,mdb003,D,NORM,N,,,
3,mdb004,D,NORM,N,,,
4,mdb005,F,CIRC,B,477.0,133.0,30.0


In [6]:
# visualizing different classifications
df_grouped = df.groupby(['CLASS','SEVERITY'])[['REFNUM']].count()
df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,REFNUM
CLASS,SEVERITY,Unnamed: 2_level_1
ARCH,B,9
ARCH,M,10
ASYM,B,6
ASYM,M,9
CALC,B,15
CALC,M,15
CIRC,B,21
CIRC,M,4
MISC,B,7
MISC,M,8


*LabelEncoder* can be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels. 

In [7]:
le = LabelEncoder()
for col in ['BG', 'CLASS', 'SEVERITY']: 
  df[col] = le.fit_transform(df[col])
df['RADIUS'] = df['RADIUS'].fillna(-0)
df['X'] = df['X'].fillna(-1)
df['Y'] = df['Y'].fillna(-1)
df.head()

Unnamed: 0,REFNUM,BG,CLASS,SEVERITY,X,Y,RADIUS
0,mdb001,2,3,0,535.0,425.0,197.0
1,mdb002,2,3,0,522.0,280.0,69.0
2,mdb003,0,5,2,-1.0,-1.0,0.0
3,mdb004,0,5,2,-1.0,-1.0,0.0
4,mdb005,1,3,0,477.0,133.0,30.0


In [8]:
# Extracting Features 
X = df.drop(columns=['REFNUM','SEVERITY'])
X.head()

Unnamed: 0,BG,CLASS,X,Y,RADIUS
0,2,3,535.0,425.0,197.0
1,2,3,522.0,280.0,69.0
2,0,5,-1.0,-1.0,0.0
3,0,5,-1.0,-1.0,0.0
4,1,3,477.0,133.0,30.0


In [9]:
# Creating target values
y = df['SEVERITY'].values
print(y[0:5])

[0 0 2 2 0]


In [10]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'B': 0, 'M': 1, 'N': 2}


In [11]:
#split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

K Nearest Neighbors (referenced https://towardsdatascience.com/building-a-k-nearest-neighbors-k-nn-model-with-scikit-learn-51209555453a)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [13]:
#show first 5 model predictions on the test data
knn.predict(X_test)[0:5]

array([2, 2, 0, 0, 2])

In [15]:
#check accuracy of our model on the test data
print('k-NN accuracy: {}'.format(knn.score(X_test, y_test)))

k-NN accuracy: 0.8939393939393939


Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 0)
lr.fit(X_train, y_train)
lr.predict(X_test)[0:5]
print('Logistic regression accuracy: {}'.format(lr.score(X_test,y_test)))

Logistic regression accuracy: 0.8484848484848485


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Support Vector Machine

In [20]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', random_state=0)
svc.fit(X_train, y_train)
svc.predict(X_test)[0:5]
print('SVM accuracy: {}'.format(svc.score(X_test,y_test)))

SVM accuracy: 0.803030303030303


Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state =0)
rf.fit(X_train, y_train)
rf.predict(X_test)[0:5]
print('Random forest accuracy: {}'.format(rf.score(X_test,y_test)))

Random forest accuracy: 0.803030303030303
