In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [41]:
import os
import numpy as np
from tqdm import tqdm, tqdm_notebook

import cv2
from skimage import io
from skimage.color import rgb2gray
from skimage import exposure as ep

from skimage import feature as ft
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
train_dir = 'data/train/'
validation_dir = 'data/validation/'
test_dir = 'data/test/'


output_classes = 2
batch_size = 32 
img_height, img_width = 256, 256

# nb_train_samples = 1188
# nb_validation_samples = 144
# nb_test_samples = 144

In [4]:
def get_data(folder):
    """
    Load the data and labels from the given folder.
    """
    X = []
    y = []
    for folderName in os.listdir(folder):
        if not folderName.startswith('.'):
            if folderName in ['normal']:
                label = 0
            elif folderName in ['malignant']:
                label = 1
#             else:
#                 label = 3
            for image_filename in tqdm_notebook(os.listdir(folder + folderName)):
                img_file = cv2.imread(folder + folderName + '/' + image_filename)
                if img_file is not None:
#                     img_file = skimage.transform.resize(img_file, (imageSize, imageSize, 3))
                    img_arr = np.asarray(img_file)
                    X.append(img_arr)
                    y.append(label)
    X = np.asarray(X)
    y = np.asarray(y)
    return X,y

In [5]:
X_train, y_train = get_data(train_dir)

HBox(children=(IntProgress(value=0, max=40424), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18744), HTML(value='')))




In [6]:
X_validation, y_validation = get_data(validation_dir)

HBox(children=(IntProgress(value=0, max=12512), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5872), HTML(value='')))




In [7]:
X_test, y_test = get_data(test_dir)

HBox(children=(IntProgress(value=0, max=10480), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4992), HTML(value='')))




In [8]:
from skimage import feature
import numpy as np

class LocalBinaryPatterns:
    def __init__(self, numPoints, radius):
        # store the number of points and radius
        self.numPoints = numPoints
        self.radius = radius

    def describe(self, image, eps=1e-7):
        # compute the Local Binary Pattern representation
        # of the image, and then use the LBP representation
        # to build the histogram of patterns
        lbp = feature.local_binary_pattern(image, self.numPoints,
            self.radius, method="uniform")
        (hist, _) = np.histogram(lbp.ravel(),
            bins=np.arange(0, self.numPoints + 3),
            range=(0, self.numPoints + 2))

        # normalize the histogram
        hist = hist.astype("float")
        hist /= (hist.sum() + eps)

        # return the histogram of Local Binary Patterns
        return hist

# settings for LBP
radius = 3
n_points = 8 * radius
desc = LocalBinaryPatterns(24, 8)

In [9]:
trainSetList = []
validationSetList = []
testSetList = []

In [10]:
for imgs in X_train:
    gray = cv2.cvtColor(imgs, cv2.COLOR_BGR2GRAY)
    hist = desc.describe(gray)
    trainSetList.append(hist.ravel())
    pass

In [11]:
for imgs in X_validation:
    gray = cv2.cvtColor(imgs, cv2.COLOR_BGR2GRAY)
    hist = desc.describe(gray)
    validationSetList.append(hist.ravel())
    pass

In [12]:
for imgs in X_test:
    gray = cv2.cvtColor(imgs, cv2.COLOR_BGR2GRAY)
    hist = desc.describe(gray)
    testSetList.append(hist.ravel())
    pass

trainSet = np.array(trainSetList, np.float32)
validationSet = np.array(validationSetList, np.float32)
testSet = np.array(testSetList, np.float32)

In [None]:
# from keras.utils.np_utils import to_categorical
# y_trainHot = to_categorical(y_train, num_classes = 2)
# y_valHot = to_categorical(y_validation, num_classes = 2)
# y_testHot = to_categorical(y_test, num_classes = 2)

In [19]:
trainSet.shape

(59168, 26)

In [20]:
validationSet.shape

(18384, 26)

In [21]:
testSet.shape

(15472, 26)

In [None]:
y_train.shape

In [None]:
# y_trainHot.shape

In [None]:
y_test.shape

In [None]:
y_test

In [22]:
import sklearn.svm as svm
import sklearn.neighbors as knn

KNN_lbp = knn.KNeighborsClassifier(n_neighbors=1)
KNN_lbp.fit(trainSet, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [23]:
predictions_lbp_knn = KNN_lbp.predict(testSet)

In [24]:
predictions_lbp_knn

array([1, 1, 1, ..., 0, 1, 1])

In [25]:
print("KNN.score:",KNN_lbp.score(testSet, y_test))

KNN.score: 0.7810237849017581


In [26]:
ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=500)
ada_clf.fit(trainSet, y_train)
print("ada_clf.score:",ada_clf.score(testSet, y_test))

ada_clf.score: 0.8300155118924509


In [27]:
gb_clf = GradientBoostingClassifier(max_depth=3, n_estimators=500)
gb_clf.fit(trainSet, y_train)
print("gb_clf.score:",gb_clf.score(testSet, y_test))

gb_clf.score: 0.8430067218200621


___

In [30]:
model=svm.SVC(kernel='rbf',C=1,gamma=0.1)
model.fit(trainSet, y_train)
prediction1=model.predict(testSet)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,y_test))

Accuracy for rbf SVM is  0.8097854188210962


In [31]:
# Linear Support Vector Machine(linear-SVM)
model=svm.SVC(kernel='linear',C=0.1,gamma=0.1)
model.fit(trainSet, y_train)
prediction2=model.predict(testSet)
print('Accuracy for linear SVM is',metrics.accuracy_score(prediction2,y_test))

Accuracy for linear SVM is 0.8094622543950362


In [32]:
# Logistic Regression
model = LogisticRegression()
model.fit(trainSet, y_train)
prediction3=model.predict(testSet)
print('The accuracy of the Logistic Regression is',metrics.accuracy_score(prediction3,y_test))

The accuracy of the Logistic Regression is 0.8151499482936918


In [33]:
# Decision Tree
model=DecisionTreeClassifier()
model.fit(trainSet, y_train)
prediction4=model.predict(testSet)
print('The accuracy of the Decision Tree is',metrics.accuracy_score(prediction4,y_test))

The accuracy of the Decision Tree is 0.7774689762150983


In [34]:
# kNN
model=KNeighborsClassifier(n_neighbors=8) 
model.fit(trainSet, y_train)
prediction5=model.predict(testSet)
print('The accuracy of the KNN is',metrics.accuracy_score(prediction5,y_test))

The accuracy of the KNN is 0.8189632885211996


In [36]:
# Gaussian Naive Bayes
model=GaussianNB()
model.fit(trainSet, y_train)
prediction6=model.predict(testSet)
print('The accuracy of the NaiveBayes is',metrics.accuracy_score(prediction6,y_test))

The accuracy of the NaiveBayes is 0.7117373319544984


In [39]:
# Random Forest
model=RandomForestClassifier(n_estimators=100)
model.fit(trainSet, y_train)
prediction7=model.predict(testSet)
print('The accuracy of the Random Forests is',metrics.accuracy_score(prediction7,y_test))

The accuracy of the Random Forests is 0.8453981385729059


In [44]:
# Voting Classifier
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=10)),
                                              ('RBF',svm.SVC(probability=True,kernel='rbf',C=0.5,gamma=0.1)),
                                              ('RFor',RandomForestClassifier(n_estimators=500,random_state=0)),
                                              ('LR',LogisticRegression(C=0.05)),
                                              ('DT',DecisionTreeClassifier(random_state=0)),
                                              ('NB',GaussianNB()),
                                              ('svm',svm.SVC(kernel='linear',probability=True))
                                             ], 
                       voting='soft').fit(trainSet, y_train)
print('The accuracy for ensembled model is:',ensemble_lin_rbf.score(testSet,y_test))
cross=cross_val_score(ensemble_lin_rbf, testSet,y_test, cv = 10,scoring = "accuracy")
print('The cross validated score is',cross.mean())

The accuracy for ensembled model is: 0.8341520165460186
The cross validated score is 0.8916740578163287


___

In [48]:
# Cross Validation
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts
xyz=[]
accuracy=[]
std=[]
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest', 'Gradient Boosting', 'Adaboost', 'XGBoost']
models=[svm.SVC(kernel='linear'),
        svm.SVC(kernel='rbf'),
        LogisticRegression(),
        KNeighborsClassifier(n_neighbors=9),
        DecisionTreeClassifier(),
        GaussianNB(),
        RandomForestClassifier(n_estimators=100),
        GradientBoostingClassifier(n_estimators=500,random_state=0,learning_rate=0.1),
        AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1),
        xgb.XGBClassifier(n_estimators=900,learning_rate=0.1)]
for i in models:
    model = i
    cv_result = cross_val_score(model, testSet,y_test, cv = kfold ,scoring = "accuracy")
    cv_result=cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std},index=classifiers)       
print(new_models_dataframe2)

                      CV Mean       Std
Linear Svm           0.755024  0.189672
Radial Svm           0.676018  0.447488
Logistic Regression  0.742869  0.234000
KNN                  0.861029  0.106609
Decision Tree        0.889408  0.052873
Naive Bayes          0.711484  0.045830
Random Forest        0.943247  0.062141
Gradient Boosting    0.903753  0.088194
Adaboost             0.823665  0.171799
XGBoost              0.928316  0.063509
