# Importing Libraries

In [1]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt

import cv2
import pandas as pd
import numpy as np
import mahotas

from PIL import Image,ImageOps

from skimage.feature import hog
from skimage.color import rgb2grey

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.metrics import roc_curve, auc

# Image and Data Preprocessing and Feature Extraction

## Feature Extraction Functions

In [2]:
def fd_hu_moments(image):
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

def fd_haralick(image): 
    # compute the haralick texture feature vector
    haralick = mahotas.features.haralick(image).mean(axis=0)
    return haralick

def create_features(img):
    # flatten three channel color image
    #color_features = img.flatten()
    # convert image to greyscale
    grey_image = rgb2grey(img)
    # get HOG features from greyscale image
    hog_features,hog_imgs = hog(grey_image, orientations=9, pixels_per_cell=(4, 4),cells_per_block=(2, 2),visualize=True)  #got total 8100 features,after checking I wrote this.
    # combine all features into a single array
    hu_features = fd_hu_moments(img)  #Total 7 features
    haralick_features = fd_haralick(img)  #Total 13 features
    #print('no. of features => hog=',len(hog_features),', hu=',len(hu_features),', haralick=',len(haralick_features))
    flat_features = np.hstack([hog_features,hu_features,haralick_features])
    return flat_features

In [3]:
def getImage(path):
    img = Image.open(path)
    img = ImageOps.fit(img,(64,64),Image.ANTIALIAS)
    img = np.array(img)
    return img

def getDF(path): #get DataFrame
    features_list=[]
    for file in os.listdir(path):
        path1 = os.path.join(path+file)
        img = getImage(path1)
        fr = create_features(img)
        features_list.append(fr)
    feature_matrix = np.array(features_list)
    return feature_matrix

### Preparing Training Set

In [4]:
train1 = getDF('.\\train\\NORMAL\\')

In [10]:
train1

array([[ 0.13829613,  0.        ,  0.        , ...,  5.17048539,
        -0.38483317,  0.99837777],
       [ 0.2520088 ,  0.30773162,  0.30773162, ...,  5.35489435,
        -0.36338669,  0.99783129],
       [ 0.3244928 ,  0.03073737,  0.        , ...,  5.09888993,
        -0.35587406,  0.99720542],
       ...,
       [ 0.39171518,  0.19374719,  0.        , ...,  4.69333821,
        -0.34790142,  0.99617359],
       [ 0.12677086,  0.        ,  0.13111985, ...,  4.82680968,
        -0.35097743,  0.99651501],
       [ 0.33712332,  0.        ,  0.        , ...,  4.87480457,
        -0.38654834,  0.99814414]])

In [8]:
train1.shape

(1341, 8120)

In [11]:
pca = PCA(n_components=100)
train1 = pca.fit_transform(train1)

In [12]:
train1.shape

(1341, 100)

In [13]:
ss = StandardScaler()
train1 = ss.fit_transform(train1)

In [14]:
label1 = np.array(['normal']*1341).reshape(-1,1) #1341 is no. of rows in train1
train1 = np.concatenate((train1,label1),axis=1)

In [15]:
train1

array([['0.6864043909191616', '-1.1578634753077777',
        '-0.14978184392678884', ..., '-0.266531406013027',
        '-0.09254903651618077', 'normal'],
       ['0.01234082264659507', '0.8260274065765886',
        '1.4452979015586724', ..., '0.5513439925735877',
        '-0.4151086687538014', 'normal'],
       ['-0.38717981596379225', '-0.48466938275323734',
        '-0.5558592264756236', ..., '-0.543353147694312',
        '-1.3317208106625964', 'normal'],
       ...,
       ['-1.0734563067236438', '-1.7159640591639018',
        '1.7986716440447694', ..., '2.1460549701265577',
        '-0.556784473973163', 'normal'],
       ['-1.01467170397635', '-1.0817555352470756',
        '-0.5945200891874015', ..., '0.2768146149408373',
        '0.7258556862560157', 'normal'],
       ['0.20280017928417038', '-1.9517575114412793',
        '2.605164445736288', ..., '1.0906475586952997',
        '-0.7781712865596657', 'normal']], dtype='<U32')

In [16]:
# Doing same operations to get train2(Pneumonia part)
train2 = getDF('.\\train\\PNEUMONIA\\')
pca = PCA(n_components=100)
train2 = pca.fit_transform(train2)
ss = StandardScaler()
train2 = ss.fit_transform(train2)

In [17]:
train2.shape

(3592, 100)

In [18]:
label2 = np.array(['pneumonia']*3592).reshape(-1,1) #3592 is no. of rows in train2
train2 = np.concatenate((train2,label2),axis=1)

In [19]:
train2.shape

(3592, 101)

In [20]:
train_set = np.concatenate((train1,train2),axis=0)

In [21]:
train_set

array([['0.6864043909191616', '-1.1578634753077777',
        '-0.14978184392678884', ..., '-0.266531406013027',
        '-0.09254903651618077', 'normal'],
       ['0.01234082264659507', '0.8260274065765886',
        '1.4452979015586724', ..., '0.5513439925735877',
        '-0.4151086687538014', 'normal'],
       ['-0.38717981596379225', '-0.48466938275323734',
        '-0.5558592264756236', ..., '-0.543353147694312',
        '-1.3317208106625964', 'normal'],
       ...,
       ['0.26367279199178006', '-1.0559742194290351',
        '-0.18049301706116777', ..., '-1.380185542194111',
        '-0.8371160865995119', 'pneumonia'],
       ['-0.9978129912932912', '0.1965075016450941',
        '-0.8198437032138226', ..., '-0.03522930142488456',
        '0.00882506744500263', 'pneumonia'],
       ['-0.5403224282990343', '-0.6383764105905336',
        '0.4344641212266098', ..., '-0.3596217694403383',
        '-0.12446372531956615', 'pneumonia']], dtype='<U32')

In [22]:
train_set.shape

(4933, 101)

In [23]:
np.random.shuffle(train_set)

In [22]:
train_set

array([['1.2708176814340306', '-1.5906594389338347',
        '1.4577351540280412', ..., '-0.2302765842761266',
        '-0.5913948086706202', 'pneumonia'],
       ['-0.15949459150556328', '0.09522725426458803',
        '-0.7347119625501648', ..., '-0.7486410055835269',
        '0.4263836016780033', 'pneumonia'],
       ['0.6923839973054228', '0.49389224325601266', '2.166188833952168',
        ..., '-2.5904956700061827', '-0.44881524487635954', 'pneumonia'],
       ...,
       ['1.9096313175423167', '0.09336692529687783',
        '0.7537674156873273', ..., '-0.6585973080721648',
        '0.5887410140472499', 'pneumonia'],
       ['-1.9961488242217835', '0.20529015420204283',
        '-0.8913185825217947', ..., '-0.3301031560862101',
        '1.8805190482650114', 'pneumonia'],
       ['0.13956210027994295', '-0.9085046002449325',
        '-0.07117935667350593', ..., '1.6133258891914415',
        '1.7342866553798013', 'pneumonia']], dtype='<U32')

## Preparing test set

In [24]:
test1 = getDF('.\\test\\NORMAL\\')
pca = PCA(n_components=100)
test1 = pca.fit_transform(test1)
ss = StandardScaler()
test1 = ss.fit_transform(test1)

In [25]:
test1.shape

(234, 100)

In [26]:
label1 = np.array(['normal']*234).reshape(-1,1) #234 is no. of rows in test1
test1 = np.concatenate((test1,label1),axis=1)

In [27]:
test1

array([['0.2435974505168652', '-0.20022577281389617',
        '-0.0714450980205829', ..., '0.2715431284374765',
        '1.4142648225207914', 'normal'],
       ['-0.3189901459820519', '0.17915871217475202',
        '-1.3390605474782116', ..., '-0.11387714473892618',
        '-0.10840600806874298', 'normal'],
       ['0.548359998105811', '-0.7729182156077122',
        '-0.1565039326723877', ..., '-0.9888292880158444',
        '0.6602845993509231', 'normal'],
       ...,
       ['-0.5811702389675427', '0.6214823875948488',
        '1.8871507910294323', ..., '0.9571549171394201',
        '-1.0552814941348532', 'normal'],
       ['0.03400204407887488', '-0.48859107325734275',
        '-0.25059911934538437', ..., '0.9115307648919669',
        '0.616068537271108', 'normal'],
       ['-0.17690917421935723', '-0.9404727800001073',
        '-0.022728557846579622', ..., '0.48535151492965567',
        '-0.8314869392686487', 'normal']], dtype='<U32')

In [28]:
test2 = getDF('.\\test\\PNEUMONIA\\')
pca = PCA(n_components=100)
test2 = pca.fit_transform(test2)
ss = StandardScaler()
test2 = ss.fit_transform(test2)

In [29]:
test2.shape

(390, 100)

In [30]:
label2 = np.array(['pneumonia']*390).reshape(-1,1) #390 is no. of rows in test2
test2 = np.concatenate((test2,label2),axis=1)

In [31]:
test2.shape

(390, 101)

In [32]:
test_set = np.concatenate((test1,test2),axis=0)

In [33]:
test_set.shape

(624, 101)

In [34]:
test_set

array([['0.2435974505168652', '-0.20022577281389617',
        '-0.0714450980205829', ..., '0.2715431284374765',
        '1.4142648225207914', 'normal'],
       ['-0.3189901459820519', '0.17915871217475202',
        '-1.3390605474782116', ..., '-0.11387714473892618',
        '-0.10840600806874298', 'normal'],
       ['0.548359998105811', '-0.7729182156077122',
        '-0.1565039326723877', ..., '-0.9888292880158444',
        '0.6602845993509231', 'normal'],
       ...,
       ['-0.36355557355225915', '0.7670313762901496',
        '-1.243197654458368', ..., '-1.696556189162131',
        '1.072775091836771', 'pneumonia'],
       ['0.36613780798917467', '0.28965492998255554',
        '-0.09134331678693175', ..., '-0.46568585432133086',
        '0.8260306197907766', 'pneumonia'],
       ['1.5024863252450076', '0.5041729270324699',
        '-0.6276581888932901', ..., '0.5750047743286817',
        '-0.3523680732404671', 'pneumonia']], dtype='<U32')

In [35]:
np.random.shuffle(test_set)  #so as to mix data

In [36]:
test_set

array([['-0.472328445619833', '1.0856749104255496',
        '-0.7498223394349696', ..., '-0.18517366587103312',
        '-0.04209246187223929', 'normal'],
       ['-1.6112131578610644', '0.13864809854227939',
        '-0.30914738059180874', ..., '1.0343751798653746',
        '0.07364042899342974', 'normal'],
       ['-1.3952673776523947', '0.49855169717188313',
        '-0.49942465842019046', ..., '0.9189371578179882',
        '-0.8814727071071546', 'pneumonia'],
       ...,
       ['0.5238222497771975', '-1.5117627302426342',
        '0.4337867584050859', ..., '0.6660620677972664',
        '1.675814387586313', 'pneumonia'],
       ['-0.833359176399394', '0.5555129678149998',
        '-1.4567653884719358', ..., '-0.11672203664793739',
        '0.7853140579003111', 'pneumonia'],
       ['1.3380108104140156', '-0.57214533414723', '-1.0827398370118677',
        ..., '0.1924176694792265', '-0.5225341168863505', 'pneumonia']],
      dtype='<U32')

### Now Our train_set and test_set is ready 

### Creating DataFrames from Numpy Arrays and then saving to CSV files

In [37]:
train = pd.DataFrame(train_set)
test = pd.DataFrame(test_set)

In [3]:
train.to_csv('train_set_new(1).csv')
test.to_csv('test_set_new(1).csv')

NameError: name 'train' is not defined

In [2]:
train = pd.read_csv('train_set_new(1).csv')
test = pd.read_csv('test_set_new(1).csv')

In [3]:
train = train.iloc[:,1:]

In [4]:
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.157267,-0.575144,0.126688,0.676862,0.126461,-1.919849,1.732439,0.616582,0.329325,-1.149372,...,-0.993324,-1.630420,0.023476,-1.539898,0.887128,-0.539008,0.977079,0.737772,-0.481511,pneumonia
1,-0.835722,-1.731349,-0.283213,-0.326797,2.225978,0.519543,-0.059671,-0.951832,1.051133,-1.542391,...,1.428640,0.214924,0.236727,0.735657,0.793324,-0.947705,-0.043214,-0.060891,0.452389,normal
2,-0.310200,0.785516,-1.289500,-1.089452,-0.128506,-0.194918,1.717981,-0.748433,-1.279298,1.702871,...,-1.034225,-0.208928,0.466460,-0.733281,0.293464,0.015909,0.129915,0.608230,1.003302,normal
3,0.386720,-0.559531,-0.528551,0.999788,2.079317,-0.937689,-0.492586,-0.477585,0.900361,1.667053,...,-0.006583,-0.399218,-1.202125,2.080307,-0.423147,1.647414,1.149927,-2.428944,1.754039,pneumonia
4,-1.012918,0.239050,-1.014667,0.743340,-0.616652,0.096820,0.443685,2.675547,0.525062,0.833926,...,1.767198,-0.783279,0.407633,-0.463602,0.609334,1.119718,0.662544,-1.078070,0.393151,pneumonia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4928,-0.622092,-0.329003,3.225326,-0.141524,0.659889,-1.817977,-0.276274,0.786053,-0.207272,-0.526336,...,0.471102,-0.687190,0.304304,-1.085896,-0.562243,1.764842,-0.489863,1.110622,0.586836,pneumonia
4929,-1.051548,1.173304,-0.002965,-0.317289,-0.317248,1.985175,1.022249,0.492459,1.223128,-0.629469,...,-0.730875,0.551183,-0.334284,0.269307,-0.335257,-0.682103,-0.890917,-0.342231,-0.572316,normal
4930,-0.210589,-0.680159,2.024196,-0.037799,0.135143,-0.951916,-0.602355,2.658844,1.094505,0.115664,...,-0.950461,-0.946428,-1.597650,0.554678,0.132048,-2.051183,-0.580766,1.365550,-0.149296,pneumonia
4931,2.300920,1.696866,-0.036065,-1.154284,0.639921,0.856795,-1.310928,-0.703027,0.269764,0.554876,...,-0.549636,-2.401481,0.463962,-1.656400,0.208008,-0.944048,0.888724,-0.192781,0.307239,pneumonia


In [5]:
test = test.iloc[:,1:]

In [6]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-0.472328,1.085675,-0.749822,-0.285467,-0.660540,1.155998,1.479402,2.153203,-0.736417,-0.575742,...,-0.628115,2.765040,0.388121,0.017820,0.564266,3.588024,1.254184,-0.185174,-0.042092,normal
1,-1.611213,0.138648,-0.309147,0.434265,1.272825,0.918080,-0.860235,0.470946,1.993507,0.197789,...,0.667431,-0.280414,0.718071,-0.732875,0.002123,-0.209649,-0.569940,1.034375,0.073640,normal
2,-1.395267,0.498552,-0.499425,1.321080,-0.046659,0.603096,-0.752792,1.658666,-0.347820,-1.254308,...,0.078020,-1.768159,0.383000,-0.561510,2.162461,2.458636,-1.734497,0.918937,-0.881473,pneumonia
3,-0.182810,-0.466491,0.333916,0.151990,-1.288084,0.667536,-0.614238,1.065337,0.093364,1.306338,...,-0.727984,-0.180255,-0.043039,0.169466,0.770236,-0.267623,-0.924222,0.719277,1.590103,normal
4,0.076269,-0.495397,-0.631994,-1.498722,0.931141,0.529889,-0.785312,1.255342,-1.256478,0.997527,...,1.373401,0.131782,0.888906,-2.067776,-2.914500,-0.991065,0.757489,-0.810601,0.720773,pneumonia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,0.036397,0.782307,-0.311328,2.275654,0.743151,-0.169907,0.641820,-0.393318,0.108730,1.328702,...,-1.012574,0.641899,0.388221,-0.804695,0.311737,-1.939007,-1.608960,2.280191,0.985555,pneumonia
620,-1.285837,-0.097869,-1.654277,2.415355,0.001143,0.723349,-2.388542,0.636638,0.491117,-0.975233,...,-1.673415,0.999384,0.462888,0.667374,-1.094062,0.864363,1.657094,1.269903,-1.383065,pneumonia
621,0.523822,-1.511763,0.433787,0.211104,-0.089689,-0.522175,1.711209,0.459471,-1.870358,-0.399379,...,0.513650,-0.053010,0.355692,-0.236023,-0.504184,-0.143812,-0.518827,0.666062,1.675814,pneumonia
622,-0.833359,0.555513,-1.456765,-0.736341,-0.226297,2.264432,-0.908161,0.425276,-0.458456,1.163734,...,-2.434992,-1.342190,0.923463,0.623303,-0.519073,0.206085,0.377749,-0.116722,0.785314,pneumonia


In [7]:
X_train = train.iloc[:, :-1].values
Y_train = train.iloc[:,-1].values
X_test = test.iloc[:, :-1].values
Y_test = test.iloc[:,-1].values

# Training a Kernel SVM model

In [134]:
classifier = SVC(C=2,kernel = 'rbf',verbose=True)
classifier.fit(X_train, Y_train)

[LibSVM]

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

## Making prediction on test set

In [135]:
Y_pred = classifier.predict(X_test)
Y_predTrain = classifier.predict(X_train)

## Evaluating Results

In [136]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_train, Y_predTrain)  #Training Accuaracy

0.9499290492600851

In [137]:
accuracy_score(Y_test, Y_pred)  #Testing Accuracy

0.625

In [138]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(Y_test, Y_pred))

Unnamed: 0,0,1
0,0,234
1,0,390


In [139]:
pd.DataFrame(confusion_matrix(Y_train, Y_predTrain))

Unnamed: 0,0,1
0,1094,247
1,0,3592


# Training a linear SVM model

In [140]:
classifier = SVC(C=2,kernel = 'linear')
classifier.fit(X_train, Y_train)

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Making prediction on test set

In [141]:
Y_pred = classifier.predict(X_test)
Y_predTrain = classifier.predict(X_train)

## Evaluating Results

In [142]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_train, Y_predTrain)  #Training Accuaracy

0.7281573079262113

In [143]:
accuracy_score(Y_test, Y_pred)  #Testing Accuracy

0.625

In [145]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(Y_test, Y_pred))

Unnamed: 0,0,1
0,0,234
1,0,390


In [146]:
pd.DataFrame(confusion_matrix(Y_train, Y_predTrain))

Unnamed: 0,0,1
0,0,1341
1,0,3592


# Training a KNN model

In [147]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Making prediction on test set

In [148]:
Y_pred = classifier.predict(X_test)
Y_predTrain = classifier.predict(X_train)

## Evaluating Results

In [149]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_train, Y_predTrain)  #Training Accuaracy

0.9172917088992499

In [150]:
accuracy_score(Y_test, Y_pred)  #Testing Accuracy

0.5801282051282052

In [151]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(Y_test, Y_pred))

Unnamed: 0,0,1
0,35,199
1,63,327


In [152]:
pd.DataFrame(confusion_matrix(Y_train, Y_predTrain))

Unnamed: 0,0,1
0,1006,335
1,73,3519


# Training a Random Forest Model

In [28]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 7, criterion = 'entropy',verbose=1)
classifier.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.5s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=7,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=1, warm_start=False)

## Making prediction on test set

In [29]:
Y_pred = classifier.predict(X_test)
Y_predTrain = classifier.predict(X_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished


## Evaluating Results

In [30]:
accuracy_score(Y_train, Y_predTrain)

0.9777011960267585

In [31]:
accuracy_score(Y_test, Y_pred)

0.6137820512820513

In [32]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(Y_test, Y_pred))

Unnamed: 0,0,1
0,30,204
1,37,353


In [33]:
pd.DataFrame(confusion_matrix(Y_train, Y_predTrain))

Unnamed: 0,0,1
0,1248,93
1,17,3575
