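This notebook checks images by label: it builds a balanced sample of images with and without one chosen label (`LABEL_ID`) and sets up a small CNN to tell the two groups apart.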

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import itertools
from sklearn.utils import shuffle
import os
import json
from keras import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
from keras.layers import Conv2D, MaxPool2D, Dropout, Dense, Flatten
%matplotlib inline 
# Print floating point numbers in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
def read_dataset(filename):
    """Load a pickled object from disk with pandas.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever ``pandas.read_pickle`` deserialises from ``filename``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    loaded = pd.read_pickle(filename)
    return loaded

In [None]:
# Directory (relative to this notebook) that holds the pickled dataset files.
_PICKLE_DIR = '../data/pickles'

# Locations of the pickled dataset files.
TRAIN_PICKLE = _PICKLE_DIR + '/train.pickle'
VALIDATION_PICKLE = _PICKLE_DIR + '/validation.pickle'
TEST_PICKLE = _PICKLE_DIR + '/test.pickle'
TRAIN_LABEL_PICKLE = _PICKLE_DIR + '/train_label.pickle'
VALIDATION_LABEL_PICKLE = _PICKLE_DIR + '/validation_label.pickle'

# Image folder; an absolute, machine-specific Windows path.
PATH_TO_IMAGES = 'G:\\Data\\data\\train\\'

# labelId value that the cells below compare against to separate the
# matching rows from all the others.
LABEL_ID = 50

In [None]:
# Read the pickle at VALIDATION_LABEL_PICKLE and wrap it in a DataFrame whose
# columns are all cast to 32-bit integers.
dataset = pd.DataFrame(read_dataset(VALIDATION_LABEL_PICKLE), dtype='int32')

In [None]:
# Quick summary of the label table: distinct labels, largest label id and
# distinct images.
number_of_labels = dataset['labelId'].nunique()  # Number of distinct labels
# Use the vectorised Series.max() rather than the builtin max(), which walks
# the column element by element in Python. int() yields a plain Python integer.
maximum_label_id = int(dataset['labelId'].max())  # The maximum labelId value
number_of_images = dataset['imageId'].nunique()  # Number of distinct images
print('Number of distinct labels in the dataset : ', number_of_labels)
print('Maximum id if labels in the dataset : ', maximum_label_id)
print('Number of distinct images in the dataset : ', number_of_images)

In [None]:
def undersample(unlabelled_data, labelled_data, times, random_state=None):
    """Draw a random subset of ``unlabelled_data`` sized relative to ``labelled_data``.

    Parameters
    ----------
    unlabelled_data : pandas.DataFrame
        Rows to sample from.
    labelled_data : sized collection
        Reference rows; only ``len(labelled_data)`` is used.
    times : int
        Requested ratio: ``times * len(labelled_data)`` rows are drawn.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value for a
        reproducible draw; the default ``None`` leaves the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``unlabelled_data``, drawn without replacement.
        If fewer rows are available than requested, all of them are returned
        (the printed counts show the actual size).
    """
    count_labelled = len(labelled_data)
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request instead of letting DataFrame.sample raise a ValueError.
    sample_size = min(times * count_labelled, len(unlabelled_data))
    healthy_sample = unlabelled_data.sample(
        sample_size, replace=False, random_state=random_state
    )
    total_count = len(healthy_sample) + count_labelled
    print("Number of unlabelled rows :",len(healthy_sample))
    print("Number of labelled rows :",count_labelled)
    print("total number of record in resampled data is:",total_count)
    return healthy_sample

In [None]:
# Undersampling: keep every row whose labelId is LABEL_ID and only a random
# portion of the remaining rows.
labelled_data = dataset[dataset['labelId'] == LABEL_ID]
# Candidates for the other class: rows with a different labelId whose image
# does not carry LABEL_ID at all, so an image tagged with LABEL_ID cannot also
# be drawn through one of its other labels.
unlabelled_data = dataset[(dataset['labelId'] != LABEL_ID)
                          & (~dataset['imageId'].isin(labelled_data['imageId']))]
# reset_index() returns a new frame rather than modifying in place, so a bare
# call has no effect; none is made here and both frames keep the row index
# they have in `dataset`.
# NOTE(review): unlabelled_data may hold several rows per imageId (one per
# label), so the sample can contain the same image more than once - confirm
# whether it should be de-duplicated on imageId first.
undersampled_data = undersample(unlabelled_data, labelled_data, 1)
train_images = pd.concat([labelled_data, undersampled_data])

In [None]:
# Flag the rows whose labelId equals LABEL_ID, then keep only the image id
# and that boolean flag.
train_images = (
    train_images
    .assign(isLabel=train_images['labelId'] == LABEL_ID)
    [['imageId', 'isLabel']]
)

In [None]:
# sklearn.utils.shuffle returns a shuffled copy and leaves its argument
# untouched, so the result must be assigned back for the shuffle to take effect.
train_images = shuffle(train_images)
# Last expression of the cell, so the notebook displays the first rows of the
# shuffled frame.
train_images.head()

In [None]:
def f1_score(y_true, y_pred):
    """F1 score computed with Keras backend ops, usable as a ``metrics`` entry.

    Targets and predictions are clipped to [0, 1] and rounded to 0/1 before
    counting, so predictions are effectively thresholded at 0.5.

    Parameters
    ----------
    y_true : tensor
        Ground-truth values.
    y_pred : tensor
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar F1 score. It evaluates to 0 when there are no true positives,
        including when there are no positive targets or no positive
        predictions at all.
    """
    # Counts of true positives, predicted positives and actual positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # A Python `if` cannot reliably branch on a backend tensor, so empty
    # denominators are handled by adding K.epsilon(): the ratios become 0
    # instead of NaN whenever a count is zero.
    # How many selected items are relevant?
    precision = true_positives / (predicted_positives + K.epsilon())
    # How many relevant items are selected?
    recall = true_positives / (actual_positives + K.epsilon())
    # Harmonic mean of precision and recall.
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [None]:
def get_model(input_shape):
    """Build the (uncompiled) CNN used as a binary classifier.

    Architecture: two 5x5 conv layers (32 filters) + max-pool + dropout,
    two 3x3 conv layers (64 filters) + max-pool + dropout, then a 256-unit
    dense layer with dropout and a single sigmoid output unit. The model
    summary is printed as a side effect.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first ``Conv2D`` layer.

    Returns
    -------
    keras.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    model = Sequential()

    model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='same', activation='relu', input_shape=input_shape))
    model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='same', activation='relu'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(units=256, activation='relu'))
    model.add(Dropout(0.5))
    # Single-unit output must use sigmoid: softmax normalises across the
    # units of the layer, so with one unit it would always output 1.0 and
    # the network could not learn anything.
    model.add(Dense(units=1, activation='sigmoid'))

    model.summary()

    return model

In [None]:
# Build the network for 28x28 single-channel inputs and compile it with the
# Adam optimizer, binary cross-entropy loss and the custom F1 metric.
model = get_model(input_shape=(28, 28, 1))
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[f1_score],
)

In [None]:
# Augmentation settings for the training generator: pixel values are scaled
# by 1/255, with shear and zoom ranges of 0.2 and horizontal flipping enabled.
train_augmentation = {
    'rescale': 1.0 / 255,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
}
train_datagen = ImageDataGenerator(**train_augmentation)