In [1]:
import json 
import os
import collections
import tensorflow as tf 
import re
import h5py
import argparse
import sys 
import numpy as np 
import pandas as pd
import pickle

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Preprocessing for Q/A module

In [2]:
# Reserved vocabulary entries. Their positions in this list are used as their
# integer ids when the word/index mappings are built
# (<NULL>=0, <START>=1, <END>=2, <UNK>=3).
BUFFER_TOKENS = [
    '<NULL>',
    '<START>',
    '<END>',
    '<UNK>',
]
# Placeholder; none of the visible cells read or reassign it.
FLAGS = None

In [3]:
def _parse_sentence(s):
    s = s.replace('.', '')
    s = s.replace(',', '')
    s = s.replace('"', '')
    s = s.replace("'", '')
    s = s.replace("?", '')
    s = s.lower()
    s = re.sub("\s\s+", " ", s)
    s = s.split(' ')
    return s

In [5]:
def _create_init_dic(filename):
    """Load a '|'-separated QA file and attach tokenised question/answer columns.

    Args:
        filename: Passed straight to ``pd.read_csv``. The file is read without
            a header row and only its first three fields are kept.

    Returns:
        A DataFrame with columns ``ImageNo``, ``Question``, ``Answer``,
        ``Q_parsed`` and ``A_parsed``; the two ``*_parsed`` columns hold the
        token lists returned by ``_parse_sentence``.
    """
    raw = pd.read_csv(filename, sep='|', header=None)
    # Take an explicit copy of the three columns so the column assignments
    # below write to an independent frame rather than to a selection of `raw`
    # (avoids pandas' SettingWithCopyWarning).
    df = raw[[0, 1, 2]].copy()
    df.columns = ['ImageNo', 'Question', 'Answer']
    df['Q_parsed'] = [_parse_sentence(s) for s in df['Question']]
    df['A_parsed'] = [_parse_sentence(s) for s in df['Answer']]
    return df
# Training question/answer pairs (file C4_Abnormality_train.txt).
dic_df = _create_init_dic(
    "C:/Users/b160544me/Desktop/siva/project/dataset/VQAMed2019Training/QAPairsByCategory/C4_Abnormality_train.txt"
)
# Validation question/answer pairs (file C4_Abnormality_val.txt); this
# notebook treats them as its test set.
dic_v_df = _create_init_dic(
    "C:/Users/b160544me/Desktop/siva/project/dataset/VQAMed2019Validation/QAPairsByCategory/C4_Abnormality_val.txt"
)

In [6]:
# Show (rows, columns) of the training frame, then of the validation frame.
print(dic_df.shape, dic_v_df.shape, sep='\n')

(3192, 5)
(500, 5)


In [7]:
# Flatten every training row into one token list: for each row, the answer
# tokens come first, followed by the question tokens.
# extend() appends in place; rebuilding the list with `+` on every row copies
# the whole accumulated list each time, which is quadratic in the token count.
list_of_all_words = []
for answer_tokens, question_tokens in zip(dic_df['A_parsed'], dic_df['Q_parsed']):
    list_of_all_words.extend(answer_tokens)
    list_of_all_words.extend(question_tokens)

In [8]:
# Token frequencies over list_of_all_words.
counter = collections.Counter()
counter.update(list_of_all_words)
# Number of distinct tokens seen.
TOTAL_VOCAB = len(counter)
# (word, count) pairs for every distinct token, most frequent first.
vocab = counter.most_common(TOTAL_VOCAB)

In [9]:
## create word_to_idx, and idx_to_word
# Take the words straight from `counter` instead of rewriting `vocab` from
# itself: the previous `vocab = [i[0] for i in vocab]` was only correct on the
# first run of this cell; a second run would reduce every word to its first
# character.
vocab = [word for word, _count in counter.most_common(TOTAL_VOCAB)]

# BUFFER_TOKENS occupy ids 0..len(BUFFER_TOKENS)-1; vocabulary words follow in
# descending frequency order.
all_tokens = BUFFER_TOKENS + vocab
word_to_idx = {token: idx for idx, token in enumerate(all_tokens)}
idx_to_word = {idx: token for idx, token in enumerate(all_tokens)}


In [10]:
# Persist both vocabulary mappings. The `with` blocks close each file handle
# (and flush it to disk) as soon as the dump finishes; the bare open() calls
# used before never closed their handles.
with open('C:/Users/b160544me/Desktop/siva/project/word_to_idx.pkl', 'wb') as word_to_idx_file:
    pickle.dump(word_to_idx, word_to_idx_file)
with open('C:/Users/b160544me/Desktop/siva/project/idx_to_word.pkl', 'wb') as idx_to_word_file:
    pickle.dump(idx_to_word, idx_to_word_file)

In [11]:
PADDING_LEN=22
def _convert_sentence_to_numbers(s):
    """Encode a tokenised sentence as a list of vocabulary ids.

    The result is ``<START>``, one id per word of ``s`` (``<UNK>`` for words
    missing from ``word_to_idx``), ``<END>``, then ``<NULL>`` padding up to a
    total length of ``PADDING_LEN - 1``.

    Sentences with more than ``PADDING_LEN - 3`` words are neither padded nor
    truncated, so their encoding is longer than ``PADDING_LEN - 1``.

    Args:
        s: Iterable of word strings.

    Returns:
        A list of integer ids.
    """
    unk_id = BUFFER_TOKENS.index('<UNK>')
    null_id = BUFFER_TOKENS.index('<NULL>')
    end_id = BUFFER_TOKENS.index('<END>')
    start_id = BUFFER_TOKENS.index('<START>')

    encoded = [start_id]
    encoded.extend(word_to_idx.get(word, unk_id) for word in s)
    encoded.append(end_id)

    # A non-positive pad count multiplies to an empty list, i.e. no padding.
    pad_count = PADDING_LEN - 1 - len(encoded)
    encoded.extend([null_id] * pad_count)
    return encoded

In [12]:
# Encode every validation answer, report how many there are, then print the
# position of each encoding whose length differs from PADDING_LEN - 1.
df_v_final = dic_v_df
all_answers = list(map(_convert_sentence_to_numbers, df_v_final['A_parsed']))
print(len(all_answers))
for row_pos, encoded_answer in enumerate(all_answers):
    if len(encoded_answer) != PADDING_LEN - 1:
        print(row_pos)

500


In [13]:
# Encode the training answers and questions, keeping only rows whose encodings
# have exactly PADDING_LEN - 1 ids.
df_final = dic_df

# Answers: valid_rows1 holds row positions within dic_df.
all_answers = [_convert_sentence_to_numbers(s) for s in df_final['A_parsed']]  # list of numbers
valid_rows1 = [i for i, encoded in enumerate(all_answers) if len(encoded) == PADDING_LEN - 1]
# .copy() makes the filtered frame independent, so the column assignment below
# does not write to an .iloc selection (pandas' SettingWithCopyWarning).
df_final = df_final.iloc[valid_rows1, :].copy()
df_final['A_Encoded'] = [all_answers[i] for i in valid_rows1]

# Questions: valid_rows2 holds row positions within the answer-filtered frame,
# not within dic_df.
all_questions = [_convert_sentence_to_numbers(s) for s in df_final['Q_parsed']]
valid_rows2 = [i for i, encoded in enumerate(all_questions) if len(encoded) == PADDING_LEN - 1]
df_final = df_final.iloc[valid_rows2, :].copy()
df_final['Q_Encoded'] = [all_questions[i] for i in valid_rows2]

In [14]:
# Encode the validation answers and questions, keeping only rows whose
# encodings have exactly PADDING_LEN - 1 ids.
df_v_final = dic_v_df

# Answers: valid_rows3 holds row positions within dic_v_df.
all_answers = [_convert_sentence_to_numbers(s) for s in df_v_final['A_parsed']]  # list of numbers
valid_rows3 = [i for i, encoded in enumerate(all_answers) if len(encoded) == PADDING_LEN - 1]
# .copy() makes the filtered frame independent, so the column assignment below
# does not write to an .iloc selection (pandas' SettingWithCopyWarning).
df_v_final = df_v_final.iloc[valid_rows3, :].copy()
df_v_final['A_Encoded'] = [all_answers[i] for i in valid_rows3]

# Questions: valid_rows4 holds row positions within the answer-filtered frame,
# not within dic_v_df.
all_questions = [_convert_sentence_to_numbers(s) for s in df_v_final['Q_parsed']]
valid_rows4 = [i for i, encoded in enumerate(all_questions) if len(encoded) == PADDING_LEN - 1]
df_v_final = df_v_final.iloc[valid_rows4, :].copy()
df_v_final['Q_Encoded'] = [all_questions[i] for i in valid_rows4]

In [15]:
# Compare the frame shapes with the number of rows each filter step kept.
print(
    *(
        np.shape(obj)
        for obj in (dic_df, dic_v_df, valid_rows1, valid_rows2, valid_rows3, valid_rows4)
    ),
    sep='\n',
)

(3192, 5)
(500, 5)
(3192,)
(3192,)
(500,)
(500,)


In [16]:
# Preview the first five encoded validation rows.
df_v_final.head(n=5)

Unnamed: 0,ImageNo,Question,Answer,Q_parsed,A_parsed,A_Encoded,Q_Encoded
0,synpic54733,what is the primary abnormality in this image?,paraganglioma,"[what, is, the, primary, abnormality, in, this...",[paraganglioma],"[1, 439, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 5, 4, 6, 16, 10, 7, 9, 8, 2, 0, 0, 0, 0, 0..."
1,synpic25647,is there an abnormality in the x-ray?,no,"[is, there, an, abnormality, in, the, x-ray]",[no],"[1, 46, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 4, 44, 133, 10, 7, 6, 21, 2, 0, 0, 0, 0, 0..."
2,synpic35681,is there an abnormality in the mri?,no,"[is, there, an, abnormality, in, the, mri]",[no],"[1, 46, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 4, 44, 133, 10, 7, 6, 17, 2, 0, 0, 0, 0, 0..."
3,synpic39641,what is the primary abnormality in this image?,ganglion cyst,"[what, is, the, primary, abnormality, in, this...","[ganglion, cyst]","[1, 1630, 24, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 5, 4, 6, 16, 10, 7, 9, 8, 2, 0, 0, 0, 0, 0..."
4,synpic35693,is this a normal mri?,yes,"[is, this, a, normal, mri]",[yes],"[1, 39, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 4, 9, 72, 38, 17, 2, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# Write the filtered, encoded training frame to disk as a pickle.
df_final.to_pickle(
    path="C:/Users/b160544me/Desktop/siva/project/train_df_final.pkl"
)

In [18]:
# Write the filtered, encoded validation frame (used as the test set) to disk
# as a pickle.
df_v_final.to_pickle(
    path="C:/Users/b160544me/Desktop/siva/project/test_df_v_final.pkl"
)

# Image-related preprocessing

In [19]:
# Load the preprocessed training image features and keep only the rows that
# passed the answer-length and question-length filters used for df_final.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): a later cell writes the filtered array back to this same path,
# so re-running this cell after that write would filter already-filtered data.
with open('C:/Users/b160544me/Desktop/siva/project/image_feature_train.pkl', 'rb') as file:
    # The `with` block closes the handle; it was previously left open.
    features = pickle.load(file)
features = np.array(features)
# valid_rows2 indexes the result of the valid_rows1 selection, so the two
# selections must be applied in this order.
features = features[valid_rows1,]
features = features[valid_rows2,]

In [20]:
# Save the filtered training features. The `with` block closes (and flushes)
# the file handle; the bare open() used before never closed it.
# NOTE(review): this overwrites the file the features were loaded from, so the
# unfiltered features are lost and the load/filter step cannot be re-run safely.
with open('C:/Users/b160544me/Desktop/siva/project/image_feature_train.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

In [21]:
# Load the preprocessed test image features and keep only the rows that passed
# the answer-length and question-length filters used for df_v_final.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): a later cell writes the filtered array back to this same path,
# so re-running this cell after that write would filter already-filtered data.
with open('C:/Users/b160544me/Desktop/siva/project/image_feature_test.pkl', 'rb') as file:
    # The `with` block closes the handle; it was previously left open.
    features = pickle.load(file)
features = np.array(features)
# valid_rows4 indexes the result of the valid_rows3 selection, so the two
# selections must be applied in this order.
features = features[valid_rows3,]
features = features[valid_rows4,]

In [22]:
# Save the filtered test features. The `with` block closes (and flushes) the
# file handle; the bare open() used before never closed it.
# NOTE(review): this overwrites the file the features were loaded from, so the
# unfiltered features are lost and the load/filter step cannot be re-run safely.
with open('C:/Users/b160544me/Desktop/siva/project/image_feature_test.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)