In [1]:
import torch.utils.data as data
import numpy as np
import glob
import os
import sys
import torch

from PIL import Image
if('/opt/ros/kinetic/lib/python2.7/dist-packages' in sys.path):
    sys.path.remove('/opt/ros/kinetic/lib/python2.7/dist-packages')
import cv2


def video_to_tensor(frames):
    """Convert a ``numpy.ndarray`` to tensor.
    Converts a numpy.ndarray (T x H x W x C)
    to a torch.FloatTensor of shape (C x T x H x W)

    Args:
         pic (numpy.ndarray): Video to be converted to tensor.
    Returns:
         Tensor: Converted video.
    """
    return torch.from_numpy(frames.transpose([3,0,1,2]))

def preprocess_image(cv_frame, size):
    """
    Args:   frame - cv image
            size - tuple of (W x H)
    """
    im = cv2.resize(cv_frame, size) #compress image
    im = np.array(Image.fromarray(cv2.cvtColor(im,cv2.COLOR_BGR2RGB))) #convert to RGB image
    im = im/255 #normalize
    im = (im-0.5)/0.5
    return im

def video_loader(vid_path, frames):
    """
    Args:   vid_path - location of saved video frames
            frames - numpy array of frame number to read
    """
    resize_W = 160
    resize_H = 120
    images = []
    for frame in frames:
        image_name = "frame_{:04n}.jpg".format(frame)
        img = cv2.imread(vid_path + image_name)
        img = preprocess_image(img, (resize_W,resize_H))
        images.append(img)
    return np.array(images)

def split_frames(start,stop,length):
    """
    To match video frames with tactile,
    Input:  start - starting frame of video
            stop - last frame of video
    output: sequence of frames in batch of length.
    for ex: If length = 18, start = 77 and stop = 300 then
            video is divided into 13 clips and frame number is returned.
    """
    vid_sample = []
    tac_sample = []
    frames = np.arange(start,stop)
    groups = len(frames)//length       #we will get these many clips from given video frames,
    extra_frames = len(frames)%length  #need to fillup missing frames,
    for index, i in enumerate(frames[::length]):
        if(i+length<stop):
            vid_sample.append(np.arange(i,i+length))
            tac_sample.append(np.arange(index*length,index*length+length))
        else:  #now append missing frames
            end_frames = np.arange(i,stop)
            missing_frames = length-end_frames.shape[0]
            end_frames = np.append(end_frames, np.full((missing_frames,),stop))
            vid_sample.append(end_frames)
            tac_sample.append(np.arange(index*length,index*length+length))
    if (vid_sample==[]):
        raise ValueError("Not enough frames in a video")
    if (len(vid_sample) != groups and extra_frames and len(vid_sample) != (groups+1)):
        raise ValueError("something went wrong. Expexted {} splits, but have only {}".format(groups,len(sample)))
    return vid_sample,tac_sample

def get_offset(annotation_path, label_path):
    """
    Get the offset in samples between video and tactile data
    Args:   annotation_path - annotation.txt file containinf video frame alligned with label[0]
            label_path - path of label.txt
    """
    vid_fps = 18
    tac_fps = 16.67
    annotation = np.loadtxt(annotation_path)
    label = np.loadtxt(label_path)
#     print (annotation, label[0] * (vid_fps/tac_fps), label[2] * (vid_fps/tac_fps), int(annotation - label[0] * (vid_fps/tac_fps)),annotation_path)
    return int(annotation - label[0] * (vid_fps/tac_fps))

def get_label(pickup, drop, vid_frame, length, actual_label):
    """
    Get the lable for given clip of video
    Args:   pickup - label[0]
            drop - label[2]
            vid_frame - numpy array of video frames
            length - length of clip
            actual_lable - label[3]
    """
    label = None
    label_range = np.arange(pickup, drop)
    #set lable to maximum frames follow
    temp_label = np.zeros(length)
    temp_label[np.in1d(vid_frame,label_range)]=1.0
#     print(temp_label,vid_frame,label_range)
    if(np.where(temp_label==1.0)[0].shape[0] > length//2):
        label = actual_label
    else:
        label = 1
#     print(label)
    return np.array(label,dtype=float)

def get_annotation(path):
    return np.loadtxt(path)


def make_train_dataset(data_path,req_frame_length):
    data = []
    for path in data_path:
        front_vid_path, tac_path, pos_path, label_path, annotation_path = path[0], path[2], path[3], path[-1], path[6]
        
        # get the offset between video and tactile data
        offset = get_offset(annotation_path, label_path)
        # Read all the video frames
        frames = glob.glob(os.path.join(front_vid_path,'*.jpg'))
        #select video frame from offset to last frame
        video_frames = np.arange(offset, len(frames)+1) 
        start,stop = video_frames[0], video_frames[-1]
        if(len(video_frames) >= 430):
            raise ValueError("more image frames = {} than tactile data for video {}".format(video_frames, front_vid_path))
        #collect subsequent frame numbers
        vid_frames, tac_frames = split_frames(start, stop, req_frame_length)
        print(vid_frames)
        label = np.loadtxt(label_path)
        annotation = get_annotation(annotation_path)
        for vid_frame, tac_frame in zip(vid_frames, tac_frames):
            pickup, drop = int(label[0]*18/16.67), int(label[2]*18/16.67)
#             print(annotation, pickup,drop,annotation_path)
            sequence_label = get_label(annotation, annotation+(drop-pickup), vid_frame, req_frame_length, label[3])
            data.append((front_vid_path, tac_path, pos_path, label_path, vid_frame, tac_frame, sequence_label))
    return data

def make_test_dataset(data_path,req_frame_length):
    data = []
    for path in data_path:
        front_vid_path, tac_path, pos_path, label_path, annotation_path = path[0], path[2], path[3], path[-1], path[6]
        
        # get the offset between video and tactile data
        offset = get_offset(annotation_path, label_path)
        # Read all the video frames
        frames = glob.glob(os.path.join(front_vid_path,'*.jpg'))
        #select video frame from offset to last frame
        video_frames = np.arange(offset, len(frames)+1) 
        start,stop = video_frames[0], video_frames[-1]
        if(len(video_frames) >= 430):
            raise ValueError("more image frames = {} than tactile data for video {}".format(video_frames, front_vid_path))
        #collect subsequent frame numbers
        vid_frames, tac_frames = split_frames(start, stop, req_frame_length)
        label = np.loadtxt(label_path)
        annotation = get_annotation(annotation_path)
        pickup, drop = int(label[0]*18/16.67), int(label[2]*18/16.67)
        sequence_label = []
        for vid_frame in vid_frames:
#             print(annotation, pickup,drop,annotation_path)
            sequence_label.append(get_label(annotation, annotation+(drop-pickup), vid_frame, req_frame_length, label[3]))
        data.append((front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, sequence_label))
    return data

def get_location(root, split_file):
    data_path = []
    file_path = np.loadtxt(split_file,dtype=str)
    prefix = root.split(file_path[0][:32])[0]
    file_names = ['images/front_rgb/','images/left_rgb/','tactile.txt','pos.txt','label.txt','flow', 'video_grasp_timestamp.txt']
    for i,file in enumerate(file_path):
        front_video_path = prefix + file + file_names[0]
        left_video_path = prefix + file + file_names[1]
        tactile_path = prefix +"/Visual-Tactile_Dataset/tactile_data/"+file.split(file[:32])[1] + file_names[2]
        pos_path = prefix + file + file_names[3]
        label_path = prefix + file + file_names[4]
        front_flow_path = prefix + file + file_names[5] + '/' + 'front_rgb/'
        left_flow_path = prefix + file + file_names[5] + '/' + 'left_rgb/'
        annotation_path = prefix + "/Visual-Tactile_Dataset/dataset_annotations/"+file.split(file[:32])[1] + file_names[6]
        data_path.append((front_video_path, left_video_path, tactile_path, pos_path, \
                       front_flow_path, left_flow_path, annotation_path, label_path))
    return data_path



class VisualTactile(data.Dataset):
    """
    Dataset to load sequence of framses frm the video
    root = directory of ur datasets
    split_file = your test or train split .txt file
    transforms = Transform if you wants to apply to video
    frames_to_load = 18 default, sequence of frames to load
    """

    def __init__(self, root, split_file, transforms=None, frames_to_load = 18):
        self.root = root
        self.split_file = split_file
        self.transforms = transforms
        self.frames_to_load = frames_to_load
        # get path of each file data you would like to process
        self.data_path = get_location(self.root, self.split_file)
        if len(self.data_path) == 0:
            raise (RuntimeError("Found 0 files in subfolders of: " + self.root))

        # get path and frames for training data
        if("train" in split_file):
            self.clip_data = make_train_dataset(self.data_path, self.frames_to_load)
        else:
            self.video_data = make_test_dataset(self.data_path, self.frames_to_load)
            
#     def __getitem__(self,clip_index):
#         front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label = self.clip_data[clip_index]
#         clip = video_loader(front_vid_path, vid_frames)
# #         tactile = tactile_loader(tac_path, tac_frames)
# #         postion = pos_loader(pos_path, tac_frames)
#         return video_to_tensor(clip), torch.from_numpy(label)

    def __getitem__(self,clip_index):
        front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label = self.clip_data[clip_index]
        clip, label = self.custom_getitem(front_vid_path, vid_frames, label)
        return clip, label
        
    def __len__(self):
        return len(self.clip_data)

    def custom_getitem(self,front_vid_path, vid_frames, label, tac_path=None, pos_path=None, tac_frames=None):
        clip = video_loader(front_vid_path, vid_frames)
#         tactile = tactile_loader(tac_path, tac_frames)
#         postion = pos_loader(pos_path, tac_frames)
        if (self.transforms is not None):
            clip = self.transforms(clip)
        return video_to_tensor(clip), torch.from_numpy(label)

    def get_video_frames(self, video_index):
        front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label   = self.video_data[video_index]
        data = []
        for vid, tac, lab in zip(vid_frames, tac_frames, label):
            data.append(self.custom_getitem(front_vid_path, vid, lab))
        return data, label_path

    def get_num_videos(self):
        return len(self.video_data)
# obj = VisualTactile("../../../t/Visual-Tactile_Dataset/dataset/", "../master_i3d/trainv2.txt")

# test 

In [None]:
def get_offset(annotation_path, label_path):
    vid_fps = 18
    tac_fps = 16.67
    annotation = np.loadtxt(annotation_path)
    label = np.loadtxt(label_path)
    return int(annotation - label[0] * (vid_fps/tac_fps))

path = '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/right/0/'
vid_path = '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/right/0/images/front_rgb/'
annotation_path = '../../../Visual-Tactile_Dataset/dataset_annotations/Cheez/50_432/right/0/video_grasp_timestamp.txt'


req_frame_length = 18

label = np.loadtxt(path+"label.txt")
offset = get_offset(annotation_path, path+"label.txt")
drop = label[0]*18/16.67

frames = glob.glob(os.path.join(vid_path,'*.jpg'))

sample = []
image_frames = np.arange(offset, len(frames)+1)
start,stop = image_frames[0], image_frames[-1]
if(len(image_frames) >= 400):
    raise ValueError("more image frames = {} than tactile data".format(image_frames))

split = len(image_frames)//req_frame_length
extra_frames = len(image_frames)%req_frame_length
sample, tac = split_frames(start, stop, req_frame_length)
len(sample)

In [None]:
def split_frames(start,stop,length):
    vid_sample = []
    tac_sample = []
    frames = np.arange(start,stop)
    groups = len(frames)//length       #we will get these many clips from given video frames,
    extra_frames = len(frames)%length  #need to fillup missing frames,
    for index, i in enumerate(frames[::length]):
        if(i+length<stop):
            vid_sample.append(np.arange(i,i+length))
            tac_sample.append(np.arange(index*length,index*length+length))
        else:  #now append missing frames
            end_frames = np.arange(i,stop)
            missing_frames = length-end_frames.shape[0]
            end_frames = np.append(end_frames, np.full((missing_frames,),stop))
            vid_sample.append(end_frames)
            tac_sample.append(np.arange(index*length,index*length+length))
    if (vid_sample==[]):
        raise ValueError("Not enough frames in a video")
    if (len(vid_sample) != groups and extra_frames and len(vid_sample) != (groups+1)):
        raise ValueError("something went wrong. Expexted {} splits, but have only {}".format(groups,len(sample)))
    return vid_sample,tac_sample

# Testing dataset code (for training)

In [2]:
path1 = ('../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/images/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/images/left_rgb/', '../../../t/Visual-Tactile_Dataset/tactile_data/Cheez/50_432/top/0/tactile.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/pos.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/flow/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/flow/left_rgb/', '../../../t/Visual-Tactile_Dataset/dataset_annotations/Cheez/50_432/top/0/video_grasp_timestamp.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/label.txt')
path2 = ('../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/images/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/images/left_rgb/', '../../../t/Visual-Tactile_Dataset/tactile_data/Cheez/50_432/top/7/tactile.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/pos.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/flow/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/flow/left_rgb/', '../../../t/Visual-Tactile_Dataset/dataset_annotations/Cheez/50_432/top/7/video_grasp_timestamp.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/label.txt')
path = [path1,path2]
data = make_train_dataset(path, 18)

[array([ 83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
        96,  97,  98,  99, 100]), array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118]), array([119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136]), array([137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
       150, 151, 152, 153, 154]), array([155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172]), array([173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
       186, 187, 188, 189, 190]), array([191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
       204, 205, 206, 207, 208]), array([209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
       222, 223, 224, 225, 226]), array([227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
       240, 241, 242, 243, 244]), array([245, 246, 247, 248, 249, 250, 251, 252

# Testing dataset code (for testing)

In [None]:
path1 = ('../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/images/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/images/left_rgb/', '../../../t/Visual-Tactile_Dataset/tactile_data/Cheez/50_432/top/0/tactile.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/pos.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/flow/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/flow/left_rgb/', '../../../t/Visual-Tactile_Dataset/dataset_annotations/Cheez/50_432/top/0/video_grasp_timestamp.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/0/label.txt')
path2 = ('../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/images/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/images/left_rgb/', '../../../t/Visual-Tactile_Dataset/tactile_data/Cheez/50_432/top/7/tactile.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/pos.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/flow/front_rgb/', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/flow/left_rgb/', '../../../t/Visual-Tactile_Dataset/dataset_annotations/Cheez/50_432/top/7/video_grasp_timestamp.txt', '../../../t/Visual-Tactile_Dataset/dataset/Cheez/50_432/top/7/label.txt')
path = [path1,path2]
data = make_test_dataset(path, 18)

# Testing Videoloader

In [None]:
front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label = data[0]
inp = video_loader(front_vid_path, vid_frames)
inp = video_to_tensor(inp)
inp.shape

# Testing Label

In [None]:
data[0][6]

# Testing Resnet network

In [None]:
import resnet
import torch
from torch import nn
import dataset

pretrained_path = "../../../out/resnet/resnet-18-kinetics.pth"
model = resnet.resnet18(sample_size = 112, sample_duration = 18, num_classes = 400, shortcut_type='A')

model = nn.DataParallel(model)
pretrain = torch.load(pretrained_path, map_location="cpu")
model.load_state_dict(pretrain['state_dict'])

x = torch.autograd.Variable(inp)
x = x.unsqueeze(0)
out = model(x.float())

In [None]:
fc = nn.Linear(1024, 1)
fc(out)

In [None]:
a = np.arange(18)
b = np.arange(18,36)
c = np.zeros(18)
for i,j,k in zip(a,b,c):
    print(i,j,k)
    
a = []


In [None]:
def custom_getitem(front_vid_path, vid_frames, label, tac_path=None, pos_path=None, tac_frames=None):
    clip = video_loader(front_vid_path, vid_frames)
#         tactile = tactile_loader(tac_path, tac_frames)
#         postion = pos_loader(pos_path, tac_frames)
    return video_to_tensor(clip), torch.from_numpy(label)

def get_video_frames(video_index,video_data):
    front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label   = video_data[video_index]
    data = []
    for vid, tac, label in zip(vid_frames, tac_frames, label):
        print(vid)
        data.append(custom_getitem(front_vid_path, vid_frames, label))
    return data, label_path

get_video_frames(0,data)

In [None]:
TACTILE_MAX_MAGNITUDE = np.array([23655, 20662, 14496,  6475, 41133, 64793, 59317, 33177, 19897,
       62084, 49874, 29170, 42944, 14976, 12311, 14331])

def tactile_loader(path):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    output =[]
    groups = []
    tactile_frame = np.loadtxt(path)
    tactile = tactile_frame.astype('float')
    tactile = tactile/TACTILE_MAX_MAGNITUDE
    #group of 4 sensors
    n=4
    out = [tactile[:,k:k+n] for k in range(0, tactile.shape[1], n)]
    for data in out:
        for i in range(data.shape[1]):
            groups.append(tactile[:324,i].reshape(18,18,1))
        temp = video_to_tensor(np.array(groups))
        output.append(temp)
        groups = []
        
        
    output =[] 
    TACTILE_TIME = 24
    UPSAMPLE_FREQ = 18
    up_samples=TACTILE_TIME*UPSAMPLE_FREQ
    with open(path, 'rb') as f:
#         tactile_frame = pd.read_csv(f,delimiter=' ', header=None)
#         tactile = tactile_frame.as_matrix()
        tactile_frame = np.loadtxt(f)
        tactile = tactile_frame.astype('float')
        tactile = signal.resample(tactile,up_samples)
        tactile[np.where(tactile<0.1)] = 0.  # remove all the negative samples with 0
        tactile = tactile/TACTILE_MAX_MAGNITUDE #normalize input in range of 0 to 1.
        for frame in frames:
            output.append(tactile[frame,:].reshape(4,4,1))
        return np.array(output)