In [1]:
!hostname

jupyterhub-3bcf55a276a1a6-jupyterhub-0


In [2]:
import os
import torch
import pandas as pd
import argparse
from pytube import YouTube
from tqdm import tqdm
import cv2

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [4]:
kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {}
kwargs

{}

In [5]:
DIR_HOME = '/home/jupyter-admin'
DIR_DATASETS = DIR_HOME + '/datasets'
DIR_DATASET_MSASL = DIR_DATASETS + '/MS-ASL'
DIR_VIDEOS = DIR_DATASET_MSASL + '/videos'
DIR_IMAGES = DIR_DATASET_MSASL + '/images'

In [6]:
print(f'DIR_HOME: {DIR_HOME}')
print(f'DIR_DATASETS: {DIR_DATASETS}')
print(f'DIR_DATASET_MSASL: {DIR_DATASET_MSASL}')
print(f'DIR_VIDEOS: {DIR_VIDEOS}')
print(f'DIR_IMAGES: {DIR_IMAGES}')

DIR_HOME: /home/jupyter-admin
DIR_DATASETS: /home/jupyter-admin/datasets
DIR_DATASET_MSASL: /home/jupyter-admin/datasets/MS-ASL
DIR_VIDEOS: /home/jupyter-admin/datasets/MS-ASL/videos
DIR_IMAGES: /home/jupyter-admin/datasets/MS-ASL/images


In [7]:
print(f'DIR_HOME: {os.listdir(DIR_HOME)}')
print(f'DIR_DATASETS: {os.listdir(DIR_DATASETS)}')
print(f'DIR_DATASET_MSASL: {os.listdir(DIR_DATASET_MSASL)}')
print(f'DIR_VIDEOS: {os.listdir(DIR_VIDEOS)}')
print(f'DIR_IMAGES: {os.listdir(DIR_IMAGES)}')

DIR_HOME: ['Untitled.ipynb', 'shared', 'datasets', '.cache', 'tutorials', '.ipython', '.bash_logout', '.profile', '.ipynb_checkpoints', '.local', '.bashrc', '.jupyter']
DIR_DATASETS: ['.ipynb_checkpoints', 'MS-ASL']
DIR_DATASET_MSASL: ['MSASL_synonym.json', 'C-UDA-0.1_annotated_discussion.pdf', 'videos', 'MSASL_train.json', 'README.md', 'MSASL_classes.json', 'MSASL_test.json', '.ipynb_checkpoints', 'images', 'MSASL_val.json']
DIR_VIDEOS: ['friend.mp4', 'israel.mp4', 'mustache.mp4', 'deer.mp4', 'brush teeth.mp4', 'date.mp4', 'sew.mp4', 'humble.mp4', 'which.mp4', 'dryer.mp4', 'grandmother.mp4', 'orange.mp4', 'accountant.mp4', 'apartment.mp4', 'tomato.mp4', 'sister.mp4', 'meeting.mp4', 'pants.mp4', 'lawyer.mp4', 'elementary school.mp4', 'year.mp4', 'email.mp4', 'gallaudet.mp4', 'together.mp4', 'wall.mp4', 'throw.mp4', 'who.mp4', 'window.mp4', 'in.mp4', 'line up.mp4', 'meet.mp4', 'give.mp4', 'swimming.mp4', 'knife.mp4', 'dress.mp4', 'university.mp4', 'live.mp4', 'absent.mp4', 'match.mp4', 

'**url**': a url link to the video<br/>
'**start_time**': the starting point of the clip in the original video in seconds<br/>
'**end_time**': the starting point of the clip in the original video in seconds<br/>
'**label**': class (an integer between 0 to 1000)<br/>
'**signer_id**': the id of the signer<br/>
'**box**': the boudy bounding box of the signer such as [y0, x0, y1, x1] where (x0, y0) is up-left corner and (x1,y1) is bottom-right corner. All the values are normalized (between zero and one) according to width and height.<br/>
'**text**': the gloss for this clip which match the 'label',<br/>
'**width**': height for the original video<br/>
'**height**': height for the original video<br/>
'**fps**': frame per second for the original video<br/>

In [8]:
classes_json = pd.read_json(DIR_DATASET_MSASL + '/' + 'MSASL_classes.json')
classes_json.head()

Unnamed: 0,0
0,hello
1,nice
2,teacher
3,eat
4,no


In [9]:
synonym_json = pd.read_json(DIR_DATASET_MSASL + '/' + 'MSASL_synonym.json')
synonym_json.head()

Unnamed: 0,0,1,2,3,4
0,father,dad,daddy,,
1,mother,mom,mommy,,
2,shoes,shoe,,,
3,pants,pant,,,
4,color,colors,,,


In [10]:
train_json = pd.read_json(DIR_DATASET_MSASL + '/' + 'MSASL_train.json')
train_json.head()

Unnamed: 0,org_text,clean_text,start_time,signer_id,signer,start,end,file,label,height,fps,end_time,url,text,box,width,review
0,match [light-a-MATCH],match,0.0,0,0,0,83,match light-a-MATCH,830,360,30.0,2.767,https://www.youtube.com/watch?v=C37R_Ix8-qs,match,"[0.057544618844985004, 0.21637457609176602, 1....",640,
1,FAIL,fail,0.0,0,-1,0,74,FAIL,542,360,25.0,2.96,https://www.youtube.com/watch?v=PIsUJl8BN_I,fail,"[0.06577941775321901, 0.167171776294708, 0.939...",480,
2,laugh,laugh,0.0,4,26,0,31,SignSchool Laugh with Legs 2,312,360,29.97,1.034,www.youtube.com/watch?v=9FdHlMOnVjg,laugh,"[0.131885945796966, 0.32334136962890603, 1.0, ...",640,
3,BOOK,book,0.0,0,-1,0,66,BOOK(3),38,360,25.0,2.64,https://www.youtube.com/watch?v=J7tP98oDxqE,book,"[0.055698871612548, 0.25173279643058705, 0.996...",480,
4,sign-language,sign language,0.0,0,-1,0,75,SIGN-LANGUAGE-S-CLAW-F,848,360,29.97,2.502,www.youtube.com/watch?v=N2mG9ZKjrGA,sign language,"[0.039043992757797005, 0.24198183417320202, 1....",640,


In [11]:
test_json = pd.read_json(DIR_DATASET_MSASL + '/' + 'MSASL_test.json')
test_json.head()

Unnamed: 0,org_text,clean_text,start_time,signer_id,signer,start,end,file,label,height,fps,end_time,url,text,box,width,review
0,beer,beer,0.0,20,40,0,59,SignSchool Beer var,805,360,29.97,1.969,www.youtube.com/watch?v=wX78EPtSuzU,beer,"[0.046787232160568, 0.290409207344055, 1.0, 0....",640,
1,enjoy,enjoy,0.0,11,39,0,54,enjoy(1),192,360,29.97,1.802,www.youtube.com/watch?v=OL02Odh2dRg,enjoy,"[0.071450918912887, 0.136024981737136, 1.0, 0....",480,
2,emotional,emotional,0.0,20,40,0,30,SignSchool Emotional,907,360,23.976,1.251,www.youtube.com/watch?v=C59jcSo4fEI,emotional,"[0.059554219245910006, 0.281019657850265, 1.0,...",640,
3,key,key,0.0,92,-1,0,37,SignSchool Key,456,360,23.976,1.543,www.youtube.com/watch?v=Qs2ua1S6tg0,key,"[0.11462894082069301, 0.308987438678741, 1.0, ...",640,
4,bad,bad,0.0,11,39,0,36,BAD,81,360,29.97,1.201,www.youtube.com/watch?v=-kgTBeOw95A,bad,"[0.040453493595123007, 0.22047379612922602, 0....",480,


In [12]:
val_json = pd.read_json(DIR_DATASET_MSASL + '/' + 'MSASL_val.json')
val_json.head()

Unnamed: 0,org_text,clean_text,start_time,signer_id,signer,start,end,file,label,height,fps,end_time,url,text,box,width,review
0,absent,absent,0.0,114,-1,0,37,ASL ABSENT,837,360,28.971,1.277,https://www.youtube.com/watch?v=ri3NrdgfAtE,absent,"[0.21896389126777602, 0.008568197488784, 0.997...",202,
1,help,help,0.0,76,42,0,110,help 2,50,360,29.97,3.67,www.youtube.com/watch?v=l31UXgChCS4,help,"[0.050372719764709, 0.29941257834434504, 1.0, ...",640,
2,come on,come on,0.0,114,-1,0,41,asl come on,889,360,25.0,1.64,https://www.youtube.com/watch?v=pt9bV_EvcaU,come on,"[0.08946925401687601, 0.17948511242866502, 0.9...",480,
3,LANGUAGE,language,0.0,3,-1,0,56,LANGUAGE(3),513,360,15.0,3.733,https://www.youtube.com/watch?v=-j1wozf6o9w,language,"[0.177085787057876, 0.003668457269668, 1.0, 0....",480,
4,confused,confused,0.0,53,-1,0,95,ASL Confused,272,360,29.969,3.17,https://www.youtube.com/watch?v=y8tHmOQcCwU,confused,"[0.06262531876564001, 0.209987848997116, 1.0, ...",640,


#### Загрузка видео датасета MS-ASL

In [13]:
# def download_video(url, filedir, filename):
#     try:
#         video = YouTube(url, use_oauth=True, allow_oauth_cache=False)
#         video = video.streams.get_highest_resolution()
#         output = video.download(filedir)
#         os.rename(output, filename)
#     except:
#         print('Failed to download video' + ' ' + url)

In [14]:
# def download_data(x):
#     url = x['url']
#     extension = 'mp4'
#     name = str(x['clean_text'])
#     filename = DIR_VIDEOS + '/' + name + '.' + extension
#     filedir = DIR_VIDEOS
#     if not os.path.isfile(filename):
#         download_video(url, filedir, filename)
#     else:
#         f'File {name} already exists'

In [15]:
# train_json.apply(lambda x: download_data(x), axis = 1)

#### Videos to images

In [13]:
# Очищаем директорию, если в ней есть файлы
def clear_dir(dir_name):
    for f in os.listdir(dir_name):
        os.remove(os.path.join(dir_name, f))

In [14]:
# Создаем директорию, если отсутствует
def create_dir(dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [20]:
# Разбивает видео на кадры. Каждый кадр - отдельное изображение
def video_to_images(dir_name, file_name):
    vidcap = cv2.VideoCapture(DIR_VIDEOS + '/' + file_name)
    success, image = vidcap.read()
    count = 0
    while success:
        cv2.imwrite(dir_name + '/' + 'frame%d.jpg' % count, image)  # сохранить кадр как JPEG файл      
        success,image = vidcap.read()
        count += 1    
    print(f'Frames for {file_name} are created')

In [21]:
for filename in os.scandir(DIR_VIDEOS):
    if filename.is_file():
        name = filename.name.split('.')[0]
        dir_name = DIR_IMAGES + '/' + name
        create_dir(dir_name)
        clear_dir(dir_name)
        video_to_images(dir_name, filename.name)

Frames for friend.mp4 are created
Frames for israel.mp4 are created
Frames for mustache.mp4 are created
Frames for deer.mp4 are created


KeyboardInterrupt: 

In [None]:
vidcap = cv2.VideoCapture('big_buck_bunny_720p_5mb.mp4')
success,image = vidcap.read()
count = 0
while success:
  cv2.imwrite("frame%d.jpg" % count, image)     # save frame as JPEG file      
  success,image = vidcap.read()
  print('Read a new frame: ', success)
  count += 1

In [16]:
# Список каталогов, в которых находятся слова ASL
create_labels = ['actor']

# Создаем DataFrame
data = pd.DataFrame()

In [17]:
image_formats = ['jpg', 'png'] # отбираем картинки только этих форматов
labels = []
counter = 0
for i, folder_path in tqdm(enumerate(folder_paths), total=len(folder_paths)):
    if folder_path not in create_labels:
        continue
    image_paths = os.listdir('../input/data/'+folder_path)
    label = folder_path
    # save image paths in the DataFrame
    for image_path in image_paths:
        if image_path.split('.')[-1] in image_formats:
            data.loc[counter, 'image_path'] = f"../input/data/{folder_path}/{image_path}"
            labels.append(label)
            counter += 1

NameError: name 'folder_paths' is not defined

#### One Hot Encoding

In [None]:
labels = np.array(labels)
# one-hot encode the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)

In [None]:
if len(labels[0]) == 1:
    for i in range(len(labels)):
        index = labels[i]
        data.loc[i, 'target'] = int(index)
elif len(labels[0]) > 1:
    for i in range(len(labels)):
        index = np.argmax(labels[i])
        data.loc[i, 'target'] = int(index)

In [None]:
# shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)
print(f"Number of labels or classes: {len(lb.classes_)}")
print(f"The first one hot encoded labels: {labels[0]}")
print(f"Mapping the first one hot encoded label to its category: {lb.classes_[0]}")
print(f"Total instances: {len(data)}")
 
# save as CSV file
data.to_csv('../input/data.csv', index=False)
 
# pickle the binarized labels
print('Saving the binarized labels as pickled file')
joblib.dump(lb, '../outputs/lb.pkl')
 
print(data.head(5))

#### CNN

In [None]:
# load the binarized labels file
lb = joblib.load('../outputs/lb.pkl')

class CustomCNN(nn.Module):
    def __init__(self):
        super(CustomCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 5)
        self.conv2 = nn.Conv2d(16, 32, 5)
        self.conv3 = nn.Conv2d(32, 64, 3)
        self.conv4 = nn.Conv2d(64, 128, 5)

        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, len(lb.classes_))

        self.pool = nn.MaxPool2d(2, 2)
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        bs, _, _, _ = x.shape
        x = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x