# <center>Data Pre-Processing</center>

## 0. Importing Libraries

In [1]:
!python --version

import warnings
warnings.filterwarnings('ignore')

Python 3.8.12


In [2]:
import cv2     
import math   
import matplotlib.pyplot as plt    
import pandas as pd
import numpy as np 
from os import path
from os import listdir
from os.path import exists
from PIL import Image as img

%matplotlib inline

## 1. Loading and combining play datasets

In [22]:
videos_csv = pd.read_csv('../data/play_data/Video.csv',sep=';')
plays_csv = pd.read_csv('../data/play_data/Play.csv',sep=';')
training_labels = pd.merge(plays_csv.iloc[:,[0,1,2,3,4,5,9,10]], videos_csv[['VideoFileName','VideoPath','PlayID','GameID']],left_on='ID', right_on='PlayID')
training_labels = training_labels[training_labels.Type.isin(['K', 'R', 'P', 'X', 'F', 'U'])]
training_labels.head()

Unnamed: 0,ID,Down,ToGo,Spot,Text,Quarter,HasBall,Type,VideoFileName,VideoPath,PlayID,GameID
0,11,2.0,3,H24,"2nd & 3 at OSU24: Saine, Brandon rush for 11 y...",1,V,R,0,2009/G3/Q1/,11,3
1,11,2.0,3,H24,"2nd & 3 at OSU24: Saine, Brandon rush for 11 y...",1,V,R,3,2009/G3/Q1/,11,3
2,15,1.0,0,H03,"1st & GOAL at OSU03: Pettrey, Aaron kick attem...",1,V,X,2,2009/G3/Q1/,15,3
3,18,1.0,10,H26,"1st & 10 at OSU26: Masoli, J. pass complete to...",1,H,P,12,2009/G4/Q1/,18,4
4,38,1.0,10,H09,"1st & 10 at OSU09: James, L. rush for loss of ...",1,H,R,27,2009/G4/Q1/,38,4


In [23]:
training_labels_grayscale = training_labels.copy()

## 2 Splitting videos into static frames

In [24]:
def extract_frames(row, label_dict, fps=10, mode='rgb'):
    video_path = '../data/videos_high_resolution/' + row[9] + 'raw/' + row[8] + '.mp4'
    label = row[7]
    if path.exists(video_path):
        cap = cv2.VideoCapture(video_path)
        frameRate = cap.get(cv2.CAP_PROP_FPS)
        x = 1
        count = 0
        while(cap.isOpened()):
            frameId = cap.get(cv2.CAP_PROP_POS_FRAMES)
            ret, frame = cap.read()
            if (ret != True):
                break
            if ((frameId % (math.ceil(frameRate) // fps)) == 0):
                filename = "../data/stadium-iopt/training_frames/" + mode + "/" + str(row[-1]) + "_" + str(row[5]) + "_" + row[8] + "_" + "frame%d.jpg" % count
                count+=1
                h, w, c = frame.shape
                y = (w-h)//2
                good = frame[:, y:y+h]
                good = cv2.resize(good,(224,224))
                if mode == 'grayscale':
                    good = cv2.cvtColor(good,cv2.COLOR_BGR2GRAY)
                cv2.imwrite(filename, good)
                label_dict[filename] = [str(row[-1]) + "_" + str(row[5]) + "_" + row[8], count, label]
        cap.release()
    return row

In [25]:
label_dict = {}
mode = 'rgb'
rgb_dir = "../data/stadium-iopt/training_frames/" + mode

file_count = len([name for name in listdir(rgb_dir) if not name.startswith(".")])

if file_count == 0:
    training_labels.apply(lambda row: extract_frames(row,label_dict,10,'rgb'), axis=1)

In [39]:
label_dict_grayscale = {}
mode = 'grayscale'
grayscale_dir = "../data/stadium-iopt/training_frames/" + mode

file_count = len([name for name in listdir(grayscale_dir) if not name.startswith(".")])

if file_count == 0:
    training_labels_grayscale.apply(lambda row: extract_frames(row,label_dict_grayscale,10,'grayscale'), axis=1)

## 3. Saving the training frames' labels

In [26]:
label_map = {
    'K': 0,
    'R': 1,
    'P': 2,
    'X': 3,
    'F': 4,
    'U': 5,
}

In [27]:
mode = 'rgb'

file_path = "../data/stadium-iopt/training_labels/" + mode + "/" + "labels.csv"

if not exists(file_path):
    labels_df = pd.DataFrame.from_dict(label_dict, orient='index').reset_index()
    labels_df.columns = ['filename','play','frame','label']
    labels_df['filename'] = labels_df['filename'].apply(lambda x: x.split('/')[-1])
    labels_df['numeric_label'] = labels_df['label'].map(label_map)

    labels_df.to_csv(file_path, index=False)
else:
    labels_df = pd.read_csv(file_path)
labels_df.head()

Unnamed: 0,filename,play,frame,label,numeric_label
0,264_1_000_frame0.jpg,264_1_000,1,K,0
1,264_1_000_frame1.jpg,264_1_000,2,K,0
2,264_1_000_frame2.jpg,264_1_000,3,K,0
3,264_1_000_frame3.jpg,264_1_000,4,K,0
4,264_1_000_frame4.jpg,264_1_000,5,K,0


In [28]:
mode = 'grayscale'


file_path = "../data/stadium-iopt/training_labels/" + mode + "/" + "labels.csv"

if not exists(file_path):
    labels_df_grayscale = pd.DataFrame.from_dict(label_dict_grayscale, orient='index').reset_index()
    labels_df_grayscale.columns = ['filename','play','frame','label']
    labels_df_grayscale['filename'] = labels_df_grayscale['filename'].apply(lambda x: x.split('/')[-1])
    labels_df_grayscale['numeric_label'] = labels_df_grayscale['label'].map(label_map)

    labels_df_grayscale.to_csv(file_path, index=False)
else:
    labels_df_grayscale = pd.read_csv(file_path)
labels_df_grayscale.head()

Unnamed: 0,filename,play,frame,label,numeric_label
0,264_1_000_frame0.jpg,264_1_000,1,K,0
1,264_1_000_frame1.jpg,264_1_000,2,K,0
2,264_1_000_frame2.jpg,264_1_000,3,K,0
3,264_1_000_frame3.jpg,264_1_000,4,K,0
4,264_1_000_frame4.jpg,264_1_000,5,K,0


## 4. Creating a numpy array with all the frames for each play

In [29]:
labels_df.play[0]

'264_1_000'

In [None]:
mode = 'rgb'

path = "../data/stadium-iopt/training_frames/" + mode + "/"
images =  [[]]
prev_play = labels_df.play[0]
play_idx = 0
plays = [labels_df.iloc[0].play]
labels = [labels_df.iloc[0].numeric_label]
char_labels = [labels_df.iloc[0].label]

rgb_np_dir = "../data/stadium-iopt/numpy_data/" + mode + "/"
file_count = len([name for name in listdir(rgb_np_dir) if not name.startswith(".")])

if file_count == 0:
    for idx, row in labels_df.iterrows():
        image = img.open(path + row.filename)
        if prev_play != row.play:
            images[play_idx] = np.asarray(images[play_idx])
            play_idx += 1
            prev_play = row.play
            images.append([])
            labels.append(row.numeric_label)
            char_labels.append(row.label)
            plays.append(row.play)
            if ((play_idx % 1000) == 0) or ((play_idx == (5667 - 1))) or (play_idx == 1):
                print("Video " + str(play_idx) + " of " + str(5667))
        images[play_idx].append(np.asarray(image))

    images[-1] = np.asarray(images[-1])    
    images = np.asarray(images)

Video 1 of 5667
Video 1000 of 5667
Video 2000 of 5667


In [None]:
print(images.shape)
print(images[0].shape)
print(images[0][0].shape)

In [None]:
print(len(labels))
print(len(char_labels))
print(len(plays))

In [None]:
# Saving the numeric labels

np.save('../data/stadium-iopt/numpy_data/' + mode + '/numeric_labels.npy', labels)

# Saving the char labels

np.save('../data/stadium-iopt/numpy_data/' + mode + '/char_labels.npy', char_labels)

# Saving the play numbers

np.save('../data/stadium-iopt/numpy_data/' + mode + '/plays.npy', plays)

In [None]:
# Saving the numpy array containing all the frames for each play

np.save('../data/stadium-iopt/numpy_data/' + mode + '/images.npy', images)

In [17]:
mode = 'grayscale'

path = "../data/stadium-iopt/training_frames/" + mode + "/"
images_grayscale =  [[]]
prev_play = labels_df_grayscale.play[0]
play_idx = 0
plays_grayscale = [labels_df_grayscale.iloc[0].play]
labels_grayscale = [labels_df_grayscale.iloc[0].numeric_label]
char_labels_grayscale = [labels_df_grayscale.iloc[0].label]

grayscale_np_dir = "../data/stadium-iopt/numpy_data/" + mode + "/"
file_count = len([name for name in listdir(rgb_np_dir) if not name.startswith(".")])

if file_count == 0:
    for idx, row in labels_df_grayscale.iterrows():
        image = img.open(path + row.filename)
        if prev_play != row.play:
            images_grayscale[play_idx] = np.asarray(images_grayscale[play_idx])
            play_idx += 1
            prev_play = row.play
            images_grayscale.append([])
            labels_grayscale.append(row.numeric_label)
            char_labels_grayscale.append(row.label)
            plays_grayscale.append(row.play)
            if play_idx % 1000 == 0 or play_idx == (5667 - 1):
                print("Video " + str(play_idx + 1) + " of " + str(5667))
        images_grayscale[play_idx].append(np.asarray(image))

    images_grayscale[-1] = np.asarray(images_grayscale[-1])    
    images_grayscale = np.asarray(images_grayscale)

KeyboardInterrupt: 

In [None]:
print(images_grayscale.shape)
print(images_grayscale[0].shape)
print(images_grayscale[0][0].shape)

In [None]:
print(len(labels_grayscale))
print(len(char_labels_grayscale))
print(len(plays_grayscale))

In [None]:
# Saving the numeric labels_grayscale

np.save('../data/stadium-iopt/numpy_data/' + mode + '/numeric_labels_grayscale.npy', labels_grayscale)

# Saving the char labels_grayscale

np.save('../data/stadium-iopt/numpy_data/' + mode + '/char_labels_grayscale.npy', char_labels_grayscale)

# Saving the play numbers

np.save('../data/stadium-iopt/numpy_data/' + mode + '/plays_grayscale.npy', plays_grayscale)

# Saving the numpy array containing all the frames for each play

np.save('../data/stadium-iopt/numpy_data/' + mode + '/images_grayscale.npy', images_grayscale)

## 5. Converting the training labels to one-hot vectors

In [None]:
one_hot_labels = np.zeros((len(labels),len(labels_df.numeric_label.unique())))
idx = 0

for label in labels:
    one_hot_labels[idx][label] = 1
    idx += 1
    
one_hot_labels = np.asarray(one_hot_labels)

In [None]:
mode = 'rgb'

# Saving the numpy array containing all the labels as one-hot encoded vectors

np.save('../data/stadium-iopt/numpy_data/' + mode + '/one_hot_labels.npy', one_hot_labels)

In [None]:
one_hot_labels_grayscale = np.zeros((len(labels_grayscale),len(labels_df_grayscale.numeric_label.unique())))
idx = 0

for label in labels_grayscale:
    one_hot_labels_grayscale[idx][label] = 1
    idx += 1
    
one_hot_labels_grayscale = np.asarray(one_hot_labels_grayscale)

In [None]:
mode = 'grayscale'

# Saving the numpy array containing all the labels as one-hot encoded vectors

np.save('../data/stadium-iopt/numpy_data/' + mode + '/one_hot_labels.npy', one_hot_labels_grayscale)

## 6. Plotting all the training frames for a play

In [None]:
for i in range(len(images[0])):
    plt.figure()
    plt.imshow(images[0][i])