## Preprocessing Goal
1. Load data, group the original training and testing dataset together
2. Basic statistics
3. Transform to dataframe structure:

---
- each row represents a single frame
- each column represents its associated information
    - video
    - frame
    - ID: video name immediately followed by the frame ID
    - label
    - resident_x_nose
    - resident_y_nose
    - ...
    - resident_x_tail_base
    - resident_y_tail_base
 

In [2]:
# import necessary packages
import numpy as np
import pandas as pd

In [3]:
# @title Download and unzip the data
import os, requests, zipfile


def _download(url, fname, timeout=60):
  """Download `url` and write the response body to the local file `fname`.

  Args:
    url: address to fetch.
    fname: path of the file to write.
    timeout: seconds to wait for the server to connect / send data before
      giving up (without it, `requests.get` may block indefinitely).

  Returns:
    True if the file was written, False if the request failed (a failure
    message is printed in that case and nothing is written).
  """
  try:
    r = requests.get(url, timeout=timeout)
  except requests.RequestException:
    # Covers connection errors as well as timeouts and other request failures.
    print("!!! Failed to download data !!!")
    return False
  if r.status_code != requests.codes.ok:
    print("!!! Failed to download data !!!")
    return False
  with open(fname, "wb") as fid:
    fid.write(r.content)
  return True


# Download the dataset archive (skipped when the zip is already on disk).
fname = 'task1_classic_classification.zip'
url = "https://data.caltech.edu/records/s0vdx-0k302/files/task1_classic_classification.zip?download=1"

if os.path.isfile(fname):
  print('Data have already been downloaded!!!')
else:
  _download(url, fname)

# Unzip the file, but only if the archive is really there: after a failed
# download there is nothing to extract and opening the zip would raise.
if not os.path.exists('task1_classic_classification') and os.path.isfile(fname):
  with zipfile.ZipFile(fname, 'r') as zip_ref:
    zip_ref.extractall('.')


# Download the script
fname = 'calms21_convert_to_npy.py'
url = "https://data.caltech.edu/records/s0vdx-0k302/files/calms21_convert_to_npy.py?download=1"

if not os.path.isfile(fname):
  _download(url, fname)

In [12]:
def load_task1_data(data_path):
  """
  Load one task 1 file and return its sequences plus the behavior vocabulary.

  Args:
      data_path: path to a .npy file holding a pickled dict whose
          'annotator-id_0' entry maps sequence ids to sequence dicts.

  Returns:
      (dataset, vocabulary): the 'annotator-id_0' mapping, and the
      ['metadata']['vocab'] entry of its first sequence. The vocabulary
      tells you how to map behavior names to class ids; it is the same
      for all sequences in this dataset.
  """
  annotator_key = 'annotator-id_0'
  # NOTE(review): allow_pickle=True unpickles the file content, which can run
  # arbitrary code -- only load files from a trusted source.
  loaded = np.load(data_path, allow_pickle=True).item()
  dataset = loaded[annotator_key]
  # Any sequence would do for the vocabulary lookup; take the first one.
  first_sequence_id = list(dataset.keys())[0]
  vocabulary = dataset[first_sequence_id]['metadata']['vocab']
  return dataset, vocabulary


# The paths must point at the .npy files created in the loading notebook;
# adjust them if those files were written somewhere else.
training_data, vocab = load_task1_data(
    'data/calms21_task1_train.npy'
)
# The vocabulary returned for the test file is discarded.
test_data, _ = load_task1_data(
    'data/calms21_task1_test.npy'
)


TypeError: 'dict' object cannot be interpreted as an integer

In [13]:
# Pool the original training and test sequences into a single dict so we can
# assemble our own data from it. On a duplicated sequence key the test entry
# would replace the training one.
concatenated_data = {**training_data, **test_data}

# Sanity check: the pooled count should be the sum of the two splits.
print(f"Number of sequences in training data: {len(training_data)}")
print(f"Number of sequences in test data: {len(test_data)}")
print(f"Number of sequences in concatenated data: {len(concatenated_data)}")

Number of sequences in training data: 70
Number of sequences in test data: 19
Number of sequences in concatenated data: 89


In [21]:
# simplify data in a dataframe

def transform_dataset(dataset):
  """Flatten a sequence-keyed dataset into a list of per-frame records.

  Every frame of every sequence becomes one dict holding the sequence name,
  the frame index, an id (the sequence name immediately followed by the frame
  index, with no separator), the frame's label, and one value per keypoint
  stored under '<mouse>_<coordinate>_<bodypart>'.

  Args:
    dataset: mapping from sequence name to a dict with a 'keypoints' entry
      (iterated as frame -> mouse -> coordinate -> body part) and an
      'annotations' entry indexed by frame number. A frame with more than
      2 mice, 2 coordinates or 7 body parts raises IndexError.

  Returns:
    A list of dicts, one per frame, ordered by sequence and then by frame.
  """
  # Names used to build the keypoint column names, indexed by position.
  mouse_names = ['resident', 'intruder']
  axis_names = ['x', 'y']
  part_names = ['nose', 'left_ear', 'right_ear', 'neck', 'left_hip', 'right_hip', 'tail_base']

  sequence_names = list(dataset.keys())
  print('We have ', len(sequence_names), ' sequences')

  records = []
  for sequence_name in sequence_names:
    for frame_idx, frame in enumerate(dataset[sequence_name]['keypoints']):
      # Bookkeeping columns first, so they lead the column order.
      record = {
          'sequence': sequence_name,
          'frame': frame_idx,
          'id': sequence_name + str(frame_idx),
          'label': dataset[sequence_name]['annotations'][frame_idx],
      }
      # One column per mouse + coordinate + body part.
      record.update({
          '_'.join((mouse_names[m], axis_names[c], part_names[b])): value
          for m, mouse in enumerate(frame)
          for c, axis_values in enumerate(mouse)
          for b, value in enumerate(axis_values)
      })
      records.append(record)

  print('We have ', len(records), ' frames in total in the dataset')
  return records

In [22]:
# Flatten every pooled sequence into per-frame records, then tabulate them
# (one row per frame, one column per record key).
whole_data = transform_dataset(concatenated_data)
df = pd.DataFrame(data=whole_data)
# Preview the first rows as the cell output.
df.head()

We have  89  sequences
We have  769845  frames in total in the dataset


Unnamed: 0,sequence,frame,id,label,resident_x_nose,resident_x_left_ear,resident_x_right_ear,resident_x_neck,resident_x_left_hip,resident_x_right_hip,...,intruder_x_left_hip,intruder_x_right_hip,intruder_x_tail_base,intruder_y_nose,intruder_y_left_ear,intruder_y_right_ear,intruder_y_neck,intruder_y_left_hip,intruder_y_right_hip,intruder_y_tail_base
0,task1/train/mouse001_task1_annotator1,0,task1/train/mouse001_task1_annotator10,3,831.659204,805.659204,775.659204,780.659204,711.659204,711.659204,...,796.915924,840.915924,766.915924,253.216902,195.216902,193.216902,179.216902,152.216902,102.216902,97.216902
1,task1/train/mouse001_task1_annotator1,1,task1/train/mouse001_task1_annotator11,1,833.050439,809.050439,778.050439,783.050439,723.050439,717.050439,...,799.907019,846.907019,766.907019,259.539977,204.539977,201.539977,188.539977,153.539977,105.539977,98.539977
2,task1/train/mouse001_task1_annotator1,2,task1/train/mouse001_task1_annotator12,1,838.718976,816.718976,776.718976,787.718976,730.718976,713.718976,...,800.195703,860.195703,777.195703,256.902935,208.902935,205.902935,193.902935,150.902935,112.902935,99.902935
3,task1/train/mouse001_task1_annotator1,3,task1/train/mouse001_task1_annotator13,1,826.757507,815.757507,774.757507,785.757507,743.757507,711.757507,...,794.788861,856.788861,786.788861,263.420539,206.420539,206.420539,193.420539,147.420539,113.420539,97.420539
4,task1/train/mouse001_task1_annotator1,4,task1/train/mouse001_task1_annotator14,1,822.045709,812.045709,768.045709,779.045709,749.045709,709.045709,...,789.578644,862.578644,793.578644,263.366469,202.366469,201.366469,190.366469,143.366469,120.366469,95.366469


In [24]:
# Save the dataframe to a CSV file so it can be reused without re-running the
# preprocessing. No `index` argument is passed, so pandas' default applies and
# the row index is written as well, as a first column without a name.
df.to_csv('calms21_task_data.csv',header=True)