In [1]:
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import json
import os
import pyarrow.parquet as pq

In [2]:
import psutil

def get_storage_space(path='/'):
    """Print disk usage figures for the filesystem that contains ``path``.

    Args:
        path: Any path located on the filesystem to inspect. Defaults to
            ``'/'``, which is what this function always inspected before the
            parameter existed.

    Returns:
        None. The figures are written to stdout only; nothing is returned.
    """
    # Sizes from psutil are divided by 1024 ** 3 so they can be printed with
    # the "GB" label used in the messages below.
    bytes_per_gb = 1024 ** 3

    # Single call that yields total / used / free / percent for the mount.
    disk_usage = psutil.disk_usage(path)

    print(f"Total Space: {disk_usage.total / bytes_per_gb:.2f} GB")
    print(f"Used Space: {disk_usage.used / bytes_per_gb:.2f} GB")
    print(f"Free Space: {disk_usage.free / bytes_per_gb:.2f} GB")
    print(f"Percentage Used: {disk_usage.percent:.2f}%")

# Print the current disk usage figures with the function's default arguments.
get_storage_space()


Total Space: 31.33 GB
Used Space: 23.19 GB
Free Space: 6.52 GB
Percentage Used: 78.00%


In [16]:
# Optional one-time setup, intentionally left disabled: uncomment a line to
# install a parquet engine into the kernel's environment.
#%pip install fastparquet
#%pip install pyarrow

In [7]:
# Root of the raw dataset. Every input path below is derived from it so the
# location only has to be changed in one place.
DATA_DIR = "/workspace/Cohort8-Ransom-Kuti-Ladipo/linguify_yb/data/raw"

# Landmark parquet file inspected in this notebook.
file_path = f"{DATA_DIR}/train_landmarks/105143404.parquet"

# Read from file_path itself (instead of repeating the literal) so the frame
# and the path variable cannot silently point at different files.
parquet_file = pd.read_parquet(file_path)

# Metadata table; the preview further down shows one row per sequence with the
# parquet path, file_id, sequence_id, participant_id and phrase.
meta_data = pd.read_csv(f"{DATA_DIR}/train.csv")


In [3]:
# Preview the first two rows of the landmark frame read from the parquet file.
parquet_file.head(2)

Unnamed: 0_level_0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1784552841,0,0.49232,0.482797,0.485312,0.471907,0.481505,0.480572,0.478806,0.412168,0.475747,...,-0.096554,-0.104598,-0.074198,-0.107076,-0.111353,-0.108775,-0.09011,-0.112195,-0.111315,-0.1076
1784552841,1,0.492477,0.481592,0.48395,0.470245,0.480239,0.479128,0.47689,0.411729,0.473802,...,-0.05266,-0.054204,-0.052768,-0.063956,-0.054388,-0.046519,-0.065723,-0.066215,-0.052121,-0.041132


In [4]:
# Preview the first two rows of the metadata table read from train.csv.
meta_data.head(2)

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah


In [5]:
# NOTE(review): presumably the number of frames kept per sequence; it is not
# used by any cell visible here -- confirm against the preprocessing code.
FRAME_LEN = 128

# Pose landmark indices kept for the left and right side of the body.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

# Column names per axis, ordered right hand, left hand, pose. Names follow the
# "<axis>_<part>_<index>" pattern seen in the parquet preview (x_face_0,
# z_right_hand_11), so pose columns need the underscore before the index.
X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

# Full ordered list of selected columns: all x, then all y, then all z.
FEATURE_COLUMNS = X + Y + Z
# Misspelled original name, kept as an alias so existing references keep working.
FEATURE_COLIMNS = FEATURE_COLUMNS

# Positions inside FEATURE_COLUMNS that belong to each axis.
X_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("x_")]
Y_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("y_")]
Z_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("z_")]

# Positions inside FEATURE_COLUMNS that belong to each body part (all axes).
RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "right_hand" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "left_hand" in col]
# The landmark index is whatever follows the last underscore, which works for
# any number of digits.
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "pose" in col and int(col.rsplit("_", 1)[-1]) in RPOSE]
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "pose" in col and int(col.rsplit("_", 1)[-1]) in LPOSE]


In [6]:
# Select every metadata row whose file_id matches the chosen landmark file,
# then display the selection.
sample_id = 105143404
sample = meta_data.loc[meta_data["file_id"].eq(sample_id)]
sample

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
999,train_landmarks/105143404.parquet,105143404,1784552841,188,https://www.xn--pearol-xwa.org
1000,train_landmarks/105143404.parquet,105143404,1784574169,89,+44-645-14-4280-31
1001,train_landmarks/105143404.parquet,105143404,1784587545,219,dan navarro
1002,train_landmarks/105143404.parquet,105143404,1784630915,81,mephimle.com/shine-papeleria/
1003,train_landmarks/105143404.parquet,105143404,1784672555,21,facartes.uniandes.edu.co
...,...,...,...,...,...
1994,train_landmarks/105143404.parquet,105143404,1816603189,70,6114 mueschke road
1995,train_landmarks/105143404.parquet,105143404,1816654556,246,2762 valmora dr
1996,train_landmarks/105143404.parquet,105143404,1816664295,107,1481 carson williams
1997,train_landmarks/105143404.parquet,105143404,1816689381,153,6401 west pleasant lake drive


In [None]:
# Read only sequence_id plus the selected feature columns from the parquet
# file, convert the result to a pandas DataFrame and display it.
saved_df = pq.read_table(
    file_path,
    columns=['sequence_id', *FEATURE_COLUMNS],
).to_pandas()
saved_df