# Check Raw Data Schema

In [10]:
import fastavro
import pandas as pd

# Path to your Avro file
avro_file_path = '/Users/sjtok/misophoniaAR/1-1-13_1709922553.avro'

# Initialize an empty list to collect records
records = []

# Open the Avro file and create a reader
with open(avro_file_path, 'rb') as f:
    avro_reader = fastavro.reader(f)
    
    # Optionally print the schema of the Avro file
    print("Schema:", avro_reader.writer_schema)
    
    # Iterate over records and add them to the list
    for record in avro_reader:
        records.append(record)

# Convert the list of records into a pandas DataFrame
df = pd.DataFrame(records)

# Now you can work with the DataFrame as needed, for example, print the first few rows
print(df.head())


Schema: {'version': '6.3', 'type': 'record', 'doc': 'Session file for Empatica devices.', 'name': 'empatica.format.avro.Session', 'fields': [{'doc': 'Version of the AVRO file schema.', 'name': 'schemaVersion', 'type': {'type': 'record', 'name': 'empatica.format.avro.Version', 'fields': [{'doc': 'Major version.', 'name': 'major', 'type': 'int'}, {'doc': 'Minor version.', 'name': 'minor', 'type': 'int'}, {'doc': 'Patch version.', 'name': 'patch', 'type': 'int'}]}}, {'doc': 'Version of the device firmware.', 'name': 'fwVersion', 'type': 'empatica.format.avro.Version'}, {'doc': 'Version of the device hardware.', 'name': 'hwVersion', 'type': 'empatica.format.avro.Version'}, {'doc': 'Version of the processing algorithm.', 'name': 'algoVersion', 'type': 'empatica.format.avro.Version'}, {'default': 0, 'doc': 'Delta [s] from UTC [s] time at the location of the participant.', 'name': 'timezone', 'type': 'int'}, {'doc': 'Information of the participant enrolled.', 'name': 'enrollment', 'type': {'t

In [5]:
# Accelerometer
import fastavro
import pandas as pd

# Path to your Avro file
avro_file_path = '/Users/sjtok/misophoniaAR/1-1-13_1709922553.avro'

records = []

# Open the Avro file and create a reader
with open(avro_file_path, 'rb') as f:
    avro_reader = fastavro.reader(f)
    
    # Iterate over records and add them to the list
    for record in avro_reader:
        records.append(record)

# If the DataFrame is initialized
if records:
    df = pd.DataFrame(records)
    
    # Expand the 'rawData' nested dictionary into separate columns
    raw_data = pd.json_normalize(df['rawData'])

    # Further normalization can be done for deeper nested structures like accelerometer
    # This assumes there is at least one record and accelerometer data exists
    if not raw_data.empty and 'accelerometer' in raw_data.columns:
        accelerometer_data = pd.json_normalize(raw_data['accelerometer'][0])
        print(accelerometer_data.head())
    else:
        print("No accelerometer data available")

    # Print first few rows of the expanded raw data
    print(raw_data.head())
else:
    print("No records found in the Avro file.")


No accelerometer data available
   accelerometer.timestampStart  accelerometer.samplingFrequency  \
0              1709922553262413                         64.00071   

   accelerometer.imuParams.physicalMin  accelerometer.imuParams.physicalMax  \
0                                  -16                                   16   

   accelerometer.imuParams.digitalMin  accelerometer.imuParams.digitalMax  \
0                              -32768                               32768   

                                     accelerometer.x  \
0  [-544, -518, -540, -564, -554, -537, -513, -49...   

                                     accelerometer.y  \
0  [430, 468, 442, 532, 498, 516, 479, 455, 448, ...   

                                     accelerometer.z  \
0  [1914, 1920, 1887, 1928, 1909, 1918, 1949, 192...   

   gyroscope.timestampStart  ...  temperature.samplingFrequency  \
0                         0  ...                       0.999998   

                                  temperatu

In [15]:
# Checking the schema for the wearable sensors

import fastavro
import pandas as pd

# Path to your Avro file
avro_file_path = '/Users/sjtok/misophoniaAR/1-1-13_1709922553.avro'

records = []

# Open the Avro file and create a reader
with open(avro_file_path, 'rb') as f:
    avro_reader = fastavro.reader(f)
    print("Schema:", avro_reader.writer_schema)
    
    # Iterate over records and add them to the list
    for record in avro_reader:
        records.append(record)

# Check if any records were read
if records:
    # Print keys and sub-keys for 'rawData' of the first record that contains it
    if 'rawData' in records[0]:
        raw_data = records[0]['rawData']
        print("Keys in 'rawData':", raw_data.keys())
        for key in raw_data.keys():
            if isinstance(raw_data[key], dict):
                print(f"Sub-keys for {key}:", raw_data[key].keys())
            else:
                print(f"Data type for {key}:", type(raw_data[key]))
else:
    print("No records found in the Avro file.")


Schema: {'version': '6.3', 'type': 'record', 'doc': 'Session file for Empatica devices.', 'name': 'empatica.format.avro.Session', 'fields': [{'doc': 'Version of the AVRO file schema.', 'name': 'schemaVersion', 'type': {'type': 'record', 'name': 'empatica.format.avro.Version', 'fields': [{'doc': 'Major version.', 'name': 'major', 'type': 'int'}, {'doc': 'Minor version.', 'name': 'minor', 'type': 'int'}, {'doc': 'Patch version.', 'name': 'patch', 'type': 'int'}]}}, {'doc': 'Version of the device firmware.', 'name': 'fwVersion', 'type': 'empatica.format.avro.Version'}, {'doc': 'Version of the device hardware.', 'name': 'hwVersion', 'type': 'empatica.format.avro.Version'}, {'doc': 'Version of the processing algorithm.', 'name': 'algoVersion', 'type': 'empatica.format.avro.Version'}, {'default': 0, 'doc': 'Delta [s] from UTC [s] time at the location of the participant.', 'name': 'timezone', 'type': 'int'}, {'doc': 'Information of the participant enrolled.', 'name': 'enrollment', 'type': {'t