# Little/Big Endian Hex Set Analysis

In this notebook we analyze the little/big endian hex dataset (its train and test splits, stored as parquet files) to get a sense of the data we are dealing with.

## Imports

In [8]:
import os
import pandas as pd
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor

## Main Functions

In [9]:
def read_parquet_file(file_path):
    """Load one parquet file and return its contents as a pandas DataFrame.

    Args:
        file_path: Path of the parquet file, passed straight to
            ``pyarrow.parquet.read_table``.

    Returns:
        The table converted with ``to_pandas()``.
    """
    table = pq.read_table(file_path)
    return table.to_pandas()

def read_parquet_dir(dir_path):
    """Read every ``.parquet`` file under ``dir_path`` into one DataFrame.

    The directory tree is walked recursively with ``os.walk``; files are read
    concurrently in a single thread pool and concatenated in the order in
    which they were discovered. The original per-file index values are kept
    (no ``ignore_index``).

    Args:
        dir_path: Root directory to search for files ending in ``.parquet``.

    Returns:
        A DataFrame with the rows of all files found, or an empty DataFrame
        when no parquet file exists under ``dir_path``.
    """
    # Collect all paths up front so one executor serves the whole tree instead
    # of creating a new pool for every directory visited by os.walk.
    file_paths = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(dir_path)
        for file_name in files
        if file_name.endswith('.parquet')
    ]

    # pd.concat raises on an empty sequence, so handle "nothing found" here.
    if not file_paths:
        return pd.DataFrame()

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of file_paths.
        frames = list(executor.map(read_parquet_file, file_paths))

    # One concat at the end avoids re-copying the accumulated frame per file.
    return pd.concat(frames)

def feature_rows(df, column='endianness'):
    """Count how many rows carry each distinct value of ``column``.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to tally. Defaults to ``'endianness'``,
            which is what existing callers rely on.

    Returns:
        The result of ``df[column].value_counts()``: a Series indexed by the
        distinct values, holding their row counts.
    """
    return df[column].value_counts()

def average_row_length(df):
    """Return the mean of ``len()`` over the entries of the ``data`` column.

    Args:
        df: DataFrame with a ``data`` column whose entries support ``len()``.

    Returns:
        The arithmetic mean of the per-row lengths.
    """
    data_column = df['data']
    row_lengths = data_column.apply(len)
    return row_lengths.mean()

def adjacent_features(df, feature_columns):
    """Count adjacent equal rows within each run of identical feature values.

    Rows are compared on ``feature_columns`` only. For every maximal run of
    consecutive identical rows, one entry is produced: the number of adjacent
    equal pairs inside that run, i.e. the run length minus one. A row that
    differs from both neighbours therefore contributes ``0``.

    Args:
        df: DataFrame holding the feature columns.
        feature_columns: Column label or list of labels used to select the
            values that are compared between neighbouring rows.

    Returns:
        A list with one count per run, in row order. The final run is
        included. An empty frame yields an empty list.
    """
    rows = df[feature_columns].to_numpy().tolist()
    if not rows:
        return []

    run_counts = []
    matches_in_run = 0
    for previous_row, current_row in zip(rows, rows[1:]):
        if previous_row == current_row:
            matches_in_run += 1
        else:
            # The run ended: record it and start counting the next one.
            run_counts.append(matches_in_run)
            matches_in_run = 0

    # The last run is never followed by a differing row, so it has to be
    # flushed explicitly; otherwise it would be dropped from the result.
    run_counts.append(matches_in_run)
    return run_counts

## Run Analysis

### Get Datasets

In [10]:
# Locations of the train and test splits, relative to this notebook
data_dir_train = '../dataset/train'
data_dir_test = '../dataset/test'

# Load each split (train first, then test) into its own DataFrame
df_train, df_test = (
    read_parquet_dir(split_dir) for split_dir in (data_dir_train, data_dir_test)
)

### Average Row Length

In [11]:
# Compute the per-split means first, then report them
train_avg_row_length = average_row_length(df_train)
test_avg_row_length = average_row_length(df_test)
print(f'Average row length for train: {train_avg_row_length}')
print(f'Average row length for test: {test_avg_row_length}')

Average row length for train: 512.0
Average row length for test: 512.0


### Number of Feature Rows Per Endianness

In [5]:
# Tally rows per endianness value for each split, then report the counts
train_feature_rows = feature_rows(df_train)
test_feature_rows = feature_rows(df_test)
print(f'Feature rows for train: {train_feature_rows}')
print(f'Feature rows for test: {test_feature_rows}')

Feature rows for train: endianness
little    437687
big       430653
Name: count, dtype: int64
Feature rows for test: endianness
big       67368
little    61448
Name: count, dtype: int64


### Adjacent Features

In [6]:
# Per-run adjacency counts of the endianness column for each split
adjacent_features_test = adjacent_features(df_test, ["endianness"])
adjacent_features_train = adjacent_features(df_train, ["endianness"])
print(f'Adjacent feature list for test: {adjacent_features_test}')
print(f'Adjacent feature list for train: {adjacent_features_train}')

# adjacent_features may return an empty list; report NaN in that case instead
# of failing with ZeroDivisionError.
average_adjacent_test = (
    sum(adjacent_features_test) / len(adjacent_features_test)
    if adjacent_features_test else float('nan')
)
average_adjacent_train = (
    sum(adjacent_features_train) / len(adjacent_features_train)
    if adjacent_features_train else float('nan')
)
print(f'Average number of adjacent features for test: {average_adjacent_test}')
print(f'Average number of adjacent features for train: {average_adjacent_train}')

Adjacent feature list for test: [469, 1, 121, 40, 170, 2, 8, 49, 4, 37, 6, 2, 3, 428, 14, 0, 4, 56, 2, 124, 2, 1, 15, 170, 749, 310, 32, 3, 10, 1, 12, 4, 2, 7, 3, 4, 137, 41, 10, 150, 2, 755, 10, 12, 5, 1, 308, 0, 79, 0, 1422, 0, 3, 0, 0, 1, 194, 28, 22, 101, 184, 1396, 87, 10, 118, 323, 6, 52, 18, 0, 13, 22, 4, 2, 0, 166, 0, 8, 2, 2, 26, 6, 3, 103, 2, 20, 28, 57, 1, 69, 10, 4, 8, 1, 16, 10, 0, 0, 74, 0, 435, 2, 25, 103, 0, 13, 52, 2, 12, 43, 2, 0, 37, 0, 19, 35, 5, 29, 19, 141, 0, 86, 2, 249, 2, 21, 44, 1, 1, 181, 3, 419, 8, 7, 10, 115, 0, 132, 20, 177, 4, 6, 18, 5, 14, 31, 28, 11, 0, 1, 4, 1, 16, 3, 10, 1, 156, 12, 20, 24, 20, 81, 11, 11, 453, 2, 1, 104, 6, 152, 6, 9, 0, 97, 70, 1, 35, 36, 3, 15, 136, 348, 15, 6, 2, 2, 7, 0, 652, 20, 11, 2, 15, 15, 46, 5, 139, 10, 42, 1, 223, 51, 5, 75, 1, 77, 6, 1, 24, 6, 460, 3, 277, 2, 2, 8, 6, 3, 42, 2, 0, 99, 2, 1, 15, 11, 1, 49, 36, 9, 5, 75, 143, 15, 4, 519, 5, 12, 1, 1, 38, 21, 1022, 1, 112, 1, 0, 2, 2, 10, 2, 3, 123, 126, 30, 2, 94, 14, 20, 

In [7]:
# Preview the first rows of the training split
preview_rows = 100
df_train.head(preview_rows)

Unnamed: 0,data,endianness
0,074 7270 786f 5f79 5f43 6547 4d74 6365 6168 6...,big
1,e75 6973 6e67 6465 6920 746e 4300 5f4b 5f43 65...,big
2,56 6972 7966 6946 616e 006c 7270 786f 5f79 5f...,big
3,5f 7465 654d 6863 6e61 7369 496d 666e 006f 317...,big
4,6 616e 006c 3150 5f31 5052 5f43 4143 4c4c 435f...,big
...,...,...
95,3136 435f 445f 6769 7365 5574 6470 7461 0065...,big
96,6e69 6c61 6600 7869 6465 3132 435f 565f 7265 ...,big
97,3364 5f32 5f43 6953 6e67 6e49 7469 6600 7869 6...,big
98,3464 5f35 5f43 6944 6567 7473 654b 0079 6966 6...,big
