# Little/Big Endian Hex Set Analysis

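In this notebook we analyze the little/big-endian hex dataset (the train and test splits, stored as parquet files) to get a sense of the data we are dealing with: how long the rows are, how many rows carry each endianness label, and how the labels are ordered relative to one another.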

## Imports

In [2]:
import os
import pandas as pd
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor

## Main Functions

In [9]:
def read_parquet_file(file_path):
    """Load a single parquet file into a pandas DataFrame.

    Args:
        file_path: Path of the parquet file to read.

    Returns:
        The file's contents converted to a pandas DataFrame.
    """
    table = pq.read_table(file_path)
    return table.to_pandas()

def read_parquet_dir(dir_path):
    """Read every parquet file under a directory tree into one DataFrame.

    The tree rooted at ``dir_path`` is walked recursively with ``os.walk``.
    Each file whose name ends in ``.parquet`` is loaded with
    ``read_parquet_file`` on a thread pool, and the resulting frames are
    concatenated in the order the files were discovered. The row index of
    each file is kept as-is (rows are not renumbered).

    Args:
        dir_path: Root directory to search.

    Returns:
        A pandas DataFrame holding the rows of all files found, or an empty
        DataFrame when no parquet file exists under ``dir_path``.
    """
    # Gather all paths first so a single thread pool serves the whole tree
    # instead of creating one pool per visited directory.
    file_paths = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(dir_path)
        for file_name in files
        if file_name.endswith('.parquet')
    ]
    if not file_paths:
        # pd.concat raises ValueError on an empty list of frames.
        return pd.DataFrame()

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so row order is stable
        # with respect to the discovered file order.
        frames = list(executor.map(read_parquet_file, file_paths))

    # Concatenate once: growing a frame inside the loop re-copies all
    # previously accumulated rows for every additional file.
    return pd.concat(frames)

def feature_rows(df, column='endianness'):
    """Count how many rows carry each distinct value of a label column.

    Args:
        df: DataFrame containing the label column.
        column: Name of the column to tally. Defaults to ``'endianness'``.

    Returns:
        A pandas Series mapping each distinct value to its row count, as
        produced by ``Series.value_counts`` (most frequent value first).
    """
    return df[column].value_counts()

def average_row_length(df):
    """Return the mean ``len()`` of the entries in the ``'data'`` column.

    Args:
        df: DataFrame with a ``'data'`` column whose entries support ``len``.

    Returns:
        The arithmetic mean of the per-row lengths.
    """
    row_lengths = df['data'].apply(len)
    return row_lengths.mean()

def adjacent_features(df, feature_columns):
    """Measure how long identical labels stay adjacent in row order.

    Consecutive rows whose ``feature_columns`` values are equal form a run.
    For every run the number of adjacent equal pairs inside it is recorded,
    so a run of ``k`` rows contributes ``k - 1``.

    Args:
        df: DataFrame to scan, in its current row order.
        feature_columns: Column label, or list of column labels, whose
            values are compared between neighbouring rows.

    Returns:
        A list with one count per run, in order of appearance, including
        the final run. The list is empty only when ``df`` has no rows.
    """
    endianness_list = df[feature_columns].to_numpy().tolist()
    if not endianness_list:
        return []

    count_list = []
    count = 0
    for i in range(len(endianness_list) - 1):
        if endianness_list[i] == endianness_list[i + 1]:
            count += 1
        else:
            count_list.append(count)
            count = 0
    # The last run is never followed by a differing row, so it has to be
    # flushed here; otherwise it would be silently dropped.
    count_list.append(count)
    return count_list

## Run Analysis

### Get Datasets

In [4]:
# Directories of the train and test splits (relative paths, resolved against
# the kernel's current working directory)
data_dir_train = '../dataset/train'
data_dir_test = '../dataset/test'

# Load every parquet file found under each split directory into one DataFrame
df_train = read_parquet_dir(data_dir_train)
df_test = read_parquet_dir(data_dir_test)

### Average Row Length

In [5]:
# Mean len() of the 'data' column entries, reported per split
print(f'Average row length for train: {average_row_length(df_train)}')
print(f'Average row length for test: {average_row_length(df_test)}')

Average row length for train: 512.0
Average row length for test: 512.0


### Number of Feature Rows Per Endianness

In [6]:
# Row counts per 'endianness' label in each split (shows the class balance)
print(f'Feature rows for train: {feature_rows(df_train)}')
print(f'Feature rows for test: {feature_rows(df_test)}')

Feature rows for train: endianness
little    3554031
big       3497489
Name: count, dtype: int64
Feature rows for test: endianness
big       548401
little    500845
Name: count, dtype: int64


### Adjacent Features

In [12]:
# Per-run counts of identical consecutive 'endianness' labels, computed in the
# row order of each loaded split
adjacent_features_test = adjacent_features(df_test, ["endianness"])
adjacent_features_train = adjacent_features(df_train, ["endianness"])
# NOTE(review): these two prints emit the complete lists, so the cell output
# becomes very long; consider printing a slice or summary statistics instead.
print(f'Adjacent feature list for test: {adjacent_features_test}')
print(f'Adjacent feature list for train: {adjacent_features_train}')
# Mean of the per-run counts.
# NOTE(review): divides by len(...), so an empty list raises ZeroDivisionError.
print(f'Average number of adjacent features for test: {sum(adjacent_features_test) / len(adjacent_features_test)}')
print(f'Average number of adjacent features for train: {sum(adjacent_features_train) / len(adjacent_features_train)}')

Adjacent feature list for test: [297, 23, 2563, 36, 0, 106, 22, 16, 402, 206, 19, 338, 1, 82, 67, 5, 21, 343, 19, 4731, 0, 17, 32, 10, 432, 1, 1, 125, 62, 466, 370, 41, 15, 608, 98, 1911, 409, 0, 40, 33, 31, 1253, 460, 28, 55, 14, 2940, 1, 2, 797, 4785, 16, 4, 14, 980, 330, 1387, 24, 77, 3, 838, 135, 134, 2254, 30, 199, 569, 17, 134, 54, 548, 8, 427, 46, 133, 33, 84, 207, 969, 0, 34, 20, 43, 12, 10, 3, 857, 25, 1768, 1061, 5, 13, 49, 319, 61, 32, 39, 63, 10, 3378, 125, 7, 45, 135, 5, 327, 29, 1003, 27, 22, 38, 0, 99, 1374, 5977, 3, 29, 2493, 285, 41, 107, 21, 104, 42, 25, 79, 35, 40, 1124, 363, 90, 30, 2, 1183, 6, 16, 28, 6047, 100, 109, 48, 18, 35, 3, 2442, 10, 647, 14, 11389, 13, 44, 14, 13, 28, 1574, 239, 186, 823, 1503, 11177, 731, 88, 976, 1623, 3, 985, 75, 436, 158, 14, 97, 8, 44, 25, 6, 170, 0, 6, 40, 31, 19, 979, 6, 364, 23, 41, 1, 46, 28, 31, 231, 62, 34, 201, 3, 606, 0, 36, 27, 167, 241, 468, 15, 567, 88, 53, 19, 0, 61, 18, 141, 102, 7, 7, 619, 14, 3491, 24, 208, 852, 9, 123,

In [15]:
# Preview the first 100 rows of the training split
df_train.head(100)

Unnamed: 0,data,endianness
0,6820 6d74 206c 5550 4c42 4349 2220 2f2d 572f ...,big
1,4 7478 682f 6d74 3b6c 6320 6168 7372 7465 553d...,big
2,74 2272 6320 6e6f 6574 746e 223d 6f44 4263 6f6...,big
3,f62 7964 6920 3d64 6422 636f 6f43 746e 6e65 22...,big
4,3c73 632f 646f 3e65 2f3c 6874 3c3e 742f 3e72 ...,big
...,...,...
95,0000 002b 0000 0029 0000 0065 0000 0066 0000 ...,little
96,64a8 1100 0000 004c 0000 004d 0000 0051 0000...,little
97,00 0000 0000 0000 0000 0000 0000 0000 0000 000...,little
98,012 0000 0461 0000 0000 0000 0000 0000 0012 00...,little
