## Handy Utils to Inspect a Parquet File

In [25]:

import pandas as pd
import glob

# Directory whose *.parquet files are all loaded into one combined DataFrame
INPUT_DATA_DIR = 'data/granite-docs/output_final'

print ('Loading data from : ', INPUT_DATA_DIR)

# Collect every Parquet file in the directory. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# the combined DataFrame reproducible across runs and machines.
parquet_files = sorted(glob.glob(f'{INPUT_DATA_DIR}/*.parquet'))
print ("Number of parquet files to read : ", len(parquet_files))
print ()

# Read each Parquet file into its own DataFrame, reporting its row count
dfs = []
for file in parquet_files:
    df = pd.read_parquet(file)
    print (f"Read file: '{file}'.  number of rows = {df.shape[0]}")
    dfs.append(df)

# pd.concat raises "ValueError: No objects to concatenate" on an empty list,
# so fall back to an empty DataFrame when no file matched. This keeps data_df
# defined for the cells below instead of failing with an opaque error.
if dfs:
    data_df = pd.concat(dfs, ignore_index=True)
else:
    print (f"No parquet files found in '{INPUT_DATA_DIR}'")
    data_df = pd.DataFrame()

print (f"\nTotal number of rows = {data_df.shape[0]}")

Loading data from :  data/granite-docs/output_final
Number of parquet files to read :  1

Read file: 'data/granite-docs/output_final/Granite_Foundation_Models.parquet'.  number of rows = 216

Total number of rows = 216


In [26]:
# Print a summary of the combined DataFrame: index range, column count,
# and each column's non-null count and dtype.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   filename                      216 non-null    object 
 1   num_pages                     216 non-null    int64  
 2   num_tables                    216 non-null    int64  
 3   num_doc_elements              216 non-null    int64  
 4   document_id                   216 non-null    object 
 5   ext                           216 non-null    object 
 6   hash                          216 non-null    object 
 7   size                          216 non-null    int64  
 8   date_acquired                 216 non-null    object 
 9   pdf_convert_time              216 non-null    float64
 10  source_filename               216 non-null    object 
 11  contents                      216 non-null    object 
 12  doc_jsonpath                  216 non-null    object 
 13  page_

In [27]:
# Display the combined DataFrame as the cell's output; the rendered view below
# is truncated, with elided rows and columns shown as "...".
data_df

Unnamed: 0,filename,num_pages,num_tables,num_doc_elements,document_id,ext,hash,size,date_acquired,pdf_convert_time,...,docq_symbol_to_word_ratio,docq_sentence_count,docq_lorem_ipsum_ratio,docq_curly_bracket_ratio,docq_contain_bad_word,docq_bullet_point_ratio,docq_ellipsis_line_ratio,docq_alphabet_word_ratio,docq_contain_common_en_words,embeddings
0,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,2,0.0,0.0,False,0.0,0.0,1.000000,True,"[-0.007855933, 0.0186794, 0.04243699, -0.01503..."
1,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,1,0.0,0.0,False,0.0,0.0,1.000000,False,"[-0.003576741, 0.009818693, 0.03441964, -0.006..."
2,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,6,0.0,0.0,False,0.0,0.0,1.000000,True,"[-0.022207905, 0.0050711543, 0.022928601, -0.0..."
3,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,18,0.0,0.0,False,0.0,0.0,0.970370,True,"[0.003014248, -0.0033354084, 0.007243886, -0.0..."
4,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,4,0.0,0.0,False,0.0,0.0,0.981481,True,"[-0.040923186, -0.045934904, 0.01606972, -0.05..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,2,0.0,0.0,False,0.0,0.0,0.931034,False,"[-0.0014148784, -0.013712707, 0.01091104, -0.0..."
212,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,551,0.0,0.0,False,0.0,0.0,0.597804,False,"[-0.08738202, -0.048929326, -0.009466442, 0.05..."
213,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,7,0.0,0.0,False,0.0,0.0,0.982456,True,"[-0.06721652, -0.030914942, -0.018033946, -0.0..."
214,Granite_Foundation_Models.pdf,20,13,445,6b7b64a5-e18c-4dff-8ade-8b5703db74e1,pdf,a30250ea1dbc6510350d81860e86748e99709566030a50...,357701,2024-08-06T21:56:44.353184,70.621034,...,0.0,3,0.0,0.0,False,0.0,0.0,0.920000,False,"[-0.018961014, -0.036238465, -0.016582817, -0...."
