In [27]:
import pandas as pd
import json
import os

# Set the Filename and Validate the Paths

In [28]:
# Name of the raw data file to process (edit this line to switch datasets).
fn = '23-50-azureO1.csv'

# Split on the LAST dot only, so base names that themselves contain dots
# (e.g. 'run.v2.csv') are accepted instead of being rejected as invalid.
# `fn` stays a list of name parts because the report cell re-joins it with '.'.
fn = fn.rsplit('.', 1)

if len(fn) == 2:
    filename, extension = fn
else:
    # No extension supplied: default to CSV.
    filename = fn[0]
    extension = 'csv'

# An empty base name (e.g. '' or '.csv') cannot identify a dataset.
if not filename:
    raise ValueError('Invalid filename')

# Both inputs are derived from the base name: the validation progress log
# and the raw data file it refers to.
progress_file_path = f'./progress/validationProgress-{filename}.txt'
raw_data_file_path = f'./data/raw/{filename}.{extension}'

# Fail fast, naming the missing path, before any later cell tries to read it.
if not os.path.exists(progress_file_path):
    raise FileNotFoundError(f'Progress file not found: {progress_file_path}')
if not os.path.exists(raw_data_file_path):
    raise FileNotFoundError(f'Raw data file not found: {raw_data_file_path}')

# Read File

In [29]:
# Load the raw rows that the validation marks refer to.
df = pd.read_csv(raw_data_file_path)

# Load the validation progress, which is JSON despite the .txt extension.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale (the default when `encoding` is omitted), which can mis-decode
# non-ASCII content on Windows.
with open(progress_file_path, 'r', encoding='utf-8') as progress_file:
    validate_data = json.load(progress_file)

# Split Data

In [30]:
# Collect, for each mark, the row positions recorded in the validation history
# (history order is preserved within each list).
history = validate_data['history']
ok_indices, weird_indices, skip_indices = (
    [entry['rowIndex'] for entry in history if entry['mark'] == label]
    for label in ('ok', 'weird', 'skip')
)

# Select the matching rows by position: one DataFrame per mark.
filtered_df, weird_df, skip_df = (
    df.iloc[positions]
    for positions in (ok_indices, weird_indices, skip_indices)
)

# Analyze The Data

In [31]:
# Summary of the split: row counts per bucket for the selected file.
print('File:','.'.join(fn))
print('Total data shape:',df.shape[0])
print('Filtered data shape:',filtered_df.shape[0])
print('Weird data shape:',weird_df.shape[0])
print('Skipped data shape:',skip_df.shape[0])

# The ratio is weird rows per ok row (weird / ok, not weird / total), taken
# from the counters stored in the progress file. Guard the division so a file
# with no rows marked ok reports 'n/a' instead of raising ZeroDivisionError.
stats = validate_data['stats']
if stats['ok']:
    weird_ratio = stats['weird'] / stats['ok']
else:
    weird_ratio = 'n/a (no rows marked ok)'
print('Weird ratio:', weird_ratio)

File: 23-50-azureO1.csv
Total data shape: 50
Filtered data shape: 41
Weird data shape: 9
Skipped data shape: 0
Weird ratio: 0.21951219512195122


# Write File

In [32]:
# Each dataset gets its own output folder under ./data/filtered/.
output_dir = f'./data/filtered/{filename}'
os.makedirs(output_dir, exist_ok=True)

# Rows marked ok are saved as "approved", rows marked weird as "rejected";
# skipped rows are not written.
for frame, suffix in ((filtered_df, 'approved'), (weird_df, 'rejected')):
    frame.to_csv(f'{output_dir}/{filename}_{suffix}.csv', index=False)