In [5]:
import pandas as pd
import os

# Fuzzed dataset variants to verify; each folder is scanned for entry_*.csv files
directories = [
    '../text_and_seq_fuzzed_ambiguity_codes/',
    '../text_and_seq_fuzzed_random_character_insertion/',
    '../text_and_seq_fuzzed_numerical_interleaving/',
    '../text_and_seq_fuzzed_case_and_whitespace/',
]


def _list_entry_files(folder):
    """Return the sorted names of the ``entry_*.csv`` files found in ``folder``."""
    names = []
    for name in os.listdir(folder):
        if name.startswith('entry_') and name.endswith('.csv'):
            names.append(name)
    # Plain string sort, so 'entry_10.csv' orders before 'entry_2.csv'
    names.sort()
    return names


# Map every directory to its entry files, reporting the count as we go
all_entries = {}
for entry_dir in directories:
    entry_files = _list_entry_files(entry_dir)
    all_entries[entry_dir] = entry_files
    print(f"{entry_dir}: Found {len(entry_files)} entry files")

../text_and_seq_fuzzed_ambiguity_codes/: Found 200 entry files
../text_and_seq_fuzzed_random_character_insertion/: Found 200 entry files
../text_and_seq_fuzzed_numerical_interleaving/: Found 200 entry files
../text_and_seq_fuzzed_case_and_whitespace/: Found 200 entry files


In [6]:
# Process all entries from all directories and verify conditions
#
# For every row of every entry CSV this cell runs `check` on two columns:
#   * 'sequence'          -> expected to give a truthy result
#   * 'text_without_dna'  -> expected to give a falsy result
# Violations are collected as messages in `errors`; one record per row is
# collected in `results` and turned into `results_df` for the summaries.
from simple_check import check

# Column layout of the per-row results table. It is passed explicitly to
# pd.DataFrame so that the summaries below still work (all counts 0) when no
# rows were processed; a frame built from an empty list has no columns and
# indexing it by column name would raise KeyError.
result_columns = [
    'directory',
    'entry',
    'row',
    'sequence_has_match',
    'text_without_dna_has_match',
]

# Upper bound on how many error messages are echoed at the end of the cell
max_errors_shown = 10

results = []
errors = []

for entry_dir in directories:
    entry_files = all_entries[entry_dir]

    for entry_file in entry_files:
        entry_path = os.path.join(entry_dir, entry_file)
        # Read every cell as a plain string. With pandas' defaults, cells such
        # as "NA", "nan" or an empty string are converted to float NaN (and
        # all-digit cells to numbers) before they ever reach `check`.
        # NOTE(review): assumes `check` expects str input -- confirm.
        df = pd.read_csv(entry_path, dtype=str, keep_default_na=False)

        # 'entry_123.csv' -> '123' (strip the fixed prefix and suffix)
        entry_num = entry_file[len('entry_'):-len('.csv')]

        # The directory strings already end with '/', so strip it before
        # joining to avoid a double slash in the reported location
        location = f"{entry_dir.rstrip('/')}/entry_{entry_num}"

        # Check each row in the dataframe
        for idx, row in df.iterrows():
            sequence_value = row['sequence']
            text_without_dna = row['text_without_dna']

            # Check 1: sequence should contain DNA
            sequence_has_dna = check(sequence_value)
            if not sequence_has_dna:
                errors.append(f"{location}, row {idx}: sequence column does NOT contain DNA pattern")

            # Check 2: text_without_dna should NOT contain DNA
            text_has_dna = check(text_without_dna)
            if text_has_dna:
                errors.append(f"{location}, row {idx}: text_without_dna column DOES contain DNA pattern")

            results.append({
                'directory': entry_dir,
                'entry': entry_num,
                'row': idx,
                'sequence_has_match': sequence_has_dna,
                'text_without_dna_has_match': text_has_dna
            })

results_df = pd.DataFrame(results, columns=result_columns)
total_entries = sum(len(files) for files in all_entries.values())
print(f"Processed {len(results_df)} rows from {total_entries} entries across {len(directories)} directories")
print(f"\nSummary:")
print(f"  Sequence matches: {results_df['sequence_has_match'].sum()}/{len(results_df)}")
print(f"  Text without DNA matches: {results_df['text_without_dna_has_match'].sum()}/{len(results_df)}")
print(f"  Errors found: {len(errors)}")

# Summary by directory
print(f"\nSummary by directory:")
for entry_dir in directories:
    dir_results = results_df[results_df['directory'] == entry_dir]
    dir_sequence_matches = dir_results['sequence_has_match'].sum()
    dir_text_matches = dir_results['text_without_dna_has_match'].sum()
    print(f"  {entry_dir}:")
    print(f"    Sequence matches: {dir_sequence_matches}/{len(dir_results)}")
    print(f"    Text without DNA matches: {dir_text_matches}/{len(dir_results)}")

if errors:
    print(f"\n❌ ERRORS DETECTED:")
    for error in errors[:max_errors_shown]:  # Show only the first few errors
        print(f"  - {error}")
    if len(errors) > max_errors_shown:
        print(f"  ... and {len(errors) - max_errors_shown} more errors")
else:
    print(f"\n✅ All checks passed! Sequence column has DNA, text_without_dna has no DNA.")

Processed 800 rows from 800 entries across 4 directories

Summary:
  Sequence matches: 800/800
  Text without DNA matches: 0/800
  Errors found: 0

Summary by directory:
  ../text_and_seq_fuzzed_ambiguity_codes/:
    Sequence matches: 200/200
    Text without DNA matches: 0/200
  ../text_and_seq_fuzzed_random_character_insertion/:
    Sequence matches: 200/200
    Text without DNA matches: 0/200
  ../text_and_seq_fuzzed_numerical_interleaving/:
    Sequence matches: 200/200
    Text without DNA matches: 0/200
  ../text_and_seq_fuzzed_case_and_whitespace/:
    Sequence matches: 200/200
    Text without DNA matches: 0/200

✅ All checks passed! Sequence column has DNA, text_without_dna has no DNA.


In [7]:
# Display detailed results for verification
#
# With errors: print counts grouped by issue type and by (directory, issue).
# Without errors: print, per directory, the first 5 results whose row index is 0.
if len(errors) > 0:
    print("\nDetailed error breakdown:")
    # Each message has the form "<directory>/entry_<n>, row <i>: <issue>"
    error_records = []
    for message in errors:
        # Cut at the first ': ' only, so a message always yields exactly two
        # fields even if the issue text itself contains ': '
        location, _, issue = message.partition(': ')
        # The directories are relative paths starting with '../', so taking
        # the first '/'-separated piece would give '..' for every error and
        # collapse the per-directory breakdown into one group. Cut at the
        # last '/entry_' instead and drop any trailing slash.
        directory = location.rsplit('/entry_', 1)[0].rstrip('/')
        error_records.append({
            'location': location,
            'issue': issue,
            'directory': directory,
        })
    error_df = pd.DataFrame(error_records, columns=['location', 'issue', 'directory'])
    print("\nBy issue type:")
    print(error_df.groupby('issue').size())
    print("\nBy directory:")
    print(error_df.groupby(['directory', 'issue']).size())
else:
    print("\nAll entries verified successfully!")
    print("\nSample verification (first 5 entries per directory):")
    for entry_dir in directories:
        dir_results = results_df[results_df['directory'] == entry_dir]
        # Keep one line per entry (its row 0) and show at most five of them
        sample_results = dir_results[dir_results['row'] == 0].head(5)
        if len(sample_results) > 0:
            print(f"\n{entry_dir}:")
            print(sample_results[['entry', 'sequence_has_match', 'text_without_dna_has_match']])


All entries verified successfully!

Sample verification (first 5 entries per directory):

../text_and_seq_fuzzed_ambiguity_codes/:
  entry  sequence_has_match  text_without_dna_has_match
0     0                True                       False
1     1                True                       False
2    10                True                       False
3   100                True                       False
4   101                True                       False

../text_and_seq_fuzzed_random_character_insertion/:
    entry  sequence_has_match  text_without_dna_has_match
200     0                True                       False
201     1                True                       False
202    10                True                       False
203   100                True                       False
204   101                True                       False

../text_and_seq_fuzzed_numerical_interleaving/:
    entry  sequence_has_match  text_without_dna_has_match
400     0               

In [8]:
# Optional: display the per-row results table built by the verification cell
# (one record per checked row: directory, entry, row, and the two check
# outcomes). The notebook renders long frames as a truncated preview.
results_df


Unnamed: 0,directory,entry,row,sequence_has_match,text_without_dna_has_match
0,../text_and_seq_fuzzed_ambiguity_codes/,0,0,True,False
1,../text_and_seq_fuzzed_ambiguity_codes/,1,0,True,False
2,../text_and_seq_fuzzed_ambiguity_codes/,10,0,True,False
3,../text_and_seq_fuzzed_ambiguity_codes/,100,0,True,False
4,../text_and_seq_fuzzed_ambiguity_codes/,101,0,True,False
...,...,...,...,...,...
795,../text_and_seq_fuzzed_case_and_whitespace/,95,0,True,False
796,../text_and_seq_fuzzed_case_and_whitespace/,96,0,True,False
797,../text_and_seq_fuzzed_case_and_whitespace/,97,0,True,False
798,../text_and_seq_fuzzed_case_and_whitespace/,98,0,True,False
