# Data Cleaning 

1. We noticed that scraped content after parsing to markdown had introduced a lot of whitespaces in the string
2. We use regex to remove all extra whitespaces to store the results in a new column "md_content_cleaned"
3. This script runs fairly quickly < 10min

In [6]:
import singlestoredb as s2
import re
from tqdm import tqdm

# Define regex pattern for cleaning text: collapse runs of spaces, tabs, and newlines
whitespace_pattern = re.compile(r'[ \t\r\n]+')

def clean_text(text):
    """Collapse all runs of whitespace (spaces, tabs, newlines) and trim."""
    return whitespace_pattern.sub(' ', text).strip()

def process_rows():
    # Connect to the SingleStore database
    conn = s2.connect(
        database='knowlagent'
    )

    try:
        with conn.cursor() as cursor:
            create_cleaned_md_column = """
            ALTER TABLE {TABLE_NAME} ADD COLUMN md_content_cleaned LONGTEXT NULL;
            """
            cursor.execute(create_cleaned_md_column)
            
            # Optionally, find the initial number of rows to process for a global progress bar.
            count_query = """
                SELECT COUNT(*)
                FROM s2docs
                WHERE (md_content IS NOT NULL OR length(md_content_cleaned)>10)
                AND (md_content_cleaned IS NULL OR md_content_cleaned = '');
            """
            cursor.execute(count_query)
            total_rows = cursor.fetchone()[0]
            pbar = tqdm(total=total_rows, desc="Processing rows", unit="row")

            # Query to select rows that need processing, batched by a LIMIT clause.
            select_sql = """
                SELECT id, md_content
                FROM s2docs
                WHERE (md_content IS NOT NULL OR length(md_content_cleaned)>10)
                AND (md_content_cleaned IS NULL OR md_content_cleaned = '')
                LIMIT 100;
            """
            # Prepared update statement for each row.
            update_sql = "UPDATE s2docs SET md_content_cleaned = %s WHERE id = %s"

            while True:
                cursor.execute(select_sql)
                rows = cursor.fetchall()
                if not rows:
                    pbar.close()
                    print("No more rows to process.")
                    break

                for row in rows:
                    # Assuming the cursor returns rows as tuples: (id, md_content)
                    row_id = row[0]
                    content = row[1]
                    cleaned_content = clean_text(content) if content else None
                    
                    cursor.execute(update_sql, (cleaned_content, row_id))
                    # Update progress bar by one for each row processed.
                    pbar.update(1)

                # Commit after each batch.
                    conn.commit()

    except Exception as e:
        print("An error occurred:", e)
        conn.rollback()
    finally:
        conn.close()

In [7]:
process_rows()

Processing rows:   9%|▉         | 3/32 [00:02<00:19,  1.47row/s]


saved 2251799813690948

saved 1125899906844708


Processing rows:  12%|█▎        | 4/32 [00:02<00:12,  2.16row/s]


saved 2251799813687692

saved 2251799813687760


Processing rows:  22%|██▏       | 7/32 [00:03<00:05,  4.42row/s]


saved 2251799814000585

saved 1125899906846146


Processing rows:  28%|██▊       | 9/32 [00:03<00:04,  5.73row/s]


saved 1125899906847443

saved 2251799813687081


Processing rows:  34%|███▍      | 11/32 [00:03<00:03,  6.66row/s]


saved 2251799813687291

saved 1125899906848330


Processing rows:  41%|████      | 13/32 [00:04<00:02,  7.19row/s]


saved 2251799813690641

saved 1125899906846726


Processing rows:  44%|████▍     | 14/32 [00:04<00:02,  6.91row/s]


saved 2251799813689527

saved 2251799813691561


Processing rows:  53%|█████▎    | 17/32 [00:04<00:02,  7.19row/s]


saved 1125899906847520

saved 1125899906845895


Processing rows:  59%|█████▉    | 19/32 [00:04<00:01,  7.42row/s]


saved 2251799813691506

saved 2251799813686548


Processing rows:  66%|██████▌   | 21/32 [00:05<00:01,  7.57row/s]


saved 2251799813687508

saved 2251799813686084


Processing rows:  72%|███████▏  | 23/32 [00:05<00:01,  7.64row/s]


saved 1125899906844796

saved 1125899906845887


Processing rows:  78%|███████▊  | 25/32 [00:05<00:00,  7.63row/s]


saved 1125899906847721

saved 2251799813689838


Processing rows:  81%|████████▏ | 26/32 [00:05<00:00,  7.46row/s]


saved 1125899906848545

saved 1125899906847563


Processing rows:  91%|█████████ | 29/32 [00:06<00:00,  7.68row/s]


saved 1125899906842741

saved 1125899906845708


Processing rows:  97%|█████████▋| 31/32 [00:06<00:00,  7.76row/s]


saved 2251799813689505

saved 2251799813690576


Processing rows: 100%|██████████| 32/32 [00:06<00:00,  7.76row/s]


saved 1125899906842640

saved 2251799813686111


Processing rows: 35row [00:09,  2.02row/s]                       


saved 2251799813690948

saved 1125899906844708


Processing rows: 36row [00:09,  2.59row/s]


saved 2251799813687692

saved 2251799813687760


Processing rows: 39row [00:10,  4.56row/s]


saved 2251799814000585

saved 1125899906846146


Processing rows: 41row [00:10,  5.76row/s]


saved 1125899906847443

saved 2251799813687081


Processing rows: 43row [00:10,  6.66row/s]


saved 2251799813687291

saved 1125899906848330


Processing rows: 45row [00:10,  7.18row/s]


saved 2251799813690641

saved 1125899906846726


Processing rows: 46row [00:11,  6.86row/s]


saved 2251799813689527

saved 2251799813691561


Processing rows: 49row [00:11,  7.20row/s]


saved 1125899906847520

saved 1125899906845895


Processing rows: 51row [00:11,  7.42row/s]


saved 2251799813691506

saved 2251799813686548


Processing rows: 53row [00:11,  7.56row/s]


saved 2251799813687508

saved 2251799813686084


Processing rows: 55row [00:12,  7.64row/s]


saved 1125899906844796

saved 1125899906845887


KeyboardInterrupt: 