In [1]:
import pandas as pd 
import os 

In [2]:
df = pd.read_csv('0_radiology_cleaned.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,context,impression
0,0,examination: chest (pa and lat) indication: wi...,impression: no acute cardiopulmonary process.
1,1,examination: liver or gallbladder us (single o...,impression: 1. nodular appearance of the liver...
2,2,"indication: hcv cirrhosis c b ascites, hiv on ...",impression: successful uncomplicated ultrasoun...
3,3,examination: ultrasound-guided paracentesis. i...,impression: uneventful therapeutic paracentesi...
4,4,examination: paracentesis indication: year old...,"impression: 4.75 l of slightly cloudy, blood t..."


In [4]:
df.shape

(2321355, 3)

In [5]:
import pandas as pd

def count_blank_impressions(df, column='impression'):
    """
    Counts and filters rows where the text starts with 'impression:' and
    nothing follows (or only whitespace follows).

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Name of the column containing impression text.

    Returns:
        tuple: (count, filtered_df) where
            count = number of blank impression rows
            filtered_df = dataframe containing only those rows
    """
    mask = df[column].str.match(r'^\s*impression:\s*$', na=False)
    blank_rows = df[mask]
    return len(blank_rows), blank_rows


In [6]:
count, blank_df = count_blank_impressions(df)
print(f"Number of blank impressions: {count}")
print(blank_df)

Number of blank impressions: 290413
         Unnamed: 0                                            context  \
62               62  indication: woman with liver disease, now with...   
74               74  bilateral breast ultrasound: please refer to c...   
82               82  indication: year old woman with right knee pai...   
84               84  addendum: there is bilateral sclerosis in the ...   
95               95  history: low back pain. lumbar spine, two view...   
...             ...                                                ...   
2321345     2321345  history: right ij placement. findings: in comp...   
2321346     2321346  chest, portable ap reason for exam: man with s...   
2321347     2321347  history: desaturation. findings: in comparison...   
2321348     2321348  indication: male with respiratory distress and...   
2321353     2321353  portable chest of comparison: radiograph. find...   

          impression  
62       impression:  
74       impression:  
82    

In [7]:
len(df)

2321355

In [8]:
import pandas as pd

def filter_nonblank_impressions(df, column='impression'):
    """
    Keeps only rows where 'impression:' is followed by some non-empty text.

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Name of the column containing impression text.

    Returns:
        pd.DataFrame: Filtered dataframe with only non-blank impressions.
    """
    # Boolean mask: True for non-blank impressions
    mask_nonblank = ~df[column].str.match(r'^\s*impression:\s*$', na=False)
    return df[mask_nonblank]


In [9]:
df_filter = filter_nonblank_impressions(df)

In [10]:
df_filter.shape

(2030942, 3)

In [11]:
len(df_filter)

2030942

In [12]:
from sklearn.model_selection import train_test_split


In [13]:
OUTPUT_DIR = "splits_cleantext_no_blanks_datastore_size_sweep_publish"
VAL_FRAC = 0.1
TEST_FRAC = 0.1
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [14]:
# ---------- STEP 3: Create train / val / test splits ----------
train_df, temp_df = train_test_split(df_filter, test_size=VAL_FRAC + TEST_FRAC, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=TEST_FRAC / (VAL_FRAC + TEST_FRAC), random_state=42)

print(f"üß† Train: {len(train_df)} | üß™ Val: {len(val_df)} | üî¨ Test: {len(test_df)}")

# ---------- STEP 4: Save splits ----------
train_df.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
val_df.to_csv(os.path.join(OUTPUT_DIR, "val.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)



üß† Train: 1624753 | üß™ Val: 203094 | üî¨ Test: 203095


In [15]:
def create_subsamples(folder, seed=42, fractions=(0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3)):
    """
    Subsample 10%, 20%, and 30% from train/val/test CSVs in `folder`
    and save them as new CSVs with suffixes _10p, _20p, _30p.
    """
    files = ["train.csv", "val.csv", "test.csv"]
    for file in files:
        path = os.path.join(folder, file)
        if not os.path.exists(path):
            print(f"‚ö†Ô∏è Skipping {file} ‚Äî not found.")
            continue

        df = pd.read_csv(path)
        print(f"üìÑ Loaded {file} ({len(df):,} rows)")

        for frac in fractions:
            subset = df.sample(frac=frac, random_state=seed)
            out_name = file.replace(".csv", f"_{int(frac*100)}p.csv")
            out_path = os.path.join(folder, out_name)
            subset.to_csv(out_path, index=False)
            print(f"‚úÖ Saved {len(subset):,} rows ‚Üí {out_path}")

    print("üéâ All subsamples created.")


# Example usage:
create_subsamples("splits_cleantext_no_blanks")

üìÑ Loaded train.csv (1,624,753 rows)
‚úÖ Saved 16,248 rows ‚Üí splits_cleantext_no_blanks/train_1p.csv
‚úÖ Saved 32,495 rows ‚Üí splits_cleantext_no_blanks/train_2p.csv
‚úÖ Saved 48,743 rows ‚Üí splits_cleantext_no_blanks/train_3p.csv
‚úÖ Saved 64,990 rows ‚Üí splits_cleantext_no_blanks/train_4p.csv
‚úÖ Saved 81,238 rows ‚Üí splits_cleantext_no_blanks/train_5p.csv
‚úÖ Saved 97,485 rows ‚Üí splits_cleantext_no_blanks/train_6p.csv
‚úÖ Saved 113,733 rows ‚Üí splits_cleantext_no_blanks/train_7p.csv
‚úÖ Saved 129,980 rows ‚Üí splits_cleantext_no_blanks/train_8p.csv
‚úÖ Saved 146,228 rows ‚Üí splits_cleantext_no_blanks/train_9p.csv
‚úÖ Saved 162,475 rows ‚Üí splits_cleantext_no_blanks/train_10p.csv
‚úÖ Saved 324,951 rows ‚Üí splits_cleantext_no_blanks/train_20p.csv
‚úÖ Saved 487,426 rows ‚Üí splits_cleantext_no_blanks/train_30p.csv
üìÑ Loaded val.csv (203,094 rows)
‚úÖ Saved 2,031 rows ‚Üí splits_cleantext_no_blanks/val_1p.csv
‚úÖ Saved 4,062 rows ‚Üí splits_cleantext_no_blanks/val_2p.csv

In [None]:
# Optional: also save plain text files (1 line per record)
#train_df["clean_text"].to_csv(os.path.join(OUTPUT_DIR, "train.txt"), index=False, header=False)
#val_df["clean_text"].to_csv(os.path.join(OUTPUT_DIR, "val.txt"), index=False, header=False)
#test_df["clean_text"].to_csv(os.path.join(OUTPUT_DIR, "test.txt"), index=False, header=False)

#print("üíæ Saved all splits to:", OUTPUT_DIR)