# Preprocessing Input Variants

This notebook processes variant data from multiple sources (ClinVar, COSMIC, and MAF) to create a unified dataset for downstream analysis in the UAVarPrior project.

In [1]:
import os
import re
import numpy as np
import pandas as pd
import gzip
print('Packages loaded')

Packages loaded


In [46]:
def regex_match(start_with, end_with, folder):
    m = re.search(start_with+end_with, folder)
    if m!=None:
        return True
    else:
        return False

_maf_re = re.compile(r'\bMAF=([^;]+)')

def extract_maf_regex(info_str):
    """
    Use a regular expression to grab the MAF= value.
    """
    m = _maf_re.search(info_str)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None

def classify_maf(maf):
    if maf < 0.0002:
        return 'rare'
    elif maf > 0.05:
        return 'common'
    else:
        return 'undefined'


def read_vcf(vcf_file_path):
    """
    Reads data from vcf file and parse lines into pandas dataframe
        Parameters:
            vcf_file_path: str
        
        Returns:
            pandas dataframe with columns ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    """
    VCF_REQUIRED_COLS = ["#CHROM", "POS", "ID", "REF", "ALT"]
    with gzip.open(vcf_file_path, 'rb') as file_handle:
        lines = file_handle.readlines()
        
        # handling first few lines
        index = 0
        for index, line in enumerate(lines):
            line = line.decode('utf8')
            if '#' not in line:
                break
            if "#CHROM" in line:
                cols = line.strip().split('\t')
                if cols[:5] != VCF_REQUIRED_COLS:
                    raise ValueError(
                        "First 5 columns in file {0} were {1}. "
                        "Expected columns: {2}".format(
                            input_path, cols[:5], VCF_REQUIRED_COLS))
                index += 1
                break  
         
    # handling remaining lines of vcf file
    variants = []
    for line in lines[index:]:
        line = line.decode('utf8')
        cols = line.strip().split('\t')
        variants.append(cols)

    df = pd.DataFrame(variants)
    print(df.shape)
    df.columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    return df

def remove_extra_chroms(df):
    chroms = [str(chrom) for chrom in range(1, 23)]
    df_var = pd.DataFrame()
    for chrom in chroms:
        chr_df = df[df['CHROM']==str(chrom)]
#         print(chrom, chr_df.shape)
        df_var = pd.concat([df_var, chr_df])
    return df_var


def get_unique_id_dict(df_var):
    """
    Extracts unique ID types from info column of vcf data
        Parameters:
            df_var: pandas dataframe with info as one of the columns
        
        Returns:
            dictionary of unique ids with None value as default
    """
    max_info = 0
    unique_ids = []
    for info_row in df_var['INFO'].values:
        splits = info_row.split(';')
        slen = len(splits)
        if max_info <= slen:
            max_info = slen
            for splt in splits:
                unique_ids.append(splt.split('=')[0])

    unique_ids = np.unique(np.array(unique_ids))
    uniq_id_dict = {'DBVARID': None} # Somehow this key couldn't be extracted by above code
    for uid in unique_ids:
        uniq_id_dict[uid] = None
    return uniq_id_dict

def info_row_expand(info_row, uniq_id_dict):
    """
    Expands info row by splitting it into different ids
        Parameters:
            info_row (str): info text row
            uniq_id_dict (dictionary): dictionary with unique ids
            
        Returns:
            dictionary with unique ids as keys and values extracted from info as values
    """
    temp_dict = {}
    for key, value in uniq_id_dict.items():
        temp_dict[key] = value
    keys = uniq_id_dict.keys()
        
    splits = info_row.split(';')
    for spl in splits:
        temp = spl.split('=')
        if temp[0] in keys:
            temp_dict[temp[0]] = temp[1]
        else:
            print(f'Key: {temp[0]} value: {temp[1]} are extra items')
    return np.array(list(temp_dict.values()))

In [5]:
clin_var_file = '/scratch/ml-csm/projects/fgenom/gve/data/clinVar/clinvar_20250202.vcf.gz'
df = read_vcf(clin_var_file)


df = remove_extra_chroms(df)

uniq_id_dict = get_unique_id_dict(df)
print(len(uniq_id_dict))


values = []
for info_row in df['INFO'].values:
    row_values = info_row_expand(info_row, uniq_id_dict)
    values.append(row_values)
    
values = np.array(values)
cln_var_df = pd.DataFrame(values)
cln_var_df.columns = list(uniq_id_dict.keys())
cln_var_df = pd.concat([df.iloc[:, [0, 1]], cln_var_df.loc[:, ['CLNDN']]], axis=1)
# cln_var_df = pd.concat([df.iloc[:, [0, 1]], cln_var_df], axis=1)
del df
del values
cln_var_df

(3221834, 8)
35


Unnamed: 0,CHROM,POS,CLNDN
0,1,66926,Retinitis_pigmentosa
1,1,69134,not_specified
2,1,69314,not_specified
3,1,69423,not_specified
4,1,69581,not_specified
...,...,...,...
3099325,22,50777972,not_specified
3099326,22,50777976,not_specified
3099327,22,50782204,not_specified
3099328,22,50782243,not_specified


In [6]:
cln_var_df.columns = ['chrom', 'pos', 'CLNDN']
cln_var_df['chrom'] = cln_var_df['chrom'].astype(np.int8)
cln_var_df['pos'] = cln_var_df['pos'].astype(np.uint32)
cln_var_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3099330 entries, 0 to 3099329
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   chrom   int8  
 1   pos     uint32
 2   CLNDN   object
dtypes: int8(1), object(1), uint32(1)
memory usage: 62.1+ MB


In [7]:
# Load COSMIC noncoding variant data and convert column types
cosmic_df = pd.read_parquet(
    '/scratch/ml-csm/projects/fgenom/gve/data/COSMIC/COSMIC_noncoding_var.parquet.gzip',
    columns=['chrom', 'pos']
)
cosmic_df['chrom'] = cosmic_df['chrom'].astype(np.int8)
cosmic_df['pos'] = cosmic_df['pos'].astype(np.uint32)
cosmic_df

Unnamed: 0,chrom,pos
0,1,10108
1,1,10108
2,1,10108
3,1,10151
4,1,10151
...,...,...
16727025,9,138258488
16727026,9,138258493
16727027,9,138258852
16727028,9,138258875


In [16]:
# inPath = '/scratch/ml-csm/datasets/genomics/ref-genome/human/GRCh38/ensembl/variants/processed/'
# df_maf = pd.read_parquet(inPath+'1000GENOMES-release108-maf.parquet.gz')
# df_maf.dropna(inplace=True)
df_maf

Unnamed: 0,chrom,pos,id,ref,alt,maf
0,1,10505,rs548419688,A,T,0.000200
1,1,10506,rs568405545,C,"G,T",0.000200
2,1,10511,rs534229142,G,A,0.000200
3,1,10539,rs537182016,C,A,0.000599
4,1,10542,rs572818783,C,T,0.000200
...,...,...,...,...,...,...
77605634,22,50802914,rs546768261,C,A,0.000200
77605635,22,50802958,rs568168135,C,"G,T",0.013980
77605636,22,50805735,rs199560686,A,G,0.007788
77605637,22,50805777,rs556942236,C,T,0.000200


In [47]:
inPath = '/scratch/ml-csm/datasets/genomics/ref-genome/human/GRCh38/ensembl/variants/processed/'
df_maf = pd.read_parquet(inPath+'1000GENOMES-release108-maf.parquet.gz')
df_maf.dropna(inplace=True, ignore_index=True)
df_maf['category'] = df_maf['maf'].apply(classify_maf)
rare = (df_maf['category'] == 'rare').sum()
common= (df_maf['category'] == 'common').sum()
undefined= (df_maf['category'] == 'undefined').sum()
rare, common, rare/common, undefined

(34221887, 6888992, 4.967618920155518, 36494760)

In [49]:
df_final = (
    df_maf[df_maf['category'].isin(['rare', 'common'])]
    [['chrom', 'pos']]
    .reset_index(drop=True)
)

df_final

Unnamed: 0,chrom,pos
0,1,10505
1,1,10506
2,1,10511
3,1,10542
4,1,10579
...,...,...
41110874,22,50801324
41110875,22,50802870
41110876,22,50802881
41110877,22,50802914


In [53]:
df_final['chrom'] = df_final['chrom'].astype(np.int8)
df_final['pos'] = df_final['pos'].astype(np.uint32)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41110879 entries, 0 to 41110878
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   chrom   int8  
 1   pos     uint32
dtypes: int8(1), uint32(1)
memory usage: 196.0 MB


In [51]:
chr_vcf_folder = '/scratch/ml-csm/datasets/genomics/ref-genome/human/GRCh38/ensembl/variants/processed/chroms/'
vcf_files = os.listdir(chr_vcf_folder)

vcf_files.remove('homo_sapiens-chrX.tsv')
vcf_files.remove('homo_sapiens-chrY.tsv')
vcf_files.sort()
print(f'chromosome var vcf files: {len(vcf_files)}')


chromosome var vcf files: 22


In [54]:
top_vcf = pd.DataFrame()

for vcf_file in vcf_files:
    # Read the VCF file
    vcf = pd.read_csv(chr_vcf_folder+vcf_file, sep='\t')
    print(f'Original VCF file shape: {vcf.shape}')
    
    # Extract chromosome number from filename
    chrom = int(re.search(r'chr(\d+)', vcf_file).group(1))
    
    # Convert VCF column types to match combined_df for proper comparison
    vcf['#CHROM'] = vcf['#CHROM'].astype(np.int8)
    vcf['POS'] = vcf['POS'].astype(np.uint32)
    
    # Filter VCF based on chromosome and position from combined_df
    # First get positions from combined_df for the current chromosome
    chrom_positions = df_final[df_final['chrom'] == chrom]['pos'].values
    
    # Then filter VCF to only include rows with positions in chrom_positions
    chr_top_vcf = vcf[vcf['POS'].isin(chrom_positions)]
    
    # Concatenate with existing top_vcf dataframe
    top_vcf = pd.concat([top_vcf, chr_top_vcf])
    
    print(f'{vcf_file}: filtered to {chr_top_vcf.shape} variants based on position matching')

# Remove duplicates based on variant ID
dup_index = top_vcf.ID.duplicated()
top_vcf = top_vcf[~dup_index]
print(f'Final top_vcf after removing duplicates: {top_vcf.shape}')

# Display the first few rows of the filtered VCF data
top_vcf.head()

Original VCF file shape: (54950388, 6)
homo_sapiens-chr1.tsv: filtered to (3323924, 6) variants based on position matching
Original VCF file shape: (32448977, 6)
homo_sapiens-chr10.tsv: filtered to (2061467, 6) variants based on position matching
Original VCF file shape: (33255335, 6)
homo_sapiens-chr11.tsv: filtered to (2109264, 6) variants based on position matching
Original VCF file shape: (32146515, 6)
homo_sapiens-chr12.tsv: filtered to (1964427, 6) variants based on position matching
Original VCF file shape: (23665483, 6)
homo_sapiens-chr13.tsv: filtered to (1488275, 6) variants based on position matching
Original VCF file shape: (21621402, 6)
homo_sapiens-chr14.tsv: filtered to (1377487, 6) variants based on position matching
Original VCF file shape: (20235594, 6)
homo_sapiens-chr15.tsv: filtered to (1258974, 6) variants based on position matching
Original VCF file shape: (22226709, 6)
homo_sapiens-chr16.tsv: filtered to (1409853, 6) variants based on position matching
Original 

Unnamed: 0,#CHROM,POS,ID,REF,ALT,STRAND
357,1,10505,rs548419688,A,T,.
358,1,10506,rs568405545,C,"G,T",.
360,1,10511,rs534229142,G,A,.
380,1,10542,rs572818783,C,T,.
399,1,10579,rs538322974,C,A,.


In [55]:
top_vcf.to_csv('/scratch/ml-csm/projects/fgenom/gve/data/rare_common_MAF.tsv', sep='\t', header=None, index=False)

In [28]:
# Combine all three dataframes, keeping only chrom and pos columns

# Extract only the relevant columns from each dataframe
cln_var_subset = cln_var_df[['chrom', 'pos']]
cosmic_subset = cosmic_df[['chrom', 'pos']]
df_maf_subset = df_maf[['chrom', 'pos']]

# Concatenate all dataframes
combined_df = pd.concat([cln_var_subset, cosmic_subset, df_maf_subset], axis=0)

# Remove duplicates
combined_df = combined_df.drop_duplicates()

# Reset index
combined_df = combined_df.reset_index(drop=True)

# Add a dummy 'label' column with zeros
# combined_df['label'] = 0

# Display information about the combined dataframe
print(f"Original dataframe sizes:")
print(f"ClinVar: {cln_var_subset.shape[0]:,} rows")
print(f"COSMIC: {cosmic_subset.shape[0]:,} rows")
print(f"MAF: {df_maf_subset.shape[0]:,} rows")
print(f"Total rows before deduplication: {cln_var_subset.shape[0] + cosmic_subset.shape[0] + df_maf_subset.shape[0]:,}")
print(f"Combined dataframe: {combined_df.shape[0]:,} unique rows with {combined_df.shape[1]} columns")

# Display the first few rows of the combined dataframe
combined_df.head()

Original dataframe sizes:
ClinVar: 3,099,330 rows
COSMIC: 16,727,030 rows
MAF: 69,542,015 rows
Total rows before deduplication: 89,368,375
Combined dataframe: 77,304,080 unique rows with 2 columns


Unnamed: 0,chrom,pos
0,1,66926
1,1,69134
2,1,69314
3,1,69423
4,1,69581


In [29]:
# Check data types
# combined_df.info()

# Ensure data types are consistent
combined_df['chrom'] = combined_df['chrom'].astype(np.int8)
combined_df['pos'] = combined_df['pos'].astype(np.uint32)
# combined_df['label'] = combined_df['label'].astype(np.int8)  # Use int8 for the label column to save memory

# # Save the combined dataframe to parquet format
# output_path = '/scratch/ml-csm/projects/fgenom/gve/data/combined_variant_positions.parquet.gz'
# combined_df.to_parquet(output_path, compression='gzip')
# print(f"Combined variant positions saved to: {output_path}")

# Memory usage information
memory_usage = combined_df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"Memory usage of combined dataframe: {memory_usage:.2f} MB")

Memory usage of combined dataframe: 368.61 MB


In [27]:
output_path = '/scratch/ml-csm/projects/fgenom/gve/data/combined_variant_positions.tsv'
combined_df.to_csv(output_path, sep='\t', header=None, index=False)
combined_df.shape

(77304080, 3)

In [38]:
chr_vcf_folder = '/scratch/ml-csm/projects/fgenom/gve/data/human/ensembl/GRCh38/variant/processed/chroms/'
vcf_files = os.listdir(chr_vcf_folder)

vcf_files.remove('homo_sapiens-chrX.tsv')
vcf_files.remove('homo_sapiens-chrY.tsv')
vcf_files.sort()
print(f'chromosome var vcf files: {len(vcf_files)}')

top_vcf = pd.DataFrame()

for vcf_file in vcf_files:
    # Read the VCF file
    vcf = pd.read_csv(chr_vcf_folder+vcf_file, sep='\t')
    print(f'Original VCF file shape: {vcf.shape}')
    
    # Extract chromosome number from filename
    chrom = int(re.search(r'chr(\d+)', vcf_file).group(1))
    
    # Convert VCF column types to match combined_df for proper comparison
    vcf['#CHROM'] = vcf['#CHROM'].astype(np.int8)
    vcf['POS'] = vcf['POS'].astype(np.uint32)
    
    # Filter VCF based on chromosome and position from combined_df
    # First get positions from combined_df for the current chromosome
    chrom_positions = combined_df[combined_df['chrom'] == chrom]['pos'].values
    
    # Then filter VCF to only include rows with positions in chrom_positions
    chr_top_vcf = vcf[vcf['POS'].isin(chrom_positions)]
    
    # Concatenate with existing top_vcf dataframe
    top_vcf = pd.concat([top_vcf, chr_top_vcf])
    
    print(f'{vcf_file}: filtered to {chr_top_vcf.shape} variants based on position matching')

# Remove duplicates based on variant ID
dup_index = top_vcf.ID.duplicated()
top_vcf = top_vcf[~dup_index]
print(f'Final top_vcf after removing duplicates: {top_vcf.shape}')

# Display the first few rows of the filtered VCF data
top_vcf.head()

Original VCF file shape: (54950388, 6)
homo_sapiens-chr1.tsv: filtered to (6335460, 6) variants based on position matching
Original VCF file shape: (32448977, 6)
homo_sapiens-chr10.tsv: filtered to (3866487, 6) variants based on position matching
Original VCF file shape: (33255335, 6)
homo_sapiens-chr11.tsv: filtered to (4023851, 6) variants based on position matching
Original VCF file shape: (32146515, 6)
homo_sapiens-chr12.tsv: filtered to (3752760, 6) variants based on position matching
Original VCF file shape: (23665483, 6)
homo_sapiens-chr13.tsv: filtered to (2811679, 6) variants based on position matching
Original VCF file shape: (21621402, 6)
homo_sapiens-chr14.tsv: filtered to (2624409, 6) variants based on position matching
Original VCF file shape: (20235594, 6)
homo_sapiens-chr15.tsv: filtered to (2363241, 6) variants based on position matching
Original VCF file shape: (22226709, 6)
homo_sapiens-chr16.tsv: filtered to (2675267, 6) variants based on position matching
Original 

Unnamed: 0,#CHROM,POS,ID,REF,ALT,STRAND
34,1,10108,rs62651026,C,T,.
63,1,10151,rs1570391830,T,A,.
86,1,10175,rs1557426757,TAACC,T,.
90,1,10179,rs1312716213,CTAA,C,.
91,1,10179,rs1557426763,CTAAC,C,.


In [39]:
top_vcf.to_csv('/scratch/ml-csm/projects/fgenom/gve/data/comb_clinVar_COSMIC_MAF.tsv', sep='\t', header=None, index=False)

# Create Chunks of Variants

When dealing with large datasets, it can be beneficial to break them down into smaller, more manageable chunks. This section splits the combined variant dataset into chunks of 10 million rows each and saves them to disk.

In [56]:
# Load the combined variant dataset
df = pd.read_csv('/scratch/ml-csm/projects/fgenom/gve/data/rare_common_MAF.tsv', sep='\t', header=None)
print(f"Loaded dataset with {df.shape[0]:,} rows and {df.shape[1]} columns")
print(df.head())

Loaded dataset with 42,156,703 rows and 6 columns
   0      1            2  3    4  5
0  1  10505  rs548419688  A    T  .
1  1  10506  rs568405545  C  G,T  .
2  1  10511  rs534229142  G    A  .
3  1  10542  rs572818783  C    T  .
4  1  10579  rs538322974  C    A  .


In [57]:
import os

# Create directory for chunks if it doesn't exist
chunk_dir = '/scratch/ml-csm/projects/fgenom/gve/data/comb_chunks'
os.makedirs(chunk_dir, exist_ok=True)
print(f"Created directory for chunks: {chunk_dir}")

# Define chunk size
chunk_size = 5_000_000
total_rows = df.shape[0]

# Compute how many full chunks and the leftover
floor_chunks = total_rows // chunk_size
remainder    = total_rows % chunk_size
ceil_chunks  = (total_rows + chunk_size - 1) // chunk_size
half_size    = chunk_size // 2

# Decide on final number of chunks
if floor_chunks == 0:
    # fewer rows than one chunk
    n_chunks = 1
    print(f"Dataset smaller than one chunk ({total_rows:,} rows) → using a single chunk")
elif 0 < remainder < half_size:
    # small tail gets merged into the last full chunk
    n_chunks = floor_chunks
    print(
        f"Remainder {remainder:,} < half of chunk_size ({half_size:,}), "
        f"merging into last full chunk → {n_chunks} chunks total"
    )
else:
    # either no remainder or a big enough tail to stand alone
    n_chunks = ceil_chunks
    print(f"Will create {n_chunks} chunks of up to {chunk_size:,} rows each")

# Split & save
for i in range(n_chunks):
    start_idx = i * chunk_size
    # last chunk always goes to the end
    end_idx   = total_rows if i == n_chunks - 1 else (i + 1) * chunk_size

    chunk = df.iloc[start_idx:end_idx]
    fname = os.path.join(chunk_dir, f'variant_chunk_{i+1:03d}.tsv')
    chunk.to_csv(fname, sep='\t', header=None, index=False)
    print(f"Saved chunk {i+1}/{n_chunks}: {fname} ({len(chunk):,} rows)")

print(f"\nCompleted: Created {n_chunks} chunks in {chunk_dir}")

Created directory for chunks: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks
Remainder 2,156,703 < half of chunk_size (2,500,000), merging into last full chunk → 8 chunks total
Saved chunk 1/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_001.tsv (5,000,000 rows)
Saved chunk 2/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_002.tsv (5,000,000 rows)
Saved chunk 3/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_003.tsv (5,000,000 rows)
Saved chunk 4/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_004.tsv (5,000,000 rows)
Saved chunk 5/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_005.tsv (5,000,000 rows)
Saved chunk 6/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_006.tsv (5,000,000 rows)
Saved chunk 7/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_007.tsv (5,000,000 rows)
Saved chunk 8/8: /scratch/ml-csm/projects/fgenom/gve/d

In [9]:
# Create directory for chunks if it doesn't exist
chunk_dir = '/scratch/ml-csm/projects/fgenom/gve/data/comb_chunks'
os.makedirs(chunk_dir, exist_ok=True)
print(f"Created directory for chunks: {chunk_dir}")

# Define chunk size
chunk_size = 5_000_000

# Calculate number of chunks needed
n_chunks = (df.shape[0] + chunk_size - 1) // chunk_size  # ceiling division
print(f"Will create {n_chunks} chunks of approximately {chunk_size:,} rows each")

# Create and save chunks
for i in range(n_chunks):
    # Calculate start and end indices for this chunk
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, df.shape[0])
    
    # Extract chunk
    chunk = df.iloc[start_idx:end_idx]
    
    # Create chunk filename
    chunk_file = os.path.join(chunk_dir, f'variant_chunk_{i+1:03d}.tsv')
    
    # Save chunk
    chunk.to_csv(chunk_file, sep='\t', header=None, index=False)
    
    print(f"Saved chunk {i+1}/{n_chunks}: {chunk_file} with {len(chunk):,} rows")

print(f"\nCompleted: Created {n_chunks} chunks in {chunk_dir}")

Created directory for chunks: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks
Will create 8 chunks of approximately 10,000,000 rows each
Saved chunk 1/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_001.tsv with 10,000,000 rows
Saved chunk 2/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_002.tsv with 10,000,000 rows
Saved chunk 3/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_003.tsv with 10,000,000 rows
Saved chunk 4/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_004.tsv with 10,000,000 rows
Saved chunk 5/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_005.tsv with 10,000,000 rows
Saved chunk 6/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_006.tsv with 10,000,000 rows
Saved chunk 7/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chunks/variant_chunk_007.tsv with 10,000,000 rows
Saved chunk 8/8: /scratch/ml-csm/projects/fgenom/gve/data/comb_chun

In [10]:
# Verify chunks (optional)
# List all files in the chunk directory
chunk_files = sorted(os.listdir(chunk_dir))
print(f"Created {len(chunk_files)} chunk files:")

# Check total size of all chunks
total_rows = 0
for chunk_file in chunk_files[:5]:  # Show only first 5 for brevity
    file_path = os.path.join(chunk_dir, chunk_file)
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
    print(f"  {chunk_file}: {file_size:.2f} MB")

# You can uncomment the following code to verify the total number of rows
# across all chunks matches the original dataframe
'''
total_rows = 0
for chunk_file in chunk_files:
    file_path = os.path.join(chunk_dir, chunk_file)
    chunk_df = pd.read_csv(file_path, sep='\t', header=None)
    total_rows += len(chunk_df)

print(f"\nTotal rows in all chunks: {total_rows:,}")
print(f"Original dataframe rows: {df.shape[0]:,}")
print(f"Rows match: {total_rows == df.shape[0]}")
'''

Created 8 chunk files:
  variant_chunk_001.tsv: 289.49 MB
  variant_chunk_002.tsv: 293.78 MB
  variant_chunk_003.tsv: 292.41 MB
  variant_chunk_004.tsv: 288.66 MB
  variant_chunk_005.tsv: 288.50 MB


'\ntotal_rows = 0\nfor chunk_file in chunk_files:\n    file_path = os.path.join(chunk_dir, chunk_file)\n    chunk_df = pd.read_csv(file_path, sep=\'\t\', header=None)\n    total_rows += len(chunk_df)\n\nprint(f"\nTotal rows in all chunks: {total_rows:,}")\nprint(f"Original dataframe rows: {df.shape[0]:,}")\nprint(f"Rows match: {total_rows == df.shape[0]}")\n'

## Chunking Summary

Breaking down the large variant dataset into smaller chunks offers several advantages:

1. **Memory Efficiency**: Processing smaller chunks requires less memory, allowing analysis on machines with limited resources.
2. **Parallel Processing**: Chunks can be processed independently, enabling parallel computation.
3. **Fault Tolerance**: If processing fails for one chunk, others can still be processed.
4. **Storage Flexibility**: Smaller files can be more easily transferred and stored.

These chunks can now be used for downstream analysis, with each chunk containing approximately 10 million variants.