In [5]:
import gzip
import os

In [4]:
import gzip

# Filter the gzipped, tab-separated mutation file down to the rows whose
# Concept ID contains an RSID, and write them to a TSV file without the
# 'Type' column.

# Input and output file names
input_filename = 'mutation2pubtatorcentral.gz'
output_filename = 'snp_mutation_data.tsv'

# The loop below reads column indices 0-4, so a usable row needs at least five
# columns (index 1 is the 'Type' column that is dropped from the output).
MIN_COLUMNS = 5

# Number of input rows skipped because they had too few columns
malformed_rows = 0

# Open the gzip file and create the output TSV file
with gzip.open(input_filename, 'rt') as gz_file, open(output_filename, 'w') as tsv_file:
    # Write header to the TSV file (excluding the 'Type' column)
    tsv_file.write("PMID\tConcept ID\tMentions\tResource\n")

    # Process each line in the gzip file
    for line in gz_file:
        # Remove only the line terminator. A bare strip() would also remove
        # trailing tabs, which drops empty trailing columns: the row would then
        # be too short to index and the empty -> "None" replacement below could
        # never apply to the last column.
        values = line.rstrip('\n').split('\t')

        # Skip blank or truncated rows instead of failing with an IndexError
        if len(values) < MIN_COLUMNS:
            malformed_rows += 1
            continue

        # Check if the row contains an RSID (SNP)
        # NOTE(review): this is a substring test, so any Concept ID containing
        # "rs" anywhere matches -- confirm whether a prefix match is intended.
        if 'rs' in values[2]:
            # Replace empty columns with "None"
            values = ['None' if value == '' else value for value in values]

            # Write the filtered and cleaned data to the TSV file (excluding the 'Type' column)
            tsv_file.write(f"{values[0]}\t{values[2]}\t{values[3]}\t{values[4]}\n")

# Report skipped rows so that dropped data does not go unnoticed
if malformed_rows:
    print(f"Skipped {malformed_rows} malformed row(s) with fewer than {MIN_COLUMNS} columns")

print(f"Filtered SNP Mutation data is stored in {output_filename}")

Filtered SNP Mutation data is stored in snp_mutation_data.tsv


In [4]:
#Stats

In [7]:
# Summarise the filtered TSV: number of data rows, distinct PMIDs (first
# column) and distinct Concept IDs (second column).
tsv_filename = 'snp_mutation_data.tsv'

# Accumulators for the summary statistics
unique_pmids = set()
unique_snps = set()
total_lines = 0

with open(tsv_filename, 'r') as tsv_file:
    # The first line is the header row, not data
    next(tsv_file)

    # enumerate() keeps the running count of data rows (RSID annotations)
    for total_lines, record in enumerate(tsv_file, start=1):
        columns = record.strip().split('\t')

        # Column 0 holds the PMID, column 1 the Concept ID (SNP)
        unique_pmids.add(columns[0])
        unique_snps.add(columns[1])

# Calculated stats
print(f"Total RSID annotations: {total_lines}")
print(f"Total unique PMID: {len(unique_pmids)}")
print(f"Total unique SNPs: {len(unique_snps)}")

Total RSID annotations: 3069248
Total unique PMID: 507356
Total unique SNPs: 1118528


#### Code Summary
'''
The code processes the mutation annotation file (mutation2pubtatorcentral.gz). It extracts the rows with RSID (SNP) annotations,
drops the "Type" column, and stores the filtered data in a TSV file (snp_mutation_data.tsv).

We then analyze the filtered data, counting the total RSID annotations, the unique PMIDs, and the
unique SNPs (Concept IDs). The results show 3,069,248 RSID annotations, 507,356 unique PMIDs, and 1,118,528 unique SNPs.
In short, the code above filters the genetic mutation data and summarizes it.
'''

#### Data Description

i.   PMID:       PubMed abstract identifier. <br>
ii.  Concept ID: Corresponding database identifier (RSID). <br>
iii. Mentions:   Bio-concept mentions corresponding to the PubMed abstract. <br>
iv.  Resource:   The resource(s) behind the annotation; various manually annotated resources are included in the files (e.g., tmVar, dbSNP, ClinVar, or a combination of them).