In [1]:
import pandas as pd
import random
import csv

def generate_species_data(output_file="species_data.csv"):
    """Generate a mock per-chromosome table for ten species and write it as CSV.

    For every chromosome of every configured species a random base size
    (40-250 Mbp) is drawn and scaled by that species' ``size_factor``. A mock
    centromere starts at 40% of the chromosome and spans 2-3 Mbp; it is
    truncated at the chromosome end so ``centromere_end`` never exceeds
    ``chr_size_bp`` (this matters for small genomes, where a scaled
    chromosome can be shorter than the centromere span itself).

    The sizes are purely random: no reference file is read, so the function
    has no dependency on machine-specific paths.

    Args:
        output_file: Path of the CSV file to write.

    Returns:
        pandas.DataFrame with one row per chromosome and the columns
        ``species_name``, ``chr_id``, ``chr_type``, ``chr_size_bp``,
        ``centromere_start`` and ``centromere_end``.
    """
    # Chromosome identifiers and relative genome scale for each species.
    species_configs = {
        "Pan_troglodytes": {"chr_pattern": list(range(1, 23)) + ['X', 'Y'],
                            "size_factor": 0.95},
        "Gorilla_gorilla": {"chr_pattern": list(range(1, 23)) + ['X', 'Y'],
                            "size_factor": 0.92},
        "Mus_musculus": {"chr_pattern": list(range(1, 20)) + ['X', 'Y'],
                         "size_factor": 0.85},
        "Rattus_norvegicus": {"chr_pattern": list(range(1, 21)) + ['X', 'Y'],
                              "size_factor": 0.82},
        "Danio_rerio": {"chr_pattern": list(range(1, 26)),
                        "size_factor": 0.45},
        "Drosophila_melanogaster": {"chr_pattern": ['2L', '2R', '3L', '3R', '4', 'X'],
                                    "size_factor": 0.15},
        "Caenorhabditis_elegans": {"chr_pattern": list(range(1, 6)) + ['X'],
                                   "size_factor": 0.10},
        "Gallus_gallus": {"chr_pattern": list(range(1, 29)) + ['W', 'Z'],
                          "size_factor": 0.70},
        "Xenopus_tropicalis": {"chr_pattern": list(range(1, 11)),
                               "size_factor": 0.60},
        "Saccharomyces_cerevisiae": {"chr_pattern": list(range(1, 17)),
                                     "size_factor": 0.05}
    }

    species_data = []

    for species, config in species_configs.items():
        for chr_id in config["chr_pattern"]:
            # Integer IDs become "chr<N>"; string IDs ('X', '2L', ...) are kept verbatim.
            chr_name = f"chr{chr_id}" if isinstance(chr_id, int) else chr_id

            # Random base size, scaled down by the species-specific factor.
            base_size = random.randint(40000000, 250000000)
            chr_size = int(base_size * config["size_factor"])

            # Mock centromere: starts at 40% of the chromosome, 2-3 Mbp long,
            # clamped so it cannot run past the end of the chromosome.
            centro_start = int(chr_size * 0.4)
            centro_end = min(centro_start + random.randint(2000000, 3000000), chr_size)

            species_data.append({
                "species_name": species,
                "chr_id": chr_name,
                "chr_type": "chromosome",
                "chr_size_bp": chr_size,
                "centromere_start": centro_start,
                "centromere_end": centro_end
            })

    # Write to CSV
    df = pd.DataFrame(species_data)
    df.to_csv(output_file, index=False)
    return df

def generate_synteny_data(species_data_df, output_file="synteny_data.csv"):
    """Generate random mock synteny blocks against Homo_sapiens and write them as CSV.

    For every species in ``species_data_df`` 3-5 blocks are created. Each block
    picks a random query chromosome of that species, a random reference
    chromosome ``chr1``..``chr22``, a start in 1-50 Mbp and a length of
    5-15 Mbp. The reference interval uses those coordinates directly.

    When ``species_data_df`` has a ``chr_size_bp`` column, the query interval
    is shortened and/or shifted left so that it lies inside the query
    chromosome (small genomes have chromosomes far shorter than 50 Mbp).
    Without that column the query interval equals the reference interval.

    Args:
        species_data_df: DataFrame with at least ``species_name`` and
            ``chr_id`` columns; ``chr_size_bp`` is optional.
        output_file: Path of the CSV file to write.

    Returns:
        pandas.DataFrame with one row per synteny block. The column set is
        fixed, so an empty input still yields a frame (and CSV) with headers.
    """
    columns = [
        "query_name", "query_chr", "query_start", "query_end", "query_strand",
        "ref_chr", "ref_start", "ref_end", "ref_species", "qry_lvl",
    ]
    ref_species = "Homo_sapiens"
    has_sizes = "chr_size_bp" in species_data_df.columns
    synteny_data = []

    # sort=False keeps species in order of first appearance.
    for species, species_rows in species_data_df.groupby("species_name", sort=False):
        species_chrs = species_rows["chr_id"].tolist()
        chr_sizes = (
            dict(zip(species_chrs, species_rows["chr_size_bp"])) if has_sizes else {}
        )

        # Generate 3-5 synteny blocks per species
        num_blocks = random.randint(3, 5)

        for _ in range(num_blocks):
            query_chr = random.choice(species_chrs)
            ref_chr = f"chr{random.randint(1, 22)}"  # Reference chromosomes

            # Random coordinates, used as-is for the reference interval.
            start = random.randint(1000000, 50000000)
            length = random.randint(5000000, 15000000)
            end = start + length

            # Fit the query interval inside the query chromosome when its
            # size is known; otherwise mirror the reference interval.
            query_start, query_end = start, end
            chr_size = chr_sizes.get(query_chr)
            if chr_size is not None and pd.notna(chr_size) and chr_size > 0:
                chr_size = int(chr_size)
                block_len = min(length, chr_size)
                query_start = min(start, chr_size - block_len)
                query_end = query_start + block_len

            synteny_data.append({
                "query_name": species,
                "query_chr": query_chr,
                "query_start": query_start,
                "query_end": query_end,
                "query_strand": random.choice(["+", "-"]),
                "ref_chr": ref_chr,
                "ref_start": start,
                "ref_end": end,
                "ref_species": ref_species,
                "qry_lvl": "chromosome"
            })

    # Write to CSV
    df = pd.DataFrame(synteny_data, columns=columns)
    df.to_csv(output_file, index=False)
    return df

# Build the species table first: the synteny generator takes it as its input
species_df = generate_species_data()
synteny_df = generate_synteny_data(species_data_df=species_df)

print("Files generated successfully!")


Files generated successfully!


In [1]:
import pandas as pd

# Load the gene annotation table whose chromosome labels get re-written
df = pd.read_csv(filepath_or_buffer='ref_gene_annotations.csv')

# Function to modify chromosome values
def add_chr_prefix(chrom):
    """Return the chromosome label with a ``chr`` prefix.

    Rules, in order:
      * missing values (``None`` / NaN) are returned unchanged instead of
        being turned into the strings ``'chrNone'`` / ``'chrnan'``;
      * whole-number floats such as ``2.0`` are treated as ``2``, so a numeric
        column that pandas parsed as float yields ``'chr2'`` not ``'chr2.0'``;
      * ``'MT'`` is returned unchanged;
      * labels already starting with ``'chr'`` are returned unchanged, which
        makes the function safe to apply more than once;
      * everything else gets ``'chr'`` prepended.

    Args:
        chrom: Chromosome label (string or number), or a missing value.

    Returns:
        The prefixed label as ``str``, or the input itself for missing values
        and ``'MT'``.
    """
    # Missing values stay missing
    if pd.isna(chrom):
        return chrom
    # Normalize 2.0 -> 2 before formatting
    if isinstance(chrom, float) and chrom.is_integer():
        chrom = int(chrom)
    # Don't modify MT
    if chrom == 'MT':
        return chrom
    label = str(chrom)
    # Already prefixed: avoid producing 'chrchr...'
    if label.startswith('chr'):
        return label
    # Add 'chr' prefix to all other values
    return f'chr{label}'

# Re-label every value of the chromosome column, element by element
df['chromosome'] = df['chromosome'].map(add_chr_prefix)

# Write the re-labelled table to a separate file, without the index column
df.to_csv('ref_gene_annotations_modified.csv', index=False)

In [4]:
import pandas as pd
import random

# Load the per-chromosome sizes that bound every generated gene
chrom_sizes = pd.read_csv('ref_chromosome_sizes.csv')

# Vocabulary for the random gene attributes
gene_classes = ['protein_coding', 'lncRNA', 'miRNA', 'pseudogene']
prefix_options = ['LINC', 'MIR', 'LOC', 'GENE', 'TP']

# One dict per mock gene, collected across all chromosomes
data = []

for _, chrom_row in chrom_sizes.iterrows():
    chrom, max_size = chrom_row['chromosome'], chrom_row['size']

    # Rows labelled 'MT' get no mock genes
    if chrom == 'MT':
        continue

    # Gene lengths range from 10% to 20% of the chromosome size
    shortest = int(max_size * 0.1)
    longest = int(max_size * 0.2)

    # Five genes per chromosome
    for _gene_index in range(5):
        # The latest allowed start still leaves room for the longest gene
        start = random.randint(1000, max_size - longest)
        # Draw the length, then cap the end at the chromosome size
        end = min(start + random.randint(shortest, longest), max_size)

        # Random class and a symbol of the form <PREFIX><4-digit number>
        gene_class = random.choice(gene_classes)
        symbol = f"{random.choice(prefix_options)}{random.randint(1000, 9999)}"

        record = {
            'chromosome': chrom,
            'genomic_accession': f'NC_{random.randint(100000,999999)}.11',
            'start': start,
            'end': end,
            'strand': random.choice(['+', '-']),
            'class': gene_class,
            'locus_tag': '',
            'symbol': symbol,
            'name': f'Test Gene {random.randint(1,999)}',
            'GeneID': random.randint(100000, 999999)
        }
        data.append(record)

# Build the table, ordered by chromosome and then by start position
df = pd.DataFrame(data).sort_values(['chromosome', 'start'])

# Save to CSV without the index column
df.to_csv('ref_gene_annotations.csv', index=False)