In [2]:
import random
import requests
import io
import re
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import concurrent.futures

# Read the tab-separated metadata file; the first line is skipped below as the header.
with open("./data/metadata", "r") as file:
    metadata_lines = file.readlines()

# Maps each lineage to the accession ids listed for it.
lineage_samples = defaultdict(list)
unique_lineages = set()

# Prepare a list to hold selected samples
samples_list = []

# Counter for total files processed
total_files_processed = 0

for line in metadata_lines[1:]:
    columns = line.strip().split("\t")
    if len(columns) < 2:
        # Blank or truncated row (e.g. a trailing empty line): there is no
        # lineage column to index, so skip it instead of raising IndexError.
        continue
    accession_id = columns[0]
    lineage = columns[1]
    lineage_samples[lineage].append(accession_id)
    unique_lineages.add(lineage)

max_sequences_per_lineage = 500

# Keep up to 10 lineages that can supply a full batch of samples.
# Iterating a set of strings has no stable order across interpreter runs, so
# the candidates are sorted first to make the selection reproducible.
selected_lineages = [
    lineage
    for lineage in sorted(unique_lineages)
    if len(lineage_samples[lineage]) >= max_sequences_per_lineage
][:10]

max_sequences = max_sequences_per_lineage * len(selected_lineages)

# desired sequence length
DESIRED_LENGTH = 29809

# define download function
def download_sample(sample, lineage):
    """Download one sequence from EBI dbfetch, normalise its length and save it.

    The sequence is rejected when it contains more than 100 'N' characters.
    Otherwise it is truncated or right-padded with 'N' to exactly
    ``DESIRED_LENGTH`` characters (module-level constant) and written to
    ``./data/test/<filename>.fasta``.

    Args:
        sample: Accession id; interpolated into the request URL as-is.
        lineage: Lineage label recorded alongside the sample.

    Returns:
        ``{'id': sample, 'lineage': lineage}`` when a file was written,
        otherwise ``None`` (request failure, non-200 response, unparsable
        payload, or too many 'N's).
    """
    url = f"https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id={sample}&format=fasta&style=raw&Retrieve=Retrieve"
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print(f"Skipping {sample}: request failed ({exc})")
        return None
    if response.status_code != 200:
        return None

    # Replace characters that are unsafe in file names (and parentheses) with '_'.
    filename = re.sub(r'[\\/:\*\?"<>\|()]', '_', sample)

    try:
        record = SeqIO.read(io.StringIO(response.text), "fasta")
    except ValueError:
        # SeqIO.read raises ValueError unless the payload holds exactly one record.
        print(f"Skipping {filename}: response is not a single FASTA record")
        return None

    # str(seq) gives the plain residue string. The previous str(list(seq))
    # produced the list repr ("['A', 'C', ...]"), which broke the length
    # handling and the saved file contents.
    sequence = str(record.seq)

    # Count number of 'N's in sequence
    n_count = sequence.count('N')
    if n_count > 100:
        print(f"Skipping {filename} due to excessive 'N's")
        return None

    # adjust sequence length
    if len(sequence) > DESIRED_LENGTH:
        sequence = sequence[:DESIRED_LENGTH]  # chop off the end
    elif len(sequence) < DESIRED_LENGTH:
        sequence += 'N' * (DESIRED_LENGTH - len(sequence))  # pad with N's

    with io.open(f"./data/test/{filename}.fasta", "w", encoding="utf-8") as seq_file:
        seq_file.write(f'>{filename}\n{sequence}')

    # Only the identifiers are returned; the sequence itself lives in the file.
    return {'id': sample, 'lineage': lineage}

# Download every selected lineage through one shared pool of 20 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    for lineage in tqdm(selected_lineages, desc="Processing lineages"):
        # Draw this lineage's batch and drop it from the lineage's remaining pool.
        samples = random.sample(lineage_samples[lineage], max_sequences_per_lineage)
        remaining = set(lineage_samples[lineage]) - set(samples)
        lineage_samples[lineage] = list(remaining)

        futures = set()
        for sample in samples:
            futures.add(executor.submit(download_sample, sample, lineage))

        # Collect results in completion order.
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if not result:
                # download_sample returned nothing for this sample.
                continue
            samples_list.append(result)
            total_files_processed += 1
            if total_files_processed >= 5000:
                break

        # Propagate the inner break so no further lineages are submitted.
        if total_files_processed >= 5000:
            break

print("Download completed!")

# Build the manifest (one row per returned sample) and write it out as CSV.
df = pd.DataFrame(samples_list)
df.to_csv("./data/dataset.csv", index=False)


Processing lineages:   0%|                                                                      | 0/10 [00:00<?, ?it/s]

Skipping _MW181734_ due to excessive 'N's
Skipping _OL980327_ due to excessive 'N's
Skipping _OL980367_ due to excessive 'N's
Skipping _OL980338_ due to excessive 'N's
Skipping _OL980379_ due to excessive 'N's
Skipping _MW181757_ due to excessive 'N's
Skipping _OK426777_ due to excessive 'N's
Skipping _OU166366_ due to excessive 'N's
Skipping _OK557254_ due to excessive 'N's
Skipping _MZ779291_ due to excessive 'N's
Skipping _OL979927_ due to excessive 'N's
Skipping _MW766879_ due to excessive 'N's
Skipping _OL980050_ due to excessive 'N's
Skipping _OL980333_ due to excessive 'N's
Skipping _OL980378_ due to excessive 'N's
Skipping _OK557617_ due to excessive 'N's
Skipping _MW181756_ due to excessive 'N's
Skipping _OU007598_ due to excessive 'N's
Skipping _OL980998_ due to excessive 'N's


Processing lineages:  10%|██████▏                                                       | 1/10 [00:13<01:57, 13.03s/it]

Skipping _OX245748_ due to excessive 'N's
Skipping _OX156091_ due to excessive 'N's
Skipping _OX221432_ due to excessive 'N's
Skipping _OX271738_ due to excessive 'N's
Skipping _OX221275_ due to excessive 'N's
Skipping _OX235237_ due to excessive 'N's
Skipping _OX219734_ due to excessive 'N's
Skipping _OX244800_ due to excessive 'N's
Skipping _OX216054_ due to excessive 'N's
Skipping _OX213660_ due to excessive 'N's
Skipping _OX261287_ due to excessive 'N's
Skipping _OX250053_ due to excessive 'N's
Skipping _OP059988_ due to excessive 'N's
Skipping _OX211295_ due to excessive 'N's
Skipping _OP113021_ due to excessive 'N's
Skipping _OX230741_ due to excessive 'N's
Skipping _OX228151_ due to excessive 'N's
Skipping _OX214409_ due to excessive 'N's
Skipping _OP121972_ due to excessive 'N's
Skipping _OX230252_ due to excessive 'N's
Skipping _OX212981_ due to excessive 'N's
Skipping _OX226915_ due to excessive 'N's
Skipping _OX054456_ due to excessive 'N's
Skipping _OX237212_ due to excessi

Skipping _OX227128_ due to excessive 'N's
Skipping _OX217793_ due to excessive 'N's
Skipping _OP047772_ due to excessive 'N's
Skipping _OX253626_ due to excessive 'N's
Skipping _OX156394_ due to excessive 'N's
Skipping _OX150547_ due to excessive 'N's
Skipping _OX146253_ due to excessive 'N'sSkipping _OP171834_ due to excessive 'N's
Skipping _OX247968_ due to excessive 'N's

Skipping _OX237626_ due to excessive 'N's
Skipping _OX273512_ due to excessive 'N's
Skipping _OP120376_ due to excessive 'N's
Skipping _OX217540_ due to excessive 'N's
Skipping _OX259518_ due to excessive 'N's
Skipping _OX236049_ due to excessive 'N's
Skipping _OX263759_ due to excessive 'N's
Skipping _OP122641_ due to excessive 'N'sSkipping _OX103708_ due to excessive 'N's

Skipping _OX230871_ due to excessive 'N's
Skipping _OX231638_ due to excessive 'N's
Skipping _OX242497_ due to excessive 'N's
Skipping _OX129622_ due to excessive 'N's
Skipping _OX141080_ due to excessive 'N's
Skipping _OX113673_ due to excessi

Processing lineages:  20%|████████████▍                                                 | 2/10 [00:29<02:00, 15.05s/it]

Skipping _OX242044_ due to excessive 'N's
Skipping _OX258695_ due to excessive 'N's
Skipping _OV651611_ due to excessive 'N's
Skipping _OV476794_ due to excessive 'N's
Skipping _OW297736_ due to excessive 'N's
Skipping _OV427445_ due to excessive 'N's
Skipping _OV972543_ due to excessive 'N's
Skipping _OW300083_ due to excessive 'N's
Skipping _OW705695_ due to excessive 'N's
Skipping _OV882843_ due to excessive 'N'sSkipping _OV782651_ due to excessive 'N's
Skipping _OW049509_ due to excessive 'N's

Skipping _OV889689_ due to excessive 'N's
Skipping _OW595908_ due to excessive 'N's
Skipping _OV889880_ due to excessive 'N's
Skipping _ON173266_ due to excessive 'N's
Skipping _OV882460_ due to excessive 'N's
Skipping _OW479947_ due to excessive 'N's
Skipping _OW470156_ due to excessive 'N's
Skipping _OW907530_ due to excessive 'N's
Skipping _OW149164_ due to excessive 'N's
Skipping _OV451267_ due to excessive 'N's
Skipping _OV882821_ due to excessive 'N's
Skipping _OW319694_ due to excessi

Skipping _OV634920_ due to excessive 'N's
Skipping _OV843397_ due to excessive 'N's
Skipping _OW850615_ due to excessive 'N's
Skipping _OW280867_ due to excessive 'N's
Skipping _ON190577_ due to excessive 'N'sSkipping _ON224982_ due to excessive 'N's

Skipping _OV507932_ due to excessive 'N's
Skipping _OW712170_ due to excessive 'N's
Skipping _OW798733_ due to excessive 'N's
Skipping _OM684808_ due to excessive 'N's
Skipping _OW020630_ due to excessive 'N's
Skipping _OW724618_ due to excessive 'N's
Skipping _OV275904_ due to excessive 'N's
Skipping _OV647587_ due to excessive 'N's
Skipping _OW495128_ due to excessive 'N's
Skipping _ON267449_ due to excessive 'N's
Skipping _ON266619_ due to excessive 'N's
Skipping _ON778972_ due to excessive 'N's
Skipping _ON623419_ due to excessive 'N's
Skipping _OV200400_ due to excessive 'N's
Skipping _OV321778_ due to excessive 'N's
Skipping _OW300368_ due to excessive 'N's
Skipping _OW703612_ due to excessive 'N's
Skipping _OX068488_ due to excessi

Skipping _OW182102_ due to excessive 'N's
Skipping _OV357551_ due to excessive 'N's
Skipping _OW971422_ due to excessive 'N'sSkipping _OX051612_ due to excessive 'N's

Skipping _OW782271_ due to excessive 'N'sSkipping _OW147996_ due to excessive 'N's

Skipping _OV700703_ due to excessive 'N's
Skipping _OW723788_ due to excessive 'N's
Skipping _OV843189_ due to excessive 'N's
Skipping _OW468895_ due to excessive 'N's
Skipping _OW461834_ due to excessive 'N's
Skipping _OW712093_ due to excessive 'N's
Skipping _OV734842_ due to excessive 'N's
Skipping _OV543972_ due to excessive 'N's
Skipping _OV116256_ due to excessive 'N's
Skipping _ON266705_ due to excessive 'N'sSkipping _OV883348_ due to excessive 'N's



Processing lineages:  30%|██████████████████▌                                           | 3/10 [00:50<02:03, 17.64s/it]

Skipping _OV944828_ due to excessive 'N's
Skipping _OW103380_ due to excessive 'N's
Skipping _MZ906640_ due to excessive 'N's
Skipping _MW773899_ due to excessive 'N's
Skipping _OM253449_ due to excessive 'N's
Skipping _MZ494411_ due to excessive 'N's
Skipping _MW994287_ due to excessive 'N's
Skipping _MT833963_ due to excessive 'N's
Skipping _MW773875_ due to excessive 'N's
Skipping _MW773867_ due to excessive 'N's
Skipping _MT831622_ due to excessive 'N's
Skipping _MT831228_ due to excessive 'N's
Skipping _MW773846_ due to excessive 'N's
Skipping _MW773902_ due to excessive 'N's
Skipping _MW773913_ due to excessive 'N's
Skipping _MT831849_ due to excessive 'N's
Skipping _MT834248_ due to excessive 'N's
Skipping _MT834122_ due to excessive 'N's
Skipping _MT831803_ due to excessive 'N's
Skipping _MW591472_ due to excessive 'N's
Skipping _MT831136_ due to excessive 'N's
Skipping _MT831775_ due to excessive 'N's
Skipping _MT831146_ due to excessive 'N's
Skipping _MT831388_ due to excessi

Processing lineages:  40%|████████████████████████▊                                     | 4/10 [01:04<01:37, 16.21s/it]

Skipping _ON085217_ due to excessive 'N's
Skipping _MT834194_ due to excessive 'N's
Skipping _MW735320_ due to excessive 'N's
Skipping _MW331647_ due to excessive 'N's
Skipping _MW506916_ due to excessive 'N's
Skipping _MW779712_ due to excessive 'N's
Skipping _MW369370_ due to excessive 'N's
Skipping _MW369388_ due to excessive 'N's
Skipping _MW582219_ due to excessive 'N's
Skipping _MW636869_ due to excessive 'N's
Skipping _MW406710_ due to excessive 'N's
Skipping _MW943195_ due to excessive 'N's
Skipping _MZ910367_ due to excessive 'N's
Skipping _MZ909991_ due to excessive 'N's
Skipping _MW349145_ due to excessive 'N's
Skipping _MW739008_ due to excessive 'N's
Skipping _MZ911120_ due to excessive 'N's
Skipping _OM434258_ due to excessive 'N's
Skipping _MZ570036_ due to excessive 'N's
Skipping _ON134719_ due to excessive 'N's
Skipping _MW490999_ due to excessive 'N's
Skipping _MW506905_ due to excessive 'N's
Skipping _MW506903_ due to excessive 'N's
Skipping _MW506913_ due to excessi

Skipping _OL825957_ due to excessive 'N's
Skipping _OK439962_ due to excessive 'N's
Skipping _OM181117_ due to excessive 'N's
Skipping _MW228282_ due to excessive 'N's
Skipping _MW812950_ due to excessive 'N's
Skipping _MW728160_ due to excessive 'N's
Skipping _MZ917626_ due to excessive 'N's
Skipping _MW331626_ due to excessive 'N's
Skipping _MW406698_ due to excessive 'N's
Skipping _MZ910264_ due to excessive 'N's
Skipping _MW491000_ due to excessive 'N's
Skipping _MW406711_ due to excessive 'N's
Skipping _MW331622_ due to excessive 'N's
Skipping _MZ842493_ due to excessive 'N's
Skipping _MZ297569_ due to excessive 'N's
Skipping _MW506891_ due to excessive 'N's
Skipping _OK465732_ due to excessive 'N's
Skipping _MW436749_ due to excessive 'N's
Skipping _MW491002_ due to excessive 'N's
Skipping _OM250767_ due to excessive 'N's
Skipping _ON216717_ due to excessive 'N's
Skipping _MZ908125_ due to excessive 'N's
Skipping _MW506895_ due to excessive 'N's
Skipping _MW629291_ due to excessi

Processing lineages:  50%|███████████████████████████████                               | 5/10 [01:21<01:22, 16.58s/it]

Skipping _MW589626_ due to excessive 'N's
Skipping _MZ914897_ due to excessive 'N's
Skipping _MZ266379_ due to excessive 'N's
Skipping _OA971564_ due to excessive 'N's
Skipping _OK079240_ due to excessive 'N's
Skipping _OL948455_ due to excessive 'N's
Skipping _OL950389_ due to excessive 'N's
Skipping _OK602952_ due to excessive 'N's
Skipping _OL775551_ due to excessive 'N's
Skipping _FR996670_ due to excessive 'N's
Skipping _OK404780_ due to excessive 'N's
Skipping _OK404852_ due to excessive 'N's
Skipping _OK568153_ due to excessive 'N's
Skipping _OL748893_ due to excessive 'N's
Skipping _OK409793_ due to excessive 'N's
Skipping _OL775643_ due to excessive 'N's
Skipping _OM181262_ due to excessive 'N's
Skipping _OK409765_ due to excessive 'N's
Skipping _OK387866_ due to excessive 'N's
Skipping _FR994797_ due to excessive 'N's
Skipping _OU413316_ due to excessive 'N's
Skipping _OA966749_ due to excessive 'N's
Skipping _OM272720_ due to excessive 'N's
Skipping _OM078100_ due to excessi

Skipping _OM099760_ due to excessive 'N's
Skipping _OL566906_ due to excessive 'N'sSkipping _OV237889_ due to excessive 'N's

Skipping _FR995912_ due to excessive 'N's
Skipping _OL927261_ due to excessive 'N's
Skipping _OL511141_ due to excessive 'N's
Skipping _OK601626_ due to excessive 'N's
Skipping _OM396486_ due to excessive 'N's
Skipping _MT419858_ due to excessive 'N's
Skipping _OL777428_ due to excessive 'N's
Skipping _OK237147_ due to excessive 'N's
Skipping _OL610384_ due to excessive 'N's
Skipping _OK579010_ due to excessive 'N's
Skipping _OK005541_ due to excessive 'N's
Skipping _OL751350_ due to excessive 'N's
Skipping _FR996047_ due to excessive 'N's
Skipping _OK238132_ due to excessive 'N's
Skipping _OL369591_ due to excessive 'N's
Skipping _MZ433835_ due to excessive 'N's
Skipping _OL922303_ due to excessive 'N's
Skipping _MZ533495_ due to excessive 'N's
Skipping _OK600323_ due to excessive 'N's
Skipping _OL564691_ due to excessive 'N's
Skipping _OK599276_ due to excessi

Processing lineages:  60%|█████████████████████████████████████▏                        | 6/10 [01:40<01:09, 17.39s/it]

Skipping _OK622684_ due to excessive 'N's
Skipping _OK226000_ due to excessive 'N's
Skipping _MZ211047_ due to excessive 'N's
Skipping _OW627618_ due to excessive 'N's
Skipping _OW618109_ due to excessive 'N's
Skipping _OW628431_ due to excessive 'N's
Skipping _OV854182_ due to excessive 'N's
Skipping _OV852871_ due to excessive 'N's
Skipping _OV849921_ due to excessive 'N's
Skipping _OW408400_ due to excessive 'N's
Skipping _OW436986_ due to excessive 'N's
Skipping _OW384193_ due to excessive 'N's
Skipping _OW446536_ due to excessive 'N's
Skipping _OW441530_ due to excessive 'N's
Skipping _OW499182_ due to excessive 'N's
Skipping _OV851234_ due to excessive 'N's
Skipping _OW439933_ due to excessive 'N's
Skipping _OW585561_ due to excessive 'N's
Skipping _OV856903_ due to excessive 'N's
Skipping _OW394318_ due to excessive 'N's
Skipping _OV849342_ due to excessive 'N's
Skipping _OW679007_ due to excessive 'N's
Skipping _OW529127_ due to excessive 'N's
Skipping _OW445780_ due to excessi

Skipping _OW418382_ due to excessive 'N's
Skipping _OW666122_ due to excessive 'N's
Skipping _OW568673_ due to excessive 'N's
Skipping _OW581104_ due to excessive 'N's
Skipping _OW651267_ due to excessive 'N's
Skipping _OW688845_ due to excessive 'N's
Skipping _OW396335_ due to excessive 'N's
Skipping _OW605701_ due to excessive 'N's
Skipping _OW390690_ due to excessive 'N's
Skipping _OV846898_ due to excessive 'N's
Skipping _OW669525_ due to excessive 'N's
Skipping _OW389467_ due to excessive 'N's
Skipping _OW434555_ due to excessive 'N's
Skipping _OW383001_ due to excessive 'N's
Skipping _OW602612_ due to excessive 'N's
Skipping _OW623589_ due to excessive 'N's
Skipping _OV885744_ due to excessive 'N's
Skipping _OW706884_ due to excessive 'N's
Skipping _OW413211_ due to excessive 'N's
Skipping _OV853578_ due to excessive 'N's
Skipping _OW391430_ due to excessive 'N's
Skipping _OW549595_ due to excessive 'N's
Skipping _OW391749_ due to excessive 'N's
Skipping _OW623755_ due to excessi

Processing lineages:  70%|███████████████████████████████████████████▍                  | 7/10 [01:56<00:50, 16.98s/it]

Skipping _OW405808_ due to excessive 'N's
Skipping _OK598881_ due to excessive 'N's
Skipping _MW084594_ due to excessive 'N'sSkipping _MW054132_ due to excessive 'N'sSkipping _MW084539_ due to excessive 'N's


Skipping _MW054134_ due to excessive 'N's
Skipping _ON110386_ due to excessive 'N's
Skipping _ON004075_ due to excessive 'N's
Skipping _MW084542_ due to excessive 'N's
Skipping _MW181535_ due to excessive 'N's
Skipping _MW420810_ due to excessive 'N's
Skipping _OK659345_ due to excessive 'N's
Skipping _MW577929_ due to excessive 'N's
Skipping _OK598775_ due to excessive 'N's
Skipping _OK465934_ due to excessive 'N's
Skipping _MW181495_ due to excessive 'N's
Skipping _MW084470_ due to excessive 'N's
Skipping _MW084599_ due to excessive 'N's
Skipping _MW084565_ due to excessive 'N's
Skipping _MW505086_ due to excessive 'N's
Skipping _MW084554_ due to excessive 'N's
Skipping _MW346359_ due to excessive 'N's
Skipping _MW181533_ due to excessive 'N's
Skipping _OK598584_ due to excessi

Skipping _MW054144_ due to excessive 'N'sSkipping _MW084589_ due to excessive 'N's

Skipping _MW210964_ due to excessive 'N's
Skipping _MW190187_ due to excessive 'N's
Skipping _MW421920_ due to excessive 'N's
Skipping _MW420331_ due to excessive 'N's
Skipping _MW080219_ due to excessive 'N's
Skipping _MW181521_ due to excessive 'N's
Skipping _MW172770_ due to excessive 'N's
Skipping _MW424925_ due to excessive 'N's
Skipping _MW172573_ due to excessive 'N's
Skipping _MW424942_ due to excessive 'N's
Skipping _MW181469_ due to excessive 'N's
Skipping _MW421958_ due to excessive 'N's
Skipping _MW054131_ due to excessive 'N's
Skipping _MT755602_ due to excessive 'N's
Skipping _MT755590_ due to excessive 'N's
Skipping _MW084549_ due to excessive 'N's
Skipping _MW084579_ due to excessive 'N's
Skipping _MW080248_ due to excessive 'N's
Skipping _MW054124_ due to excessive 'N's
Skipping _MW420320_ due to excessive 'N's
Skipping _MZ295102_ due to excessive 'N's
Skipping _MW053815_ due to excessi

Processing lineages:  80%|█████████████████████████████████████████████████▌            | 8/10 [02:09<00:31, 15.70s/it]

Skipping _OK598826_ due to excessive 'N's
Skipping _MT877375_ due to excessive 'N's
Skipping _MW421970_ due to excessive 'N's
Skipping _OA971144_ due to excessive 'N's
Skipping _OA969973_ due to excessive 'N's
Skipping _OU086133_ due to excessive 'N's
Skipping _OU009007_ due to excessive 'N's
Skipping _OU281670_ due to excessive 'N's
Skipping _OU198304_ due to excessive 'N's
Skipping _OA974444_ due to excessive 'N's
Skipping _OU165529_ due to excessive 'N's
Skipping _OA978886_ due to excessive 'N's
Skipping _OU078907_ due to excessive 'N's
Skipping _OU078300_ due to excessive 'N's
Skipping _OA974185_ due to excessive 'N's
Skipping _OE995415_ due to excessive 'N's
Skipping _OU077138_ due to excessive 'N's
Skipping _LR882268_ due to excessive 'N's
Skipping _OU093730_ due to excessive 'N's
Skipping _OE995541_ due to excessive 'N's
Skipping _OU282487_ due to excessive 'N's
Skipping _OU077646_ due to excessive 'N's
Skipping _OU086686_ due to excessive 'N's
Skipping _OW431017_ due to excessi

Skipping _OE995263_ due to excessive 'N's
Skipping _OU119383_ due to excessive 'N's
Skipping _OU077284_ due to excessive 'N's
Skipping _OU009337_ due to excessive 'N's
Skipping _OU081139_ due to excessive 'N's
Skipping _OU093774_ due to excessive 'N's
Skipping _OA972832_ due to excessive 'N's
Skipping _LR882167_ due to excessive 'N's
Skipping _LR862489_ due to excessive 'N's
Skipping _OU072672_ due to excessive 'N's
Skipping _OU007284_ due to excessive 'N's
Skipping _OU073073_ due to excessive 'N's
Skipping _OA974608_ due to excessive 'N's
Skipping _OU000525_ due to excessive 'N's
Skipping _MW808873_ due to excessive 'N's
Skipping _OU086492_ due to excessive 'N's
Skipping _OU174835_ due to excessive 'N's
Skipping _LR862459_ due to excessive 'N's
Skipping _OA974929_ due to excessive 'N's
Skipping _LR862456_ due to excessive 'N's
Skipping _OE996363_ due to excessive 'N's
Skipping _OA973697_ due to excessive 'N's
Skipping _OE999021_ due to excessive 'N's
Skipping _OE999350_ due to excessi

Processing lineages:  90%|███████████████████████████████████████████████████████▊      | 9/10 [02:24<00:15, 15.37s/it]

Skipping _LR882100_ due to excessive 'N's
Skipping _OC997157_ due to excessive 'N's
Skipping _OE995840_ due to excessive 'N's
Skipping _OE995351_ due to excessive 'N's
Skipping _OV001264_ due to excessive 'N's
Skipping _OV302988_ due to excessive 'N's
Skipping _OU834015_ due to excessive 'N's
Skipping _OU534039_ due to excessive 'N'sSkipping _OV318008_ due to excessive 'N's

Skipping _OV005169_ due to excessive 'N's
Skipping _OV510051_ due to excessive 'N's
Skipping _OV267517_ due to excessive 'N's
Skipping _OV248564_ due to excessive 'N's
Skipping _OU870895_ due to excessive 'N's
Skipping _OU715184_ due to excessive 'N's
Skipping _OU947169_ due to excessive 'N's
Skipping _OU990841_ due to excessive 'N's
Skipping _OU542915_ due to excessive 'N's
Skipping _OU908973_ due to excessive 'N's
Skipping _OU664535_ due to excessive 'N's
Skipping _OU541735_ due to excessive 'N's
Skipping _OV204847_ due to excessive 'N's
Skipping _OU761663_ due to excessive 'N's
Skipping _OV333020_ due to excessi

Skipping _OU856904_ due to excessive 'N's
Skipping _OV161325_ due to excessive 'N's
Skipping _OV558739_ due to excessive 'N's
Skipping _OV304304_ due to excessive 'N's
Skipping _OV629437_ due to excessive 'N's
Skipping _OV056580_ due to excessive 'N's
Skipping _OV150063_ due to excessive 'N's
Skipping _OV482575_ due to excessive 'N's
Skipping _OU788766_ due to excessive 'N'sSkipping _OU877566_ due to excessive 'N's

Skipping _OU671880_ due to excessive 'N's
Skipping _OV522399_ due to excessive 'N's
Skipping _OV553542_ due to excessive 'N's
Skipping _OV017880_ due to excessive 'N's
Skipping _OU983898_ due to excessive 'N's
Skipping _OU767623_ due to excessive 'N's
Skipping _OV027700_ due to excessive 'N's
Skipping _OV062162_ due to excessive 'N's
Skipping _OU873771_ due to excessive 'N's
Skipping _OV046698_ due to excessive 'N's
Skipping _OV173810_ due to excessive 'N's
Skipping _OV087628_ due to excessive 'N's
Skipping _OV126507_ due to excessive 'N's
Skipping _OU803795_ due to excessi

Skipping _OV175915_ due to excessive 'N's
Skipping _OV588930_ due to excessive 'N's
Skipping _OV097038_ due to excessive 'N's
Skipping _OV306394_ due to excessive 'N's
Skipping _OV145160_ due to excessive 'N's
Skipping _OV597059_ due to excessive 'N's
Skipping _OU984135_ due to excessive 'N's
Skipping _OV273202_ due to excessive 'N's
Skipping _OU868696_ due to excessive 'N's
Skipping _OU842321_ due to excessive 'N's
Skipping _OV269364_ due to excessive 'N's
Skipping _OU712208_ due to excessive 'N's
Skipping _OV384162_ due to excessive 'N's
Skipping _OV582497_ due to excessive 'N's
Skipping _OV546096_ due to excessive 'N's
Skipping _OU836208_ due to excessive 'N's
Skipping _OV292424_ due to excessive 'N's
Skipping _OV366950_ due to excessive 'N's
Skipping _OV391067_ due to excessive 'N's
Skipping _OV176109_ due to excessive 'N's
Skipping _OV294980_ due to excessive 'N's
Skipping _OU950580_ due to excessive 'N'sSkipping _OU783347_ due to excessive 'N's

Skipping _OV054489_ due to excessi

Processing lineages: 100%|█████████████████████████████████████████████████████████████| 10/10 [02:41<00:00, 16.17s/it]

Skipping _OU960246_ due to excessive 'N's
Skipping _OV415166_ due to excessive 'N's
Skipping _OV094665_ due to excessive 'N's
Skipping _OU524955_ due to excessive 'N's
Skipping _OU863869_ due to excessive 'N's
Skipping _OV038582_ due to excessive 'N's
Download completed!





In [5]:
import random
import requests
import io
import re
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import concurrent.futures

# Read the tab-separated metadata file; the first line is skipped below as the header.
with open("./data/metadata", "r") as file:
    metadata_lines = file.readlines()

# Maps each lineage to the accession ids listed for it.
lineage_samples = defaultdict(list)
unique_lineages = set()

# Prepare a list to hold selected samples
samples_list = []

for line in metadata_lines[1:]:
    columns = line.strip().split("\t")
    if len(columns) < 2:
        # Blank or truncated row (e.g. a trailing empty line): there is no
        # lineage column to index, so skip it instead of raising IndexError.
        continue
    accession_id = columns[0]
    lineage = columns[1]
    lineage_samples[lineage].append(accession_id)
    unique_lineages.add(lineage)

max_sequences_per_lineage = 500

# Keep up to 10 lineages that can supply a full batch of samples.
# Iterating a set of strings has no stable order across interpreter runs, so
# the candidates are sorted first to make the selection reproducible.
selected_lineages = [
    lineage
    for lineage in sorted(unique_lineages)
    if len(lineage_samples[lineage]) >= max_sequences_per_lineage
][:10]

max_sequences = max_sequences_per_lineage * len(selected_lineages)

# define download function
def download_sample(sample, lineage):
    """Download one FASTA payload from EBI dbfetch and save it unmodified.

    The response text is written verbatim to
    ``./data/fasta_files/<filename>.fasta``.

    Args:
        sample: Accession id; interpolated into the request URL as-is.
        lineage: Lineage label recorded alongside the sample.

    Returns:
        ``{'id': sample, 'lineage': lineage}`` when a file was written,
        otherwise ``None`` (request failure or non-200 response).
    """
    url = f"https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ena_sequence&id={sample}&format=fasta&style=raw&Retrieve=Retrieve"
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print(f"Skipping {sample}: request failed ({exc})")
        return None
    if response.status_code != 200:
        return None

    # Replace characters that are unsafe in file names (and parentheses) with '_'.
    filename = re.sub(r'[\\/:\*\?"<>\|()]', '_', sample)
    with io.open(f"./data/fasta_files/{filename}.fasta", "w", encoding="utf-8") as seq_file:
        seq_file.write(response.text)

    # Only the identifiers are returned; the sequence itself lives in the file.
    return {'id': sample, 'lineage': lineage}

# Download every selected lineage through one shared pool of 20 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    for lineage in tqdm(selected_lineages, desc="Processing lineages"):
        # Draw this lineage's batch and drop it from the lineage's remaining pool.
        samples = random.sample(lineage_samples[lineage], max_sequences_per_lineage)
        remaining = set(lineage_samples[lineage]) - set(samples)
        lineage_samples[lineage] = list(remaining)

        futures = set()
        for sample in samples:
            futures.add(executor.submit(download_sample, sample, lineage))

        # Collect results in completion order.
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if not result:
                # download_sample returned nothing for this sample.
                continue
            samples_list.append(result)

print("Download completed!")

# Build the manifest (one row per returned sample) and write it out as CSV.
df = pd.DataFrame(samples_list)
df.to_csv("./data/dataset.csv", index=False)

Mean length: 29809.7322


In [7]:
import pandas as pd
import os
from tqdm import tqdm

# Full metadata table, read as tab-separated text.
metadata = pd.read_csv('./data/metadata', delimiter='\t')

# For every *.fasta entry in the directory, take the second '_'-separated
# piece of the file name as its id.
fasta_dir = './data/xes_files/'
fasta_files = os.listdir(fasta_dir)
fasta_ids = [
    name.split('_')[1]
    for name in tqdm(fasta_files)
    if name.endswith('.fasta')
]

# Keep only the metadata rows whose 'id' appears among the collected ids.
id_mask = metadata['id'].isin(fasta_ids)
subdata = metadata[id_mask]

subdata.to_csv('sampled_data.csv', index=False)


  metadata = pd.read_csv('./data/metadata', delimiter='\t')
100%|█████████████████████████████████████████████████████████████████████████| 4982/4982 [00:00<00:00, 1251633.57it/s]
