In [None]:
# Model whose table is processed; used as the key into `table_names` and `sourmash_dir`.
# Options: 'Escherichia_coli', 'Salmonella', 'Staphylococcus', 'General_Model',
# 'Escherichia_coli_General_Model', 'Salmonella_General_Model', 'Staphylococcus_General_Model'
Model_name = 'General_Model'

# Two genomes are linked into the same cluster when their distance
# (1 - similarity from the sourmash comparison matrix) is <= this value.
Sourmash_threshold = 0.05

# Initialization

In [None]:
# Project folder on the mounted drive.
root_dir = '/content/drive/MyDrive/MRSA datasets'

# Folder holding the per-model tables (relative path).
table_dir = 'Tables/Models Tables/'

# Model name -> CSV file name inside `table_dir`.
table_names = dict(
    Escherichia_coli='Escherichia coli_Model_Table.csv',
    Salmonella='Salmonella enterica_Model_Table.csv',
    Staphylococcus='Staphylococcus aureus_Model_Table.csv',
    General_Model='General_Model_Table.csv',
    Escherichia_coli_General_Model='Escherichia coli_Species_Model_Table.csv',
    Salmonella_General_Model='Salmonella enterica_Species_Model_Table.csv',
    Staphylococcus_General_Model='Staphylococcus aureus_Species_Model_Table.csv',
)

homology_dir = 'Homology_splitting/'

# Model name -> Sourmash working folder; every model shares the same one.
sourmash_dir = dict.fromkeys(table_names, homology_dir + 'Sourmash/')

# The clustered tables are written next to the input tables.
final_table_dir = table_dir

In [None]:
!pip install sourmash

Collecting sourmash
  Downloading sourmash-4.9.4-py3-none-manylinux_2_28_x86_64.whl.metadata (9.6 kB)
Collecting screed<2,>=1.1.3 (from sourmash)
  Downloading screed-1.1.3.tar.gz (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.1/144.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting deprecation>=2.0.6 (from sourmash)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting bitstring<5,>=3.1.9 (from sourmash)
  Downloading bitstring-4.3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting bitarray<4.0,>=3.0.0 (from bitstring<5,>=3.1.9->sourmash)
  Downloading bitarray-3.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading sourmash-4.9.4-py3-none-manylinux_2_28_x86_64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (with force_remount=True).
drive.mount('/content/drive', force_remount=True)
# Make the project folder the working directory; the relative paths used in
# this notebook ('Tables/...', 'Homology_splitting/...', 'Datasets/...') are resolved from there.
%cd /content/drive/MyDrive/MRSA\ datasets

Mounted at /content/drive
/content/drive/MyDrive/MRSA datasets


In [None]:
import pandas as pd
import networkx as nx
import math
import shlex
import numpy as np

# Sourmash

In [None]:
# Load the table of the selected model.
table_df = pd.read_csv(table_dir + table_names[Model_name])

# Relative path of each genome file: Datasets/<Species>/<Dataset>/<File Name>.
# Plain string concatenation raises a TypeError if any of the three values is
# not a string (e.g. a missing value), so bad rows are not silently written out.
table_df['path'] = [
    'Datasets/' + species + '/' + dataset + '/' + file_name
    for species, dataset, file_name in zip(table_df['Species'], table_df['Dataset'], table_df['File Name'])
]

# Save the file list as plain text, one path per line. `Series.to_csv` would
# apply CSV quoting to any path containing a comma or a double quote, and the
# list is later passed to `sourmash sketch --from-file`, which expects raw paths.
with open(f'{sourmash_dir[Model_name]}{Model_name}_filenames.txt', 'w', encoding='utf-8') as paths_file:
    paths_file.writelines(path + '\n' for path in table_df['path'])


In [None]:
# Number of rows in the model table.
table_df.shape[0]

10194

In [None]:
filenames_file = f'{sourmash_dir[Model_name]}{Model_name}_filenames.txt'
sig_file = f'{sourmash_dir[Model_name]}{Model_name}_genomes.sig'
!sourmash sketch dna -p k=31,scaled=1000 --from-file {shlex.quote(filenames_file)} --output {shlex.quote(sig_file)}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Kcalculated 1 signatures for 96 sequences in Datasets/Escherichia coli/PATRIC_e.coli/562.23144.fna
[K... reading sequences from Datasets/Salmonella enterica/PRJNA292666 & PRJNA292661/SRR2566923.fna
[Kcalculated 1 signatures for 220 sequences in Datasets/Salmonella enterica/PRJNA292666 & PRJNA292661/SRR2566923.fna
[K... reading sequences from Datasets/Staphylococcus aureus/PATRIC_genomes/1280.9259.fna
[Kcalculated 1 signatures for 68 sequences in Datasets/Staphylococcus aureus/PATRIC_genomes/1280.9259.fna
[K... reading sequences from Datasets/Salmonella enterica/PRJNA292666 & PRJNA292661/SRR1534879.fna
[Kcalculated 1 signatures for 46 sequences in Datasets/Salmonella enterica/PRJNA292666 & PRJNA292661/SRR1534879.fna
[K... reading sequences from Datasets/Staphylococcus aureus/PATRIC_genomes/1280.30197.fna
[Kcalculated 1 signatures for 53 sequences in Datasets/Staphylococcus aureus/PATRIC_genomes/1280.30197.fna
[K

In [None]:
npy_file = f'{sourmash_dir[Model_name]}{Model_name}_dist.npy'
!sourmash compare {shlex.quote(sig_file)} -o {shlex.quote(npy_file)}

In [None]:
# Labels written next to the comparison matrix, one per line. Read inside a
# `with` block so the file handle is closed.
with open(npy_file + '.labels.txt') as labels_file:
    labels = [line.strip() for line in labels_file]
sim = np.load(npy_file)      # Similarity matrix
dist = 1 - sim               # Convert to distance

# Every label must correspond to exactly one row/column of the matrix.
assert sim.shape == (len(labels), len(labels)), "labels do not match the comparison matrix"

# Index pairs (i < j) whose distance is within the threshold. The strict upper
# triangle is evaluated on the whole matrix at once instead of looping over all
# pairs in Python; `argwhere` returns the pairs in the same row-major order.
close_pairs = np.argwhere(np.triu(dist <= Sourmash_threshold, k=1))
edges = [(labels[i], labels[j]) for i, j in close_pairs]

# Clusters are the connected components of the "close genomes" graph.
G = nx.Graph()
G.add_nodes_from(labels)   # labels without any edge become single-member clusters
G.add_edges_from(edges)
clusters = list(nx.connected_components(G))

In [None]:
def get_cluster(file_name, clusters):
    """Return the position of the first cluster that contains `file_name`.

    `clusters` is a sequence of containers. When none of them contains
    `file_name`, the string "None" (not the `None` object) is returned.
    """
    matching_positions = (
        position
        for position, members in enumerate(clusters)
        if file_name in members
    )
    return next(matching_positions, "None")

In [None]:
# Build the label -> cluster index lookup once instead of scanning every cluster
# for every row. Connected components are disjoint, so each label has exactly one
# index. Paths that are in no cluster get the string "None", the same value
# `get_cluster` returns for them.
cluster_index = {label: idx for idx, members in enumerate(clusters) for label in members}
table_df['clusters'] = table_df['path'].map(lambda path: cluster_index.get(path, "None"))

# Omit the helper 'path' column from the saved table only. `table_df` itself is
# left intact so this cell can be re-run without a KeyError on 'path'.
table_df.drop(columns='path').to_csv(f"{final_table_dir}{table_names[Model_name][:-4]}_clusters.csv", index=False, sep=',', header=True, na_rep='N/A')

In [None]:
# How many table rows fall into each cluster.
cluster_sizes = table_df['clusters'].value_counts()
print(cluster_sizes)