In [None]:
import pandas as pd
import re

CLSTR_PATH = "Main_Clustered_updated.clstr.clstr"
CLSTR_COLUMNS = ["ID", "Cluster", "Representative_ID", "Cluster_Identity"]

# Headers of the form ">sp|ACC|..." / ">tr|ACC|...": keep only the text between the bars.
_UNIPROT_ID_RE = re.compile(r">(?:sp|tr)\|([^|]+)\|")
# Fallback for every other header: the text between ">" and the first "...".
_GENERIC_ID_RE = re.compile(r">(.*?)\.\.\.")
# Identity of a member line. Besides the plain "at 99.51%" form this also accepts a
# non-blank prefix before the number (e.g. "at +/99.51%" or "at 1:100:1:100/99.51%"),
# which the stricter pattern used before would miss.
_IDENTITY_RE = re.compile(r"at\s+\S*?(\d+(?:\.\d+)?)%")


def _flush_cluster(members, cluster_id, rows):
    """Append the finished cluster's members to ``rows`` and empty ``members``.

    ``members`` holds ``(seq_id, identity, is_representative)`` tuples. Every row
    gets the ID of the cluster's representative (``None`` if no member is marked
    as such); the representative itself is given an identity of 100.0.
    """
    rep_id = next((seq_id for seq_id, _, is_rep in members if is_rep), None)
    for seq_id, identity, is_rep in members:
        rows.append([seq_id, cluster_id, rep_id, 100.0 if is_rep else identity])
    members.clear()


def parse_clstr(lines):
    """Parse the lines of a ``.clstr`` file into rows matching ``CLSTR_COLUMNS``.

    Parameters
    ----------
    lines : iterable of str
        Lines of the cluster file (an open file object works).

    Returns
    -------
    list of list
        One ``[ID, Cluster, Representative_ID, Cluster_Identity]`` row per
        sequence line whose ID could be extracted.

    Notes
    -----
    The representative is recognised by the trailing ``*`` on its line, not by
    the absence of a parsable identity. A member line whose identity cannot be
    parsed therefore gets a missing identity instead of silently being treated
    as a 100 % representative.
    """
    rows = []
    members = []       # members of the cluster currently being read
    cluster_id = None

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">Cluster"):
            # A new cluster header closes the previous cluster.
            _flush_cluster(members, cluster_id, rows)
            cluster_id = line.split()[1]
            continue

        id_match = _UNIPROT_ID_RE.search(line) or _GENERIC_ID_RE.search(line)
        if id_match is None:
            continue  # no recognisable sequence ID on this line

        identity_match = _IDENTITY_RE.search(line)
        identity = float(identity_match.group(1)) if identity_match else None
        members.append((id_match.group(1), identity, line.endswith("*")))

    # The last cluster has no following header to trigger its flush.
    _flush_cluster(members, cluster_id, rows)
    return rows


with open(CLSTR_PATH, "r") as clstr_file:
    data = parse_clstr(clstr_file)

clstr_df = pd.DataFrame(data, columns=CLSTR_COLUMNS)

clstr_df.to_csv("Main_clusters.csv", index=False)
clstr_df


Unnamed: 0,ID,Cluster,Representative_ID,Cluster_Identity
0,A0A6G7NUF9,0,NP_044190.1,71.27
1,AEK79911.1,0,NP_044190.1,61.95
2,AGT75466.1,0,NP_044190.1,61.83
3,AP_000051.1,0,NP_044190.1,99.51
4,APD78427.1,0,NP_044190.1,62.64
...,...,...,...,...
2959,WP_312465702.1,2148,WP_312465702.1,100.00
2960,A0A6G6XTG8,2149,A0A6G6XTG8,100.00
2961,U5PWF8,2150,U5PWF8,100.00
2962,A0A2M7RBY5,2151,A0A2M7RBY5,100.00


In [None]:
import pandas as pd

# Load the hit table and attach the cluster assignment of every hit.
main = pd.read_csv("FULL_results+GenomeID+Tax.csv")

cluster_columns = ["ID", "Cluster", "Representative_ID", "Cluster_Identity"]
merged_df = pd.merge(
    main,
    clstr_df[cluster_columns],
    left_on="Hit",
    right_on="ID",
    how="left",  # keep hits that have no cluster entry
)
# The cluster table's join key is not needed once the rows are matched.
merged_df = merged_df.drop(columns=["ID"])

merged_df.to_csv("FULL_results+GenomeID+Tax+Cluster.csv", index=False)

merged_df

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,...,phylum,class,order,family,genus,species,GenomeID,Cluster,Representative_ID,Cluster_Identity
0,AOC84064.1,AOC84064.1,352,99.148,1.000,0.000000,716.0,679,352,0.518409,...,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Aviadenovirus,Fowl aviadenovirus E,AOC84064.1,23,AOC84064.1,100.00
1,ANA50312.1,ANA50312.1,354,98.023,1.000,0.000000,711.0,679,353,0.519882,...,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Aviadenovirus,Fowl aviadenovirus E,ANA50312.1,23,AOC84064.1,77.78
2,XEQ86939.1,XEQ86939.1,374,99.465,1.000,0.000000,752.0,671,374,0.557377,...,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,Human adenovirus sp.,XEQ86939.1,2,AFD22004.1,79.49
3,QOV03173.1,QOV03173.1,378,72.487,1.000,0.000000,549.0,671,376,0.560358,...,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,Human mastadenovirus F,QOV03173.1,83,A0A7S6TZ10,100.00
4,AGT76236.1,AGT76236.1,442,74.661,1.000,0.000000,573.0,671,430,0.640835,...,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,Human mastadenovirus B,AGT76236.1,2,AFD22004.1,76.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304,A0A1V5MQJ2,MWAL01000503,489,9.400,0.992,0.009964,85.0,559,380,0.679785,...,Bacteroidota,,,,,Bacteroidetes bacterium ADurb.Bin416,MWAL01000503,244,A0A1V5MQJ2,100.00
3305,A0A3B1EJS3,MF990902,445,10.100,0.961,0.009964,74.0,559,385,0.688730,...,,,,,,uncultured bacterium,MF990902,1656,A0A3B1EJS3,100.00
3306,A0A2I7RRS2,MG592590,540,11.200,0.933,0.009964,70.0,559,413,0.738819,...,Uroviricota,Caudoviricetes,,,,Vibrio phage 1.223.O._10N.261.48.A9,MG592590,1562,A0A2I7RRS2,100.00
3307,A0A6H0X6N1,MT259468,599,10.600,0.923,0.009964,69.0,559,387,0.692308,...,Uroviricota,Caudoviricetes,,Autographiviridae,,Aeromonas phage PS,MT259468,1529,A0A6H0X6N1,100.00


In [None]:
# Hits whose "Cluster" value is missing after the left merge with the cluster table.
merged_df.loc[merged_df["Cluster"].isna()]

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,...,phylum,class,order,family,genus,species,GenomeID,Cluster,Representative_ID,Cluster_Identity
221,2EX3_B,2EX3_B,196,100.000,1.000,3.110000e-140,403.0,266,196,0.736842,...,Uroviricota,Caudoviricetes,,Salasmaviridae,Salasvirus,Salasvirus phi29,,,,
514,ZHBPT9,ZHBPT9,182,15.385,1.000,4.230000e-46,160.0,163,160,0.981595,...,Uroviricota,Caudoviricetes,,Straboviridae,Tequatrovirus,Tequatrovirus T4,,,,
795,J8L4N5,,245,99.100,1.000,3.569000e-35,1041.0,245,245,1.000000,...,Bacillota,Bacilli,Bacillales,Bacillaceae,Bacillus,Bacillus cereus,,,,
930,UPI00006CA333,,196,100.000,1.000,8.702000e-27,1217.0,266,196,0.736842,...,Uroviricota,Caudoviricetes,,Salasmaviridae,Salasvirus,Salasvirus phi29,,,,
947,A0A4Q0MDR8,,379,65.400,1.000,1.564000e-25,565.0,679,378,0.556701,...,Thermodesulfobacteriota,Desulfovibrionia,Desulfovibrionales,Desulfovibrionaceae,Desulfovibrio,Desulfovibrio sp. DS-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3193,UPI00006CA333,,195,14.300,1.000,6.370000e-03,122.0,255,185,0.725490,...,Uroviricota,Caudoviricetes,,Salasmaviridae,Salasvirus,Salasvirus phi29,,,,
3207,A0A2E3M3S3,,144,13.100,1.000,6.899000e-03,105.0,163,134,0.822086,...,Pseudomonadota,Alphaproteobacteria,Rickettsiales,,,Rickettsiales bacterium,,,,
3248,A0A495GE66,,498,9.800,0.992,8.323000e-03,85.0,559,375,0.670841,...,Pseudomonadota,Betaproteobacteria,Burkholderiales,Burkholderiaceae,Paraburkholderia,Paraburkholderia sediminicola,,,,
3291,UPI001F146548,,645,10.000,0.817,9.525000e-03,62.0,559,510,0.912343,...,Uroviricota,Caudoviricetes,,,,Serratia phage KpZh_1,,,,


In [None]:
import pandas as pd

# Table mapping representative IDs to their replacement ("synonym") IDs.
# NOTE(review): "Synomym_E-value" looks like a typo for "Synonym_E-value"; it is kept
# unchanged because it is the column name written to the output CSV.
replace_df = pd.read_csv("id_replacement.csv")
replace_df = replace_df.rename(
    columns={
        "UniProtID": "Representative_ID",
        "NEW_GenBankID": "Rep_Synonym_ID",
        "%I": "Synonym_Identity",
        "E-value": "Synomym_E-value",
        "Bit-Score": "Synonym_Bit-Score",
        "len(Qry)": "len(Qry)",
    }
)

main = pd.read_csv("FULL_results+GenomeID+Tax+Cluster.csv")

synonym_columns = [
    "Representative_ID",
    "Rep_Synonym_ID",
    "Synonym_Identity",
    "Synomym_E-value",
    "Synonym_Bit-Score",
]
merged_df = pd.merge(main, replace_df[synonym_columns], on="Representative_ID", how="left")

# Clustered rows without a synonym: the representative ID is used as its own synonym.
condition = merged_df["Cluster"].notna() & merged_df["Rep_Synonym_ID"].isna()
merged_df.loc[condition, "Rep_Synonym_ID"] = merged_df.loc[condition, "Representative_ID"]
merged_df.loc[condition, "Synonym_Identity"] = 100  # synonym is the representative itself
merged_df.loc[condition, "Synomym_E-value"] = 0
# The bit score cannot be calculated for these rows and is therefore left as NaN.

merged_df = merged_df.drop_duplicates()
merged_df.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms.csv")

# "Hit" values that occur more than once (every occurrence after the first).
duplicated_values = merged_df.loc[merged_df["Hit"].duplicated(), "Hit"]
duplicated_values

Unnamed: 0,Hit
982,A0A4Q0MDR8
997,A0A4Q0MDR8
1007,UPI00006CA333
1013,A0A4Q0MC37
1016,A0A4Q0MC37
...,...
3273,A0A133CZY9
3279,Q9Z7G7
3280,Q9Z7G7
3281,Q9Z7G7


In [None]:
# Raw IPG table: rows whose "Id" is the literal string "Id" are removed
# (presumably header lines repeated inside the file - confirm against the export).
IPG_raw = pd.read_csv("IPG_results_2.tsv", sep="\t")
IPG_raw = IPG_raw.loc[IPG_raw["Id"] != "Id"].rename(columns={"Id": "IPG_ID"})

# Look up the IPG ID of each representative synonym via its protein accession.
merger = pd.merge(
    merged_df,
    IPG_raw[["IPG_ID", "Protein"]],
    left_on="Rep_Synonym_ID",
    right_on="Protein",
    how="left",
)
merger = merger.drop(columns=["Protein"])
# Cast the key to float64 before it is used to join against the filtered IPG table.
merger["IPG_ID"] = merger["IPG_ID"].astype("float64")

IPG = pd.read_csv("IPG_results_FILTERED.csv")
IPG = IPG.rename(
    columns={
        "Id": "IPG_ID",
        "Nucleotide Accession": "Reference_GenomeID",
        "Start": "IPG_Start",
        "Stop": "IPG_Stop",
        "Strand": "IPG_Strand",
        "Protein": "IPG_Synonym",
        "Protein Name": "IPG_Function",
        "Organism": "IPG_Organism",
        "len": "IPG_len",
    }
)

ipg_columns = [
    "IPG_ID",
    "IPG_Synonym",
    "IPG_len",
    "IPG_Function",
    "Reference_GenomeID",
    "IPG_Start",
    "IPG_Stop",
    "IPG_Strand",
    "IPG_Organism",
    "Assembly",
]
merger2 = pd.merge(merger, IPG[ipg_columns], on="IPG_ID", how="left")
merger2 = merger2.drop_duplicates()
merger2.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG.csv")
merger2

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,...,IPG_ID,IPG_Synonym,IPG_len,IPG_Function,Reference_GenomeID,IPG_Start,IPG_Stop,IPG_Strand,IPG_Organism,Assembly
0,AOC84064.1,AOC84064.1,352,99.148,1.000,0.000000,716.0,679,352,0.518409,...,115142531.0,AOC84064.1,2211.0,pTP protein,KX258422.1,11243.0,13453.0,-,Fowl aviadenovirus E,GCA_006436795.1
1,ANA50312.1,ANA50312.1,354,98.023,1.000,0.000000,711.0,679,353,0.519882,...,115142531.0,AOC84064.1,2211.0,pTP protein,KX258422.1,11243.0,13453.0,-,Fowl aviadenovirus E,GCA_006436795.1
2,XEQ86939.1,XEQ86939.1,374,99.465,1.000,0.000000,752.0,671,374,0.557377,...,27156601.0,AFD22004.1,5184.0,pre-terminal protein,JN880452.1,8420.0,13603.0,-,Simian adenovirus A1285,GCA_006446335.1
3,QOV03173.1,QOV03173.1,378,72.487,1.000,0.000000,549.0,671,376,0.560358,...,369451248.0,QOV03173.1,1707.0,terminal protein precursor pTP,MT790999.1,8827.0,10533.0,-,Human adenovirus 41,GCA_015244835.1
4,AGT76236.1,AGT76236.1,442,74.661,1.000,0.000000,573.0,671,430,0.640835,...,27156601.0,AFD22004.1,5184.0,pre-terminal protein,JN880452.1,8420.0,13603.0,-,Simian adenovirus A1285,GCA_006446335.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4218,A0A1V5MQJ2,MWAL01000503,489,9.400,0.992,0.009964,85.0,559,380,0.679785,...,141534802.0,OPZ95543.1,1611.0,Polygalacturonase,MWAL01000503.1,6.0,1616.0,+,Bacteroidetes bacterium ADurb.Bin416,GCA_002069345.1
4219,A0A3B1EJS3,MF990902,445,10.100,0.961,0.009964,74.0,559,385,0.688730,...,187629842.0,AWJ66275.1,1245.0,nitrous oxide reductase maturation protein,MF990902.1,4410.0,5654.0,+,uncultured bacterium,
4220,A0A2I7RRS2,MG592590,540,11.200,0.933,0.009964,70.0,559,413,0.738819,...,176261219.0,AUR96338.1,2475.0,coil containing protein,MG592590.1,28534.0,31008.0,+,Vibrio phage 1.223.O._10N.261.48.A9,GCA_003929855.1
4221,A0A6H0X6N1,MT259468,599,10.600,0.923,0.009964,69.0,559,387,0.692308,...,312459902.0,QIW89971.1,2571.0,hypothetical protein,MT259468.1,29952.0,32522.0,+,Aeromonas phage PS,GCA_012360715.1


In [None]:
import pandas as pd

# Genome identifiers listed as non-viral: the part of each line before "_PHROG".
# Lines are stripped first so that an entry without "_PHROG" does not keep its
# trailing newline, and blank lines are skipped.
with open("NON-viral_genomes_DEF.txt", "r") as NV:
    non_viral_list = [line.strip().split("_PHROG")[0] for line in NV if line.strip()]

# Order-preserving de-duplication: a genome listed twice would otherwise duplicate
# every matching row in the left merge below.
non_viral_list = list(dict.fromkeys(non_viral_list))

nv_df = pd.DataFrame(non_viral_list, columns=["GenomeID_v"])
nv_df["Viral_Presence"] = "N"

merger2 = pd.read_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG.csv")

# Temporary join key: the reference genome accession up to its first "."
# (added with assign() so that merger2 itself is left untouched).
genome_key = merger2["Reference_GenomeID"].str.split(".").str[0]
merger3 = (
    merger2.assign(GenomeID_v=genome_key)
    .merge(nv_df[["GenomeID_v", "Viral_Presence"]], on="GenomeID_v", how="left")
    .drop(columns=["GenomeID_v"])
)
merger3.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral.csv")

In [None]:
import pandas as pd

# eggNOG annotation table, keyed by the IPG synonym.
eggNOG = pd.read_csv("TP_annotation_eggNOG.tsv", sep="\t", comment="#")
eggNOG = eggNOG.rename(columns={"query": "IPG_Synonym", "Description": "eggNOG_Annotation"})

merger4 = pd.merge(
    merger3,
    eggNOG[["IPG_Synonym", "eggNOG_Annotation"]],
    on="IPG_Synonym",
    how="left",
)
# Move the eggNOG annotation so that it sits next to the IPG-derived annotation:
# after the pop, it is re-inserted seven columns from the end.
col = merger4.pop("eggNOG_Annotation")
position = merger4.shape[1] - 7
merger4.insert(position, "eggNOG_Annotation", col)

# eggNOG novel families, reduced to the same two columns.
eggNOVEL = pd.read_csv("TP_annotation_eggNOG_novel.tsv", sep="\t", comment="#")
eggNOVEL = eggNOVEL.rename(columns={"query": "IPG_Synonym", "novel_fam": "eggNOG_Annotation"})
eggNOVEL = eggNOVEL[["IPG_Synonym", "eggNOG_Annotation"]]
merger5 = pd.merge(merger4, eggNOVEL, on="IPG_Synonym", how="left")

# Combine both annotation columns, giving priority to the standard annotation (_x);
# the novel-family value (_y) is used only where the standard one is missing.
merger5["eggNOG_Annotation_x"] = merger5["eggNOG_Annotation_x"].fillna(
    merger5["eggNOG_Annotation_y"]
)
merger5 = merger5.drop(columns=["eggNOG_Annotation_y"]).rename(
    columns={"eggNOG_Annotation_x": "eggNOG_Annotation"}
)

merger5.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral+eggNOG.csv")

In [None]:
import pandas as pd

# Per-genome ORF / polymerase metrics, renamed to the column names used in the results table.
orf_metrics_df = pd.read_csv("ORF_metrics_DEF+pol.tsv", sep="\t")
orf_metrics_df = orf_metrics_df.rename(
    columns={
        "Genome": "Genome_a",
        "Nr_ORF": "PHROG_Nr_ORF",
        "Min_Distance": "Min_Abs_Distance_TP",
        "Mean": "Mean_Distance_TP",
        "Median": "Median_Distance_TP",
    }
)

orf_columns = [
    "Genome_a",
    "PHROG_Nr_ORF",
    "Min_Abs_Distance_TP",
    "Mean_Distance_TP",
    "Median_Distance_TP",
    "pPolB",
    "pPolB_distance",
    "Polymerase",
    "Pol_distance",
]

# merger5 is taken from the previous cell. It could instead be reloaded with
# pd.read_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral+eggNOG.csv")

# Temporary join key: the reference genome accession up to its first "."
# (added with assign() so that merger5 itself is not modified).
genome_key = merger5["Reference_GenomeID"].str.split(".").str[0]
merger6 = (
    merger5.assign(Genome_a=genome_key)
    .merge(orf_metrics_df[orf_columns], on="Genome_a", how="left")
    .drop(columns=["Genome_a"])
)

# Remove index columns leaked by earlier CSV round-trips ("Unnamed: 0", "Unnamed: 0.1", ...).
# Dropping them by prefix avoids a KeyError when merger5 carries none of them.
leaked_index_columns = [name for name in merger6.columns if str(name).startswith("Unnamed:")]
merger6 = merger6.drop(columns=leaked_index_columns)

merger6.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral+eggNOG+Pol.csv")
merger6

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,...,Assembly,Viral_Presence,PHROG_Nr_ORF,Min_Abs_Distance_TP,Mean_Distance_TP,Median_Distance_TP,pPolB,pPolB_distance,Polymerase,Pol_distance
0,AOC84064.1,AOC84064.1,352,99.148,1.000,0.000000,716.0,679,352,0.518409,...,GCA_006436795.1,,2.0,1.0,-5.500000,-5.5,Y,-1.0,phrog_1907,-1.0
1,ANA50312.1,ANA50312.1,354,98.023,1.000,0.000000,711.0,679,353,0.519882,...,GCA_006436795.1,,2.0,1.0,-5.500000,-5.5,Y,-1.0,phrog_1907,-1.0
2,XEQ86939.1,XEQ86939.1,374,99.465,1.000,0.000000,752.0,671,374,0.557377,...,GCA_006446335.1,N,,,,,,,,
3,QOV03173.1,QOV03173.1,378,72.487,1.000,0.000000,549.0,671,376,0.560358,...,GCA_015244835.1,,1.0,5.0,-5.000000,-5.0,Y,-5.0,phrog_1907,-5.0
4,AGT76236.1,AGT76236.1,442,74.661,1.000,0.000000,573.0,671,430,0.640835,...,GCA_006446335.1,N,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3304,A0A1V5MQJ2,MWAL01000503,489,9.400,0.992,0.009964,85.0,559,380,0.679785,...,GCA_002069345.1,N,,,,,,,,
3305,A0A3B1EJS3,MF990902,445,10.100,0.961,0.009964,74.0,559,385,0.688730,...,,,6.0,1.0,8.333333,9.5,N,,-,
3306,A0A2I7RRS2,MG592590,540,11.200,0.933,0.009964,70.0,559,413,0.738819,...,GCA_003929855.1,,37.0,5.0,-7.486486,-12.0,N,,-,
3307,A0A6H0X6N1,MT259468,599,10.600,0.923,0.009964,69.0,559,387,0.692308,...,GCA_012360715.1,,24.0,1.0,-11.333333,-10.5,N,,phrog_9556,-26.0


In [None]:
# Addition of COG Categories

import pandas as pd

merger6 = pd.read_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral+eggNOG+Pol.csv")
# The input file was written together with its index, which read_csv returns as an
# "Unnamed: 0" column. Drop it so it does not end up in the final table
# (errors="ignore" keeps this working if the file is ever written without the index).
merger6 = merger6.drop(columns=["Unnamed: 0"], errors="ignore")

eggNOG = pd.read_csv("TP_annotation_eggNOG.tsv", sep="\t", comment="#")
eggNOG = eggNOG.rename(columns={"query": "IPG_Synonym", "COG_category": "COG_Category"})

merger7 = merger6.merge(eggNOG[["IPG_Synonym", "COG_Category"]], on="IPG_Synonym", how="left")

# Re-insert the COG category 15 columns from the end of the table
# (presumably right after the eggNOG annotation - confirm against the column order).
col = merger7.pop("COG_Category")
position = len(merger7.columns) - 15
merger7.insert(position, "COG_Category", col)

merger7.to_csv("FULL_results+GenomeID+Tax+Cluster+BLASTp_Synonyms+IPG+Non-Viral+eggNOG+Pol+COG.csv", index=False)
