<a href="https://colab.research.google.com/github/rukmals/Phylogenetic-Trees/blob/main/Phylogenetic_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading the data sheet


In [1]:
import pandas as pd
df = pd.read_excel('/content/drive/MyDrive/Bioinformatics/protein_tables.xlsx', sheet_name = None) 

## check datasheet informations

In [2]:
df.keys()

dict_keys(['NZ_CP014692.1 ', 'NZ_CP023657.1', 'NZ_CP022699.1', 'NZ_CP014687.1', 'NZ_LN606600.1', 'NZ_CP011120.1', 'NZ_CP015164.1', 'NZ_CP015168.1', 'NZ_CP021524.1', 'NZ_CP022374.1', 'NZ_AP018515.1', 'NZ_CP023189.1', 'NC_017100.1', 'NC_017121.1', 'NC_017125.1', 'NC_017146.1', 'NZ_LN609302.1', 'NC_017111.1', 'NC_017150.1', 'NC_017108.1', 'NZ_AP014881.1'])

In [3]:
len(df.keys())

21

In [4]:
df['NZ_CP014692.1 ']['Protein name']

0        nucleoside-diphosphate sugar epimerase
1                           glycosyltransferase
2       lipopolysaccharide biosynthesis protein
3                          hypothetical protein
4        twin-arginine translocase subunit TatB
                         ...                   
3257       AarF/ABC1/UbiB kinase family protein
3258               M3 family oligoendopeptidase
3259                        elongation factor 4
3260    lipopolysaccharide biosynthesis protein
3261         5,6-dimethylbenzimidazole synthase
Name: Protein name, Length: 3262, dtype: object

### Algorithm for find the common bacteria sets for our protein set 
##### extract start and stop loations for each protein set in the every sheet
##### If a given protein is present in more than one location on the same genome (multiple entries in the protein table of a given species), take the first occurrence (earliest row).

In [5]:
protein_list = ["TonB-dependent receptor", "LysR family transcriptional regulator", "helix-turn-helix domain-containing protein", "efflux transporter outer membrane subunit"]
sheet_list = df.keys()

common_bacteria_set = []
for sheet_name in sheet_list:
  sheet = df[sheet_name]
  protein_names = list(sheet['Protein name'])
  count = 0
  start_list = []
  stop_list = []
  info_dict = {}
  
  for protein in protein_list:
    start_ = []
    stop_ = []
    for proteinInsheet in protein_names:
      if (protein in proteinInsheet):
        position = protein_names.index(proteinInsheet)
        start = sheet.loc[position]['Start']
        stop = sheet.loc[position]['Stop']
        start_.append(start)
        stop_.append(stop)

    start_list.append(start_[0])
    stop_list.append(stop_[0])
        
    
  if len(start_list)==4 and len(stop_list)==4:
    info_dict["Bacteria"] = sheet_name
    info_dict["Start Positions"] = start_list
    info_dict["Stop Positions"] = stop_list
    print(info_dict)
    common_bacteria_set.append(info_dict)
print(common_bacteria_set)


{'Bacteria': 'NZ_CP014692.1 ', 'Start Positions': [91315, 63174, 55106, 999992], 'Stop Positions': [93567, 64115, 57319, 1001515]}
{'Bacteria': 'NZ_CP023657.1', 'Start Positions': [211872, 333, 17020, 559625], 'Stop Positions': [214211, 1220, 17271, 561190]}
{'Bacteria': 'NZ_CP022699.1', 'Start Positions': [30077, 18042, 40006, 117718], 'Stop Positions': [32524, 18938, 40275, 119154]}
{'Bacteria': 'NZ_CP014687.1', 'Start Positions': [72020, 14424, 202443, 768608], 'Stop Positions': [74434, 15347, 202718, 770161]}
{'Bacteria': 'NZ_LN606600.1', 'Start Positions': [387110, 481992, 1311595, 136496], 'Stop Positions': [390220, 482918, 1312500, 138073]}
{'Bacteria': 'NZ_CP011120.1', 'Start Positions': [521501, 45285, 320717, 154424], 'Stop Positions': [523747, 45650, 321211, 155908]}
{'Bacteria': 'NZ_CP015164.1', 'Start Positions': [155346, 121923, 748077, 98], 'Stop Positions': [157766, 122864, 749357, 1621]}
{'Bacteria': 'NZ_CP015168.1', 'Start Positions': [148024, 248637, 1086506, 561625]

In [6]:
!pip install Bio

Collecting Bio
  Downloading bio-1.3.3-py3-none-any.whl (271 kB)
[?25l[K     |█▏                              | 10 kB 16.1 MB/s eta 0:00:01[K     |██▍                             | 20 kB 21.0 MB/s eta 0:00:01[K     |███▋                            | 30 kB 25.1 MB/s eta 0:00:01[K     |████▉                           | 40 kB 28.5 MB/s eta 0:00:01[K     |██████                          | 51 kB 31.5 MB/s eta 0:00:01[K     |███████▎                        | 61 kB 26.0 MB/s eta 0:00:01[K     |████████▍                       | 71 kB 23.5 MB/s eta 0:00:01[K     |█████████▋                      | 81 kB 23.9 MB/s eta 0:00:01[K     |██████████▉                     | 92 kB 23.9 MB/s eta 0:00:01[K     |████████████                    | 102 kB 25.2 MB/s eta 0:00:01[K     |█████████████▎                  | 112 kB 25.2 MB/s eta 0:00:01[K     |██████████████▌                 | 122 kB 25.2 MB/s eta 0:00:01[K     |███████████████▊                | 133 kB 25.2 MB/s eta 0:00:01

## Read the Fasta files and extract the sequnces 

In [9]:
from Bio import SeqIO
protein_list = ["TonB-dependent receptor", "LysR family transcriptional regulator", "helix-turn-helix domain-containing protein", "efflux transporter outer membrane subunit"]
homogeneus_gene_seq = []
for bact_details in common_bacteria_set:
  bacteria = bact_details['Bacteria'].strip()
  fasta = {}
  for seq_record in SeqIO.parse("/content/drive/MyDrive/Bioinformatics/FASTAs/"+bacteria+".fasta", "fasta"):
    #print(seq_record.seq[:20])
    #fasta["Bacteria"] = bact_details['Bacteria']
    fasta["id"] = seq_record.id
    fasta['description'] = seq_record.description
    fasta["TonB-dependent receptor"] = seq_record.seq[bact_details['Start Positions'][0]:bact_details['Stop Positions'][0]]
    fasta["LysR family transcriptional regulator"] = seq_record.seq[bact_details['Start Positions'][1]:bact_details['Stop Positions'][1]]
    fasta["helix-turn-helix domain-containing protein"] = seq_record.seq[bact_details['Start Positions'][2]:bact_details['Stop Positions'][2]]
    fasta["efflux transporter outer membrane subunit"] = seq_record.seq[bact_details['Start Positions'][3]:bact_details['Stop Positions'][3]]
  print(fasta)
  homogeneus_gene_seq.append(fasta)

{'id': 'NZ_CP014692.1', 'description': 'NZ_CP014692.1 Acetobacter aceti strain TMW2.1153 chromosome, complete genome', 'TonB-dependent receptor': Seq('TAAAACCGCTGCGAAATGCCTGTCAGAATGGTCCGTCTCAGCCCGTACTGAGGC...CAT'), 'LysR family transcriptional regulator': Seq('TGCAGGGCGAAGCCAGCGATCTGAGATTTTTCATCAACCTCGTCGATGCGGGGA...TGA'), 'helix-turn-helix domain-containing protein': Seq('TGGGTCTGTTCGGCACCATTAAACCCGGGCTGAGGAAATCTCAGGCGGCTATTC...TGA'), 'efflux transporter outer membrane subunit': Seq('CATGAGGGCGTTTTCCTGTTGGCTGACGCCGGGTAGATGGTTTCCCAGCCGCCT...CAA')}
{'id': 'NZ_CP023657.1', 'description': 'NZ_CP023657.1 Acetobacter pomorum strain BDGP5 chromosome, complete genome', 'TonB-dependent receptor': Seq('TGTCCTTTCGGATTATTGCGCCGTCCCGCCCAGTTTTGTCCCGACTGACATACC...TAA'), 'LysR family transcriptional regulator': Seq('TATGCTGACCCAAGGCGACCCGCCATAAAGTCAACAAAAGCTCGGATTTTGGGT...CAT'), 'helix-turn-helix domain-containing protein': Seq('TGTTCGACAGACACACGCCGTCAACGGCGCGCTACCTGCGCACTCATGAAGCCG...TGA'), 'efflux 

## check the extracted data list

In [10]:
print(homogeneus_gene_seq)

[{'id': 'NZ_CP014692.1', 'description': 'NZ_CP014692.1 Acetobacter aceti strain TMW2.1153 chromosome, complete genome', 'TonB-dependent receptor': Seq('TAAAACCGCTGCGAAATGCCTGTCAGAATGGTCCGTCTCAGCCCGTACTGAGGC...CAT'), 'LysR family transcriptional regulator': Seq('TGCAGGGCGAAGCCAGCGATCTGAGATTTTTCATCAACCTCGTCGATGCGGGGA...TGA'), 'helix-turn-helix domain-containing protein': Seq('TGGGTCTGTTCGGCACCATTAAACCCGGGCTGAGGAAATCTCAGGCGGCTATTC...TGA'), 'efflux transporter outer membrane subunit': Seq('CATGAGGGCGTTTTCCTGTTGGCTGACGCCGGGTAGATGGTTTCCCAGCCGCCT...CAA')}, {'id': 'NZ_CP023657.1', 'description': 'NZ_CP023657.1 Acetobacter pomorum strain BDGP5 chromosome, complete genome', 'TonB-dependent receptor': Seq('TGTCCTTTCGGATTATTGCGCCGTCCCGCCCAGTTTTGTCCCGACTGACATACC...TAA'), 'LysR family transcriptional regulator': Seq('TATGCTGACCCAAGGCGACCCGCCATAAAGTCAACAAAAGCTCGGATTTTGGGT...CAT'), 'helix-turn-helix domain-containing protein': Seq('TGTTCGACAGACACACGCCGTCAACGGCGCGCTACCTGCGCACTCATGAAGCCG...TGA'), 'efflu

## check the sequnces lengths for generate the distance matrix

In [11]:
for i in range(len(homogeneus_gene_seq)):
  tonb_seq = homogeneus_gene_seq[i]['TonB-dependent receptor']
  lysr_seq = homogeneus_gene_seq[i]['LysR family transcriptional regulator']
  helix_seq = homogeneus_gene_seq[i]['helix-turn-helix domain-containing protein']
  efflux_seq = homogeneus_gene_seq[i]['efflux transporter outer membrane subunit']
  print(len(tonb_seq),len(lysr_seq),len(helix_seq),len(efflux_seq))

2252 941 2213 1523
2339 887 251 1565
2447 896 269 1436
2414 923 275 1553
3110 926 905 1577
2246 365 494 1484
2420 941 1280 1523
2456 866 230 1529
2135 911 1280 1565
2246 365 494 1484
2405 914 578 1514
2210 893 263 1589
2159 365 494 1484
2159 365 494 1484
2159 365 494 1484
2159 365 494 1484
2228 929 269 1568
2159 365 494 1484
2159 365 494 1484
2159 365 494 1484
2159 365 494 1484


#### generate sequnces as text file for each protein 
#### the format for saving sequence is
#### > Accession ID
#### sequence from start to stop position
#### This will generate four files 

In [7]:
#file1.write(str_.decode())
x = 0
protein_list = ["TonB-dependent receptor", "LysR family transcriptional regulator", "helix-turn-helix domain-containing protein", "efflux transporter outer membrane subunit"]

In [13]:
for j in range(len(protein_list)):
  file1 = open("/content/sample_data/"+protein_list[j]+".txt","w")
  for i in range(len(homogeneus_gene_seq)):
    x +=1
    id = homogeneus_gene_seq[i]['id']
    tonb_seq = homogeneus_gene_seq[i][protein_list[j]]
    print(tonb_seq)
    str_1 = tonb_seq.encode()
    file1.write(">"+id+"\r\n")
    file1.write(str_1.decode()+"\r\n")
  #file1.write("\n")

TAAAACCGCTGCGAAATGCCTGTCAGAATGGTCCGTCTCAGCCCGTACTGAGGCGCGCCGACACCAATGCCTGAGCCACTGCGAAGCATGTAACGGCGATCGAAAAGGTTGGTAACATCCAGCCGCAGTTCAGTCCGTTTCAGGAACGGGGTATGGAACAGGTTGTCAAACGACTGAACCAGTCCAAGGTTGAAAGTGACGTACTGCGGCACGGATGCGCCGTTGGGGATCACTGAATCTTCGCGAAGACCACTGCCATAAACCATTGTTGCTGACAGACGCGTGGGATGATCTGTCTTGCGGAAGGCTGTGTAGCTGCCTCCTGCTGATGCGGTCCAGCGCTGGTCATGATCCACATGCACCCAGTGGTTGCTGATATAGGCAAGGTCGTCAGGCGTCATATTCCATTGGGCGGAAACAATGTCTTTTCCGATGGCGCGTGACCATGCGAAGTTTCCGTAAACAGTGAAGGGACCACGTGAATAGTCGGTGGTCACTTCATAGCCGTTGACCTGACCTCGTTTGTAATTGAACGCGGAGAGGATGATGGGAGACCCGAACTGTCCTTCGTCAATCATGTTGCGCGCCAGTTTGACATAGGCGTCGAAAGATGCGTGCCAGTGTGGCAGAATTTCCTGCGCGAAACCGGCGTCGAAGTAGTGGTCACGCTCAGCCTTGACCATCGTGTTCTGGAAATTCGGTGGCGCACCGGATGTGTTTGCGTATTTATTCAGCTGCGTTCCTCCGACCAGTTCAAACGGAGGCGGTGTGAAATAACGGGAATAACCGGCGTGGAAGGCGCCGCCTCTCCAGGGGTTCCAGACAATGTTGATTCTGGGGCTGACCTGCTTGGAATGTGTGTATTCATCCACACCATCGAAACGCAGGCCGTAATTGATGGTCAGGTTCTTGACGGGACGCCATTCGTCCTGAGCATACAATCCGTAGATCCAGCCGGTGCGGCCCGTGCCATCATGAATGGACTGTGTCTGGCCGTCGA



In [90]:
x

21