In [11]:
import os
import csv
import requests
import numpy as np
from bs4 import BeautifulSoup
from Bio import Entrez, SeqIO

In [2]:
# Accession identifiers (all carry the "NC_" prefix) of the sequences processed
# by this notebook. Each entry is used verbatim as the `id` passed to
# Entrez.efetch (db="nucleotide"), as the `gma` query parameter of the PAIDB
# genome page, and as the name of the per-genome output directory.
# NOTE(review): presumably E. coli chromosomes and plasmids — confirm against
# the source the list was compiled from.
genoms = ['NC_017626', 'NC_017627', 'NC_013364', 'NC_013365', 'NC_013366', 'NC_013361', 'NC_013369', 
          'NC_013353', 'NC_013354', 'NC_018650', 'NC_018654', 'NC_018661', 'NC_018662', 'NC_018658', 
          'NC_018659', 'NC_018666', 'NC_008253', 'NC_011748', 'NC_017631', 'NC_008563', 'NC_009837', 
          'NC_009838', 'NC_020163', 'NC_010468', 'NC_012892', 'NC_012971', 'NC_013941', 'NC_017646', 
          'NC_004431', 'NC_017625', 'NC_017638', 'NC_011601', 'NC_009786', 'NC_009788', 'NC_009790', 
          'NC_009801', 'NC_011353', 'NC_011745', 'NC_002655', 'NC_007414', 'NC_017633', 'NC_012947', 
          'NC_009800', 'NC_011741', 'NC_011750', 'NC_017628', 'NC_007779', 'NC_010473', 'NC_012759', 
          'NC_020518', 'NC_000913', 'NC_016902', 'NC_016904', 'NC_017660', 'NC_011993', 'NC_022364', 
          'NC_017644', 'NC_017634', 'NC_017659', 'NC_017663', 'NC_022370', 'NC_012967', 'NC_017656', 
          'NC_017657', 'NC_011742', 'NC_011747', 'NC_011415', 'NC_011419', 'NC_013654', 'NC_013655', 
          'NC_010488', 'NC_010498', 'NC_002128', 'NC_002695', 'NC_013008', 'NC_013010', 'NC_017630', 
          'NC_017632', 'NC_011739', 'NC_011749', 'NC_011751', 'NC_017639', 'NC_017641', 'NC_017642', 
          'NC_017645', 'NC_007941', 'NC_007946', 'NC_017635', 'NC_017637', 'NC_017664', 'NC_017665', 
          'NC_017906', 'NC_017907', 'NC_017652', 'NC_017651']

# Čuvanje kompletnih sekvenci

In [130]:
# Download every accession in `genoms` from the NCBI nucleotide database and
# store it as ./<accession>/<accession>.fasta.
# NOTE(review): Entrez.email is not set in any visible cell; NCBI asks
# E-utilities clients to identify themselves — confirm it is configured.
for genom in genoms:
    # Create the per-genome directory if it does not exist yet
    dir_path = f"./{genom}"
    os.makedirs(dir_path, exist_ok=True)

    # Fetch the sequence in FASTA format
    handle = Entrez.efetch(db="nucleotide", id=genom, rettype="fasta", retmode="text")
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Close the response handle even when the payload cannot be parsed,
        # so a failed download does not leak an open connection.
        handle.close()

    # Save the parsed record next to the other files for this genome
    SeqIO.write([record], os.path.join(dir_path, f"{genom}.fasta"), "fasta")

print("Sekvence su preuzete i sačuvane.")

Sekvence su preuzete i sačuvane.


In [13]:
# Download every accession in `genoms` from the NCBI nucleotide database as a
# GenBank record and store it as ./<accession>/<accession>.gdb.
# NOTE(review): ".gdb" is kept because other code may rely on these paths;
# the conventional GenBank extensions are ".gb"/".gbk" — confirm before renaming.
for genom in genoms:
    # Create the per-genome directory if it does not exist yet
    dir_path = f"./{genom}"
    os.makedirs(dir_path, exist_ok=True)

    # Fetch the annotated record in GenBank format (not FASTA)
    handle = Entrez.efetch(db="nucleotide", id=genom, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Close the response handle even when the payload cannot be parsed,
        # so a failed download does not leak an open connection.
        handle.close()

    # Save the parsed record next to the other files for this genome
    SeqIO.write([record], os.path.join(dir_path, f"{genom}.gdb"), "genbank")

print("Sekvence su preuzete i sačuvane.")


Sekvence su preuzete i sačuvane.


# Čuvanje patogenih ostrva

In [141]:
# Scrape the PAIDB genome page of every accession in `genoms` and store the
# PAI, cPAI and nPAI tables as <accession>/<label>_<accession>.csv.

# Seconds to wait for PAIDB before giving up on a single genome
_PAIDB_TIMEOUT = 30

# One entry per section of a PAIDB genome page:
# (label used in messages and file names, exact <h4> text, CSV column names).
# The column names are assigned to the first six <td> cells of each table row.
_PAIDB_SECTIONS = (
    (
        "PAI",
        "• PAIs (PAI reported to be located in this genome)",
        ("PAI Number", "Name", "Start", "End", "Size", "Insertion Site"),
    ),
    (
        "cPAI",
        "• cPAIs (PAI-like region overlapping genomic islands (candidate PAI))",
        ("cPAI Number", "Start", "End", "Size", "No. of ORFs", "G+C content"),
    ),
    (
        "nPAI",
        "• nPAIs (PAI-like region not overlapping genomic islands (non-probable PAI))",
        ("nPAI Number", "Start", "End", "Size", "No. of ORFs", "G+C content"),
    ),
)


def _find_section_table(soup, heading_text):
    """Locate a section heading and the table that belongs to it.

    Returns a ``(heading, table)`` pair. ``heading`` is None when no <h4> with
    exactly ``heading_text`` exists. ``table`` is None when the next <table>
    in the document is missing or sits under a different <h4>.
    """
    heading = soup.find('h4', string=heading_text)
    if heading is None:
        return None, None

    table = heading.find_next('table')
    following_heading = heading.find_next('h4')
    # Accept the table only if it belongs to this heading: either no other
    # <h4> follows, or the closest <h4> before the table is this one.
    if table is not None and (following_heading is None or table.find_previous('h4') == heading):
        return heading, table
    return heading, None


def _parse_section_rows(table, fields):
    """Turn the data rows of ``table`` into dicts keyed by ``fields``.

    The first <tr> is skipped as the header row. Rows with fewer <td> cells
    than ``fields`` are skipped instead of raising IndexError.
    """
    records = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < len(fields):
            continue
        records.append({field: cell.text.strip() for field, cell in zip(fields, cells)})
    return records


def _print_record(label, record):
    """Print one record using that record's own values."""
    number_field = f"{label} Number"
    if "Name" in record:
        print(f"{label} {record[number_field]} - {record['Name']}:")
    else:
        print(f"{label} {record[number_field]}")
    for field, value in record.items():
        if field in (number_field, "Name"):
            continue
        # Only the size is printed with a unit
        unit = " bp" if field == "Size" else ""
        print(f"{field}: {value}{unit}")
    print("\n")


def _write_section_csv(csv_path, fields, records):
    """Write ``records`` to ``csv_path`` with a header row.

    The file is rewritten rather than appended to, so re-running the cell does
    not duplicate rows. csv.writer quotes any value that contains a comma.
    """
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, lineterminator="\n")
        writer.writerow(fields)
        writer.writerows([record[field] for field in fields] for record in records)


# Loop over all genomes
for genom in genoms:
    url = f"http://www.paidb.re.kr/genome_pai.php?gma={genom}&m=g"

    try:
        response = requests.get(url, timeout=_PAIDB_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and continue, so one unreachable page does not
        # abort the remaining genomes.
        print(f"Request to PAIDB failed for {genom}: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    for label, heading_text, fields in _PAIDB_SECTIONS:
        heading, table = _find_section_table(soup, heading_text)

        if heading is None:
            print(f"{label}s Not Found")
            continue
        if table is None:
            print(f"Nema dostupnih {label} informacija za ovaj genom.")
            continue

        records = _parse_section_rows(table, fields)
        for record in records:
            _print_record(label, record)

        # Save in CSV format; nothing is written for a table without data rows
        if records:
            os.makedirs(genom, exist_ok=True)
            _write_section_csv(os.path.join(genom, f"{label}_{genom}.csv"), fields, records)


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 233779
End: 275861
Size: 42083 bp
No. of ORFs: 39
G+C content: 50.64 %


cPAI 2
Start: 2272389
End: 2345178
Size: 72790 bp
No. of ORFs: 61
G+C content: 53.02 %


cPAI 3
Start: 3249573
End: 3277026
Size: 27454 bp
No. of ORFs: 33
G+C content: 35.86 %


cPAI 4
Start: 3387125
End: 3408657
Size: 21533 bp
No. of ORFs: 22
G+C content: 47.12 %


cPAI 5
Start: 3429556
End: 3495247
Size: 65692 bp
No. of ORFs: 62
G+C content: 48.4 %


cPAI 6
Start: 4347120
End: 4377124
Size: 30005 bp
No. of ORFs: 43
G+C content: 57.28 %


cPAI 7
Start: 4822309
End: 4852368
Size: 30060 bp
No. of ORFs: 32
G+C content: 49.55 %


cPAI 8
Start: 4881858
End: 4890349
Size: 8492 bp
No. of ORFs: 10
G+C content: 46.89 %


cPAI 9
Start: 4918128
End: 4927380
Size: 9253 bp
No. of ORFs: 14
G+C content: 49.62 %


cPAI 10
Start: 5081722
End: 5109850
Size: 28129 bp
No. of ORFs: 29
G+C content: 44.35 %


cPAI 11
Start: 5118217
End: 5152041
Size: 33825 bp
No. of ORFs: 53
G

Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 18056
End: 30632
Size: 12577 bp
No. of ORFs: 13
G+C content: 46.62 %


Nema dostupnih nPAI informacija za ovaj genom.
Nema dostupnih PAI informacija za ovaj genom.
Nema dostupnih cPAI informacija za ovaj genom.
nPAI 1
Start: 50318
End: 62668
Size: 12351 bp
No. of ORFs: 10
G+C content: 46.51 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 232996
End: 270972
Size: 37977 bp
No. of ORFs: 36
G+C content: 50.49 %


cPAI 2
Start: 294319
End: 370904
Size: 76586 bp
No. of ORFs: 78
G+C content: 46.97 %


cPAI 3
Start: 1968450
End: 2016712
Size: 48263 bp
No. of ORFs: 34
G+C content: 53.64 %


cPAI 4
Start: 2084418
End: 2098998
Size: 14581 bp
No. of ORFs: 22
G+C content: 49.94 %


cPAI 5
Start: 3128084
End: 3232979
Size: 104896 bp
No. of ORFs: 107
G+C content: 47.68 %


cPAI 6
Start: 3947789
End: 4027549
Size: 79761 bp
No. of ORFs: 103
G+C content: 45.93 %


cPAI 7
Start: 4735147
End: 4836913
Size: 101767 bp
No. of ORFs: 1

Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 2258500
End: 2307207
Size: 48708 bp
No. of ORFs: 33
G+C content: 53.87 %


cPAI 2
Start: 2318741
End: 2338469
Size: 19729 bp
No. of ORFs: 24
G+C content: 49.52 %


cPAI 3
Start: 2782365
End: 2794223
Size: 11859 bp
No. of ORFs: 7
G+C content: 43.76 %


cPAI 4
Start: 3339796
End: 3368066
Size: 28271 bp
No. of ORFs: 29
G+C content: 36.42 %


cPAI 5
Start: 3487188
End: 3586655
Size: 99468 bp
No. of ORFs: 97
G+C content: 48.34 %


cPAI 6
Start: 4303475
End: 4361459
Size: 57985 bp
No. of ORFs: 66
G+C content: 47.97 %


cPAI 7
Start: 4955741
End: 4982216
Size: 26476 bp
No. of ORFs: 24
G+C content: 46.81 %


cPAI 8
Start: 5129543
End: 5148134
Size: 18592 bp
No. of ORFs: 18
G+C content: 51.36 %


nPAI 1
Start: 1373564
End: 1386528
Size: 12965 bp
No. of ORFs: 13
G+C content: 51.59 %


nPAI 2
Start: 2586013
End: 2601642
Size: 15630 bp
No. of ORFs: 17
G+C content: 52.36 %


PAI 2 - PAI II CFT073:
Start: 4913367
End: 4971660
Size: 58294 bp

Nema dostupnih PAI informacija za ovaj genom.
Nema dostupnih cPAI informacija za ovaj genom.
nPAI 1
Start: 37557
End: 46431
Size: 8875 bp
No. of ORFs: 9
G+C content: 39.53 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 232196
End: 243511
Size: 11316 bp
No. of ORFs: 17
G+C content: 49.38 %


cPAI 2
Start: 2219283
End: 2313941
Size: 94659 bp
No. of ORFs: 83
G+C content: 52.7 %


cPAI 3
Start: 3283466
End: 3301855
Size: 18390 bp
No. of ORFs: 27
G+C content: 34.81 %


cPAI 4
Start: 3444844
End: 3522022
Size: 77179 bp
No. of ORFs: 68
G+C content: 51.79 %


cPAI 5
Start: 4213864
End: 4227278
Size: 13415 bp
No. of ORFs: 11
G+C content: 45.26 %


cPAI 6
Start: 4244675
End: 4264572
Size: 19898 bp
No. of ORFs: 25
G+C content: 47.97 %


cPAI 7
Start: 4817876
End: 4856836
Size: 38961 bp
No. of ORFs: 34
G+C content: 50.03 %


nPAI 1
Start: 2507428
End: 2523241
Size: 15814 bp
No. of ORFs: 16
G+C content: 52.19 %


nPAI 2
Start: 3372963
End: 3384429
Size: 11467 bp
No. of ORFs: 10
G+C

Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 229399
End: 263882
Size: 34484 bp
No. of ORFs: 33
G+C content: 51.75 %


cPAI 2
Start: 694507
End: 706074
Size: 11568 bp
No. of ORFs: 16
G+C content: 45.44 %


cPAI 3
Start: 1798892
End: 1815714
Size: 16823 bp
No. of ORFs: 25
G+C content: 34.93 %


cPAI 4
Start: 4888127
End: 4899631
Size: 11505 bp
No. of ORFs: 14
G+C content: 46.48 %


nPAI 1
Start: 1644658
End: 1680494
Size: 35837 bp
No. of ORFs: 31
G+C content: 52.66 %


nPAI 2
Start: 2490526
End: 2506158
Size: 15633 bp
No. of ORFs: 18
G+C content: 52.23 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 232641
End: 275935
Size: 43295 bp
No. of ORFs: 40
G+C content: 49.79 %


cPAI 2
Start: 299507
End: 308014
Size: 8508 bp
No. of ORFs: 10
G+C content: 43.66 %


cPAI 3
Start: 2006592
End: 2051874
Size: 45283 bp
No. of ORFs: 28
G+C content: 54.18 %


cPAI 4
Start: 2298564
End: 2312547
Size: 13984 bp
No. of ORFs: 19
G+C content: 52.82 %


cPAI 5
Start: 3110179
End: 

Nema dostupnih PAI informacija za ovaj genom.
Nema dostupnih cPAI informacija za ovaj genom.
nPAI 1
Start: 83773
End: 93442
Size: 9670 bp
No. of ORFs: 10
G+C content: 46.5 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 1929861
End: 1979717
Size: 49857 bp
No. of ORFs: 34
G+C content: 53.75 %


cPAI 2
Start: 1991007
End: 2029778
Size: 38772 bp
No. of ORFs: 37
G+C content: 51.08 %


cPAI 3
Start: 3025585
End: 3081081
Size: 55497 bp
No. of ORFs: 45
G+C content: 48.04 %


cPAI 4
Start: 4554772
End: 4593927
Size: 39156 bp
No. of ORFs: 43
G+C content: 49.91 %


nPAI 1
Start: 1216784
End: 1226267
Size: 9484 bp
No. of ORFs: 9
G+C content: 52.14 %


nPAI 2
Start: 2231225
End: 2245216
Size: 13992 bp
No. of ORFs: 18
G+C content: 52.79 %


Nema dostupnih PAI informacija za ovaj genom.
Nema dostupnih cPAI informacija za ovaj genom.
nPAI 1
Start: 78342
End: 91765
Size: 13424 bp
No. of ORFs: 16
G+C content: 52.75 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 117847
E

Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 229399
End: 263882
Size: 34484 bp
No. of ORFs: 32
G+C content: 51.76 %


cPAI 2
Start: 3177637
End: 3196106
Size: 18470 bp
No. of ORFs: 27
G+C content: 34.86 %


cPAI 3
Start: 4050584
End: 4062151
Size: 11568 bp
No. of ORFs: 17
G+C content: 45.44 %


cPAI 4
Start: 4767283
End: 4778787
Size: 11505 bp
No. of ORFs: 14
G+C content: 46.48 %


nPAI 1
Start: 2450021
End: 2465653
Size: 15633 bp
No. of ORFs: 16
G+C content: 52.23 %


nPAI 2
Start: 3311517
End: 3347353
Size: 35837 bp
No. of ORFs: 31
G+C content: 52.67 %


Nema dostupnih PAI informacija za ovaj genom.
Nema dostupnih cPAI informacija za ovaj genom.
nPAI 1
Start: 48306
End: 57962
Size: 9657 bp
No. of ORFs: 10
G+C content: 46.74 %


Nema dostupnih PAI informacija za ovaj genom.
cPAI 1
Start: 229399
End: 263882
Size: 34484 bp
No. of ORFs: 33
G+C content: 51.76 %


cPAI 2
Start: 3174216
End: 3189700
Size: 15485 bp
No. of ORFs: 25
G+C content: 34.14 %


cPAI 3
Start: 4047065
E