#### This is to obtain a list of genes that were tested by Feng et al. 2020,
#### and that were from the organisms of interest, such as Homo sapiens
#### All the processes below were conducted on 12/29-30/2021
#### Results files were downloaded from Feng et al. 2020 pmid: 33026978 on 12/28/21

### 1. Setup

In [2]:
%load_ext autoreload
%autoreload

#### **General modules**

In [4]:
import os
import numpy as np
import pandas as pd
import importlib
from selenium import webdriver
import requests
from bs4 import BeautifulSoup

#### **Custom module(s)**

In [2]:
import My_WebScrape_Functions as ws

In [5]:
# Reload the custom scraping module so the already-imported `ws` is refreshed
importlib.reload(ws)

<module 'My_WebScrape_Functions' from '/Users/ShokenLEE/Desktop/Codes/git/AH-search-in-NEproteins/My_WebScrape_Functions.py'>

#### **Global variables**

In [3]:
# Base addresses of the external services
url_ncbi = 'https://www.ncbi.nlm.nih.gov/'
url_ncbi_gene = f'{url_ncbi}gene'
url_up = 'https://www.uniprot.org/'

# Organism identifier
org = 'homo_sapiens'

# Working folder (presumably where the downloaded result files live)
directory = (
    '/Users/ShokenLEE/Desktop/DATA/Bioinformatics/'
    'MemBrain AH prediction/Results from the paper'
)

#### **Go to the directory**

In [4]:
# Make `directory` the working directory so later relative paths resolve against it
os.chdir(directory)

In [5]:
# Show the current working directory to confirm the change took effect
os.getcwd()

'/Users/ShokenLEE/Desktop/DATA/Bioinformatics/MemBrain AH prediction/Results from the paper'

### 2. Run to make CSV lists from the result text files from the paper

In [6]:
# List the result files in the current working directory.
# os.listdir() returns entries in arbitrary order and includes every entry in
# the folder (hidden files, CSVs written here, ...). The cells below pick their
# batches by position (files[a:b]), so keep only the '*.result' files and sort
# them to make the batches reproducible across runs.
files = sorted(name for name in os.listdir() if name.endswith('.result'))
len(files)

11760

#### Below is for 0 to 2499

In [26]:
# Summarise the result files files[:2500]: one row per protein whose UniProt
# lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[:2500]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results one level up, like the other batches, so the output does
# not land inside the folder whose contents are being listed and processed
df.to_csv('../Result_first_2500.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_2500.csv')

0 A0A023PXD5
1 A0A023PXD9
2 A0A023PXF8
3 A0A023PXG7
4 A0A023PXH4
5 A0A023PXJ3
6 A0A023PXK2
7 A0A023PXL1
8 A0A023PYC6
9 A0A023PYD7
10 A0A023PYE4
11 A0A023PYG1
12 A0A023PYI5
13 A0A023PYJ0
14 A0A023PYK2
15 A0A023PZ99
16 A0A023PZD0
17 A0A023PZE6
18 A0A023PZF9
19 A0A023PZG0
20 A0A023PZH4
21 A0A023PZJ3
22 A0A024FA41
23 A0A060A682
24 A0A084AFH0
25 A0A084B9Z2
26 A0A084B9Z5
27 A0A084R1J1
28 A0A087WTH5
29 A0A089FNE5
30 A0A096LP01
31 A0A097ZPE6
32 A0A098DDI4
33 A0A0A1GNF2
34 A0A0A2IBP6
35 A0A0B4K753
36 A0A0B5A051
37 A0A0B5EMG9
38 A0A0D1E6B3
39 A0A0D3MU35
40 A0A0D9SF12
41 A0A0E3D8L0
42 A0A0E3D8M2
43 A0A0E3D8N1
44 A0A0E3NFS5
45 A0A0F6B506
46 A0A0G2KQY6
47 A0A0H2XEK8
48 A0A0H2XG66
49 A0A0H2ZH12
50 A0A0H2ZHZ4
51 A0A0H2ZLQ1
52 A0A0H3AJF5
53 A0A0H3GGY3
54 A0A0H3GM48
55 A0A0H3JML2
56 A0A0H3JTK0
57 A0A0H3M5A8
58 A0A0H3M5L9
59 A0A0H3MD02
60 A0A0H3MGR4
61 A0A0K1IRS6
62 A0A0M3STV6
63 A0A0M3STX4
64 A0A0M4KRN2
65 A0A0N9HKQ7
66 A0A0P1ATX0
67 A0A0R4IGV4
68 A0A0U1LQF6
69 A0A0U1LR74
70 A0A0U1LSP7
71 A0A0U1RQ45
72

#### Below is for 2500 to 4999

In [9]:
# Summarise the result files files[2500:5000]: one row per protein whose
# UniProt lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[2500:5000]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results
df.to_csv('../Result_2500_to_4999.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_2500_to_4999.csv')

0 O45306
1 O45435
2 O45731
3 O45870
4 O46101
5 O46383
6 O46512
7 O46598
8 O47476
9 O47478
10 O47492
11 O47493
12 O48472
13 O48528
14 O48529
15 O49668
16 O49730
17 O49814
18 O49931
19 O50500
20 O50501
21 O51039
22 O51042
23 O51051
24 O51055
25 O51058
26 O51063
27 O51067
28 O51068
29 O51073
30 O51083
31 O51090
32 O51100
33 O51144
34 O51145
35 O51253
36 O51266
37 O51280
38 O51356
39 O51451
40 O51595
41 O51597
42 O51750
43 O51876
44 O51880
45 O52043
46 O52044
47 O52069
48 O52214
49 O52216
50 O52351
51 O52619
52 O53176
53 O53368
54 O53505
55 O53508
56 O53857
57 O53945
58 O54101
59 O54189
60 O54569
61 O54701
62 O54885
63 O54901
64 O54912
65 O54980
66 O54990
67 O55034
68 O55242
69 O55246
70 O55247
71 O55653
72 O55654
73 O55703
74 O55704
75 O55705
76 O55708
77 O55714
78 O55715
79 O55728
80 O55733
81 O55744
82 O55746
83 O55747
84 O55762
85 O57211
86 O57254
87 O57295
88 O57604
89 O57713
90 O57781
91 O57953
92 O58728
93 O58759
94 O58967
95 O58968
96 O58981
97 O59179
98 O59276
99 O59708
100 O59712

#### Below is for 5000 to 6999

In [13]:
# Summarise the result files files[5000:7000]: one row per protein whose
# UniProt lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[5000:7000]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results
df.to_csv('../Result_5000_to_6999.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_5000_to_6999.csv')

0 P52205
1 P52219
2 P52237
3 P52366
4 P52443
5 P52445
6 P52465
7 P52543
8 P52587
9 P52636
10 P52638
11 P52768
12 P52880
13 P52881
14 P52883
15 P52887
16 P52923
17 P53012
18 P53039
19 P53045
20 P53047
21 P53062
22 P53069
23 P53074
24 P53087
25 P53089
26 P53093
27 P53099
28 P53106
29 P53108
30 P53113
31 P53116
32 P53117
33 P53121
34 P53134
35 P53151
36 P53160
37 P53161
38 P53181
39 P53182
40 P53190
41 P53209
42 P53214
43 P53217
44 P53223
45 P53224
46 P53226
47 P53229
48 P53239
49 P53245
50 P53247
51 P53253
52 P53259
53 P53262
54 P53266
55 P53268
56 P53269
57 P53279
58 P53282
59 P53285
60 P53288
61 P53293
62 P53306
63 P53308
64 P53310
65 P53311
66 P53322
67 P53325
68 P53337
69 P53339
70 P53389
71 P53394
72 P53425
73 P53426
74 P53432
75 P53507
76 P53541
77 P53584
78 P53633
79 P53660
80 P53694
81 P53711
82 P53717
83 P53721
84 P53723
85 P53726
86 P53730
87 P53735
88 P53747
89 P53752
90 P53801
91 P53825
92 P53832
93 P53838
94 P53842
95 P53845
96 P53856
97 P53862
98 P53878
99 P53880
100 P53884

#### Below is for 7000 to 8999

In [7]:
# Summarise the result files files[7000:9000]: one row per protein whose
# UniProt lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[7000:9000]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results
df.to_csv('../Result_7000_to_8999.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_7000_to_8999.csv')

0 Q25BH2
1 Q25BH5
2 Q25BH8
3 Q25BI2
4 Q25BI5
5 Q26261
6 Q26614
7 Q26896
8 Q27002
9 Q27003
10 Q27324
11 Q27367
12 Q27591
13 Q27963
14 Q27977
15 Q28173
16 Q28270
17 Q28487
18 Q28602
19 Q28612
20 Q28BP2
21 Q28C41
22 Q28CE7
23 Q28DI5
24 Q28FY5
25 Q28H54
26 Q28H62
27 Q28HF8
28 Q28IU1
29 Q28L40
30 Q28UC5
31 Q28UC6
32 Q29175
33 Q29441
34 Q29626
35 Q296J9
36 Q297K8
37 Q29980
38 Q29DG0
39 Q29RT8
40 Q2AB83
41 Q2EEM0
42 Q2EES1
43 Q2EMW0
44 Q2FDU4
45 Q2FFH9
46 Q2FH37
47 Q2FH55
48 Q2FI55
49 Q2FIP5
50 Q2FJ60
51 Q2FK46
52 Q2FK59
53 Q2FLB8
54 Q2FW71
55 Q2FZ95
56 Q2G026
57 Q2G1N0
58 Q2G991
59 Q2G9D3
60 Q2G9D4
61 Q2GD32
62 Q2GSI6
63 Q2GYB7
64 Q2H047
65 Q2HA14
66 Q2HAR0
67 Q2HCW8
68 Q2HDJ0
69 Q2HIM5
70 Q2HIW2
71 Q2HJ59
72 Q2HJ63
73 Q2HJ69
74 Q2HJ95
75 Q2HJA8
76 Q2HJB9
77 Q2HQL6
78 Q2HRD4
79 Q2HRD5
80 Q2HWK7
81 Q2HZ96
82 Q2I0M4
83 Q2I3H0
84 Q2IAL6
85 Q2IE76
86 Q2IWG4
87 Q2J6B3
88 Q2J8A4
89 Q2JAC3
90 Q2JAL9
91 Q2JI37
92 Q2JJ68
93 Q2JJG7
94 Q2JJZ7
95 Q2JLE8
96 Q2JN39
97 Q2JPK2
98 Q2JQR6
99 Q2JS37
100 Q2JWU5

#### Below is for 9000 to 9999

In [9]:
# Summarise the result files files[9000:10000]: one row per protein whose
# UniProt lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[9000:10000]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results
df.to_csv('../Result_9000_to_9999.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_9000_to_9999.csv')

0 Q6FER0
1 Q6FFS6
2 Q6FJT9
3 Q6FL09
4 Q6FLC9
5 Q6FLM2
6 Q6FMZ2
7 Q6FN06
8 Q6FNE2
9 Q6FNJ6
10 Q6FNP6
11 Q6FNQ2
12 Q6FP25
13 Q6FQ03
14 Q6FQ42
15 Q6FQ45
16 Q6FR11
17 Q6FRR4
18 Q6FRT5
19 Q6FRX4
20 Q6FS06
21 Q6FS52
22 Q6FSZ7
23 Q6FTM9
24 Q6FTY5
25 Q6FU40
26 Q6FU42
27 Q6FV75
28 Q6FVS6
29 Q6FWD4
30 Q6FX62
31 Q6FX96
32 Q6FXH5
33 Q6FXJ3
34 Q6FYW3
35 Q6FYW5
36 Q6FYW8
37 Q6G1A8
38 Q6G2B2
39 Q6G6I7
40 Q6GA81
41 Q6GDZ1
42 Q6GHQ1
43 Q6GMF8
44 Q6GNM0
45 Q6GPH6
46 Q6GTX8
47 Q6GV17
48 Q6GV23
49 Q6GZQ6
50 Q6GZS3
51 Q6GZT3
52 Q6GZU2
53 Q6GZV5
54 Q6GZW4
55 Q6GZW5
56 Q6GZX1
57 Q6GZX3
58 Q6H1V1
59 Q6H3X3
60 Q6H6R9
61 Q6H7J6
62 Q6H7U3
63 Q6HHY7
64 Q6HP15
65 Q6ICI0
66 Q6ICY4
67 Q6IED8
68 Q6IMP4
69 Q6IQ85
70 Q6IQC7
71 Q6IUY1
72 Q6IWY1
73 Q6J163
74 Q6J9G1
75 Q6JWV8
76 Q6KZG0
77 Q6L147
78 Q6L4D2
79 Q6LP61
80 Q6LWM4
81 Q6LXA4
82 Q6LYT4
83 Q6LZU5
84 Q6M8W7
85 Q6MAK1
86 Q6MAK2
87 Q6MAK3
88 Q6MC92
89 Q6MDI5
90 Q6MGM3
91 Q6MGN5
92 Q6ML26
93 Q6ML54
94 Q6MN25
95 Q6MR11
96 Q6MUC0
97 Q6MWE5
98 Q6N8Y1
99 Q6N9G6
100 Q6NE75

#### Below is for 10000 to last

In [7]:
# Summarise the result files files[10000:]: one row per protein whose
# UniProt lookup succeeded, plus a separate list of the IDs whose lookup failed.
columns = ['Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
           'AA_sequence', 'Prediction']
rows = []        # one dict per successfully processed file
row_index = []   # position of each row within the batch (gaps = failed lookups)
not_found = []   # IDs for which the lookup raised

for i, file in enumerate(files[10000:]):

    # the UniProt ID is the file name without '.result'
    uniprot_id = file.replace('.result', '')

    # get the organism and protein name;
    # on failure, record the ID as not found and go to the next file.
    # `except Exception` instead of a bare `except` so that a keyboard /
    # kernel interrupt still stops this long-running loop.
    try:
        organism_name, protein_name = ws.searchForUniprotID_getProtein_Organism_Name(uniprot_id)
    except Exception:
        not_found.append(uniprot_id)
        continue

    # read the result file
    with open(file) as f:
        lines = f.readlines()

    # 2nd line: amino-acid sequence, 4th line: prediction
    # (values keep their trailing newline, as before)
    aa_sequence = lines[1]
    prediction = lines[3]

    # any '1' in the prediction line marks the protein as containing an AH
    ah_or_not = 'AH' if '1' in prediction else 'Non-AH'

    rows.append({
        'Uniprot_ID': uniprot_id,
        'Organism': organism_name,
        'Protein_name': protein_name,
        'AH_or_Not': ah_or_not,
        'AA_sequence': aa_sequence,
        'Prediction': prediction,
    })
    row_index.append(i)

    # log
    print(i, uniprot_id)

# Build the table once instead of growing it cell by cell with df.loc
df = pd.DataFrame(rows, index=row_index, columns=columns)

# save the results
df.to_csv('../Result_10000_to_last.csv')
df_not_found = pd.DataFrame(not_found, columns=['Uniprot_ID'])
df_not_found.to_csv('../Result_ID_not_found_10000_to_last.csv')

0 Q8N4K4
1 Q8N4S7
2 Q8N5G0
3 Q8N614
4 Q8N7C4
5 Q8N7S6
6 Q8N7X8
7 Q8N816
8 Q8N8F6
9 Q8N8F7
10 Q8N8V8
11 Q8N8Z6
12 Q8N9F0
13 Q8N9I5
14 Q8N9R8
15 Q8N9X5
16 Q8NA29
17 Q8NAC3
18 Q8NAN2
19 Q8NBP5
20 Q8NBR0
21 Q8NBS3
22 Q8NC44
23 Q8NCG7
24 Q8NCK7
25 Q8NCQ3
26 Q8NCS4
27 Q8NDB6
28 Q8NDY8
29 Q8NEA5
30 Q8NEQ5
31 Q8NET5
32 Q8NEW7
33 Q8NFN8
34 Q8NFR9
35 Q8NFT2
36 Q8NG04
37 Q8NH55
38 Q8NHP6
39 Q8NHS1
40 Q8NI17
41 Q8NI28
42 Q8NKE2
43 Q8NKW5
44 Q8NLB7
45 Q8NLK2
46 Q8NLN8
47 Q8NLQ9
48 Q8NLR0
49 Q8NN75
50 Q8NP09
51 Q8NQ73
52 Q8NQE4
53 Q8NSD6
54 Q8NTW4
55 Q8P8Z6
56 Q8PCJ7
57 Q8PCK6
58 Q8QHJ9
59 Q8QL14
60 Q8QL22
61 Q8QL30
62 Q8QL32
63 Q8QL45
64 Q8QL47
65 Q8R0C3
66 Q8R0M8
67 Q8R115
68 Q8R2H3
69 Q8R2Y2
70 Q8R373
71 Q8R404
72 Q8R411
73 Q8R412
74 Q8R4E1
75 Q8R4I7
76 Q8R4T9
77 Q8R664
78 Q8R666
79 Q8R6A3
80 Q8R6L9
81 Q8RFZ3
82 Q8RQL4
83 Q8RUS5
84 Q8RVC7
85 Q8RWG3
86 Q8RWL5
87 Q8RWM7
88 Q8RWN2
89 Q8RWQ5
90 Q8RWS1
91 Q8RWW0
92 Q8RX29
93 Q8RXE8
94 Q8RXF8
95 Q8RXL8
96 Q8RXP6
97 Q8RXS1
98 Q8RXW0
99 Q8RY60
100 Q8RY67

### 3. Concatenate the CSVs

In [22]:
# Folder with the CSV files to merge (presumably the per-batch CSVs written
# in step 2, collected here by hand - confirm)
directory = '/Users/ShokenLEE/Desktop/DATA/Bioinformatics/MemBrain AH prediction/All results_original'

# Read every '.csv' file in the folder and stack them row-wise.
# The frames are concatenated once at the end rather than inside the loop,
# which avoids re-copying the growing table for every file and avoids
# seeding the concatenation with an empty DataFrame.
# NOTE: files are read in os.listdir() order (not sorted) on purpose; the
# row order of `df` determines the order of `organisms` used further below.
filepaths = os.listdir(directory)
frames = [
    pd.read_csv(os.path.join(directory, filepath))
    for filepath in filepaths
    if filepath.endswith('.csv')
]
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

##### Check the shape of the concatenated CSV

In [23]:
# (rows, columns) of the concatenated table
df.shape

(11760, 7)

##### Check the columns

In [24]:
# Column labels of the concatenated table
df.columns

Index(['Unnamed: 0', 'Uniprot_ID', 'Organism', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction'],
      dtype='object')

##### Drop the unnecessary column and fill in null values

In [27]:
# Remove the 'Unnamed: 0' column, then replace every missing value
# with the literal string 'N/A'
df = df.drop(columns=['Unnamed: 0'])
df = df.fillna('N/A')

In [44]:
# (rows, columns) after dropping the extra column
df.shape

(11760, 6)

#### Export to a CSV

In [28]:
# Write the merged table to the current working directory, without the index column
df.to_csv('./Result_ALL.csv', index=False)

### 4. Sort the data based on the organisms

In [36]:
# Distinct values of the 'Organism' column, in order of first appearance,
# as a plain Python list
organisms = df['Organism'].unique().tolist()
len(organisms)

1516

##### Export the organisms

In [32]:
# Save the organism names as a one-column table (index included)
df_org = pd.DataFrame({'Organism': organisms})
df_org.to_csv('../Organisms.csv')

#### Select the organisms based on manual scanning of the list above

In [43]:
# Organisms kept for the downstream analysis, chosen by manually scanning the
# exported organism list. They are spelled out by name rather than picked by
# position, because the order of `organisms` follows the row order of `df`,
# which in turn depends on the order in which the CSV files were listed.
organisms_of_interest = [
    'Homo sapiens (Human)',
    'Danio rerio (Zebrafish) (Brachydanio rerio)',
    'Mus musculus (Mouse)',
    'Rattus norvegicus (Rat)',
    'Arabidopsis thaliana (Mouse-ear cress)',
    'Drosophila melanogaster (Fruit fly)',
    'Caenorhabditis elegans',
    'Gallus gallus (Chicken)',
    'Bos taurus (Bovine)',
]
organisms_of_interest

['Homo sapiens (Human)',
 'Danio rerio (Zebrafish) (Brachydanio rerio)',
 'Mus musculus (Mouse)',
 'Rattus norvegicus (Rat)',
 'Arabidopsis thaliana (Mouse-ear cress)',
 'Drosophila melanogaster (Fruit fly)',
 'Caenorhabditis elegans',
 'Gallus gallus (Chicken)',
 'Bos taurus (Bovine)']

In [60]:
# Keep only the rows whose 'Organism' is one of the selected organisms
df_selected_organisms = df.loc[df['Organism'].isin(organisms_of_interest)]

#### Export

In [61]:
# Write the filtered table one directory up, without the index column
df_selected_organisms.to_csv('../Result_Selected_Organisms.csv', index=False)