search.py
"""This file contains the search classes for the search feature.
"""
import numpy as np
import os
import re
import requests
import sys
import time
import urllib
import pandas as pd
import xml.etree.ElementTree as Et
from json import JSONDecodeError
from tqdm import tqdm
from .exceptions import IncorrectFieldException
from .exceptions import MissingQueryException
from .utils import scientific_name_to_taxid
from .utils import requests_3_retries
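# Number of seconds to wait for a response to a search request before timing out.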
SEARCH_REQUEST_TIMEOUT = 20
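# Number of SRA UIDs fetched per efetch request in SraSearch.search().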
SRA_SEARCH_GROUP_SIZE = 300
class QuerySearch:
"""This is the base class for the search feature.
This class takes as input the user's search query, which has been
tokenized by the ArgParser. The query will be sent to either SRA or ENA
depending on the user's input, and the results will be returned as a
pandas dataframe.
Attributes
----------
self.df: Pandas DataFrame
The search result belonging to this search instance
Parameters
----------
verbosity : integer
        The level of detail of the search result.
return_max : int
The maximum number of entries to be returned.
query : str
The main query string.
accession : str
A relevant study / experiment / sample / run accession number.
organism : str
Scientific name of the sample organism
layout : str
Library layout. Possible inputs: single, paired
mbases : int
Size of the sample of interest rounded to the nearest megabase.
publication_date : str
The publication date of the run in the format dd-mm-yyyy. If a
date range is desired, input should be in the format of
dd-mm-yyyy:dd-mm-yyyy
platform : str
Sequencing platform used for the run. Some possible inputs include:
illumina, ion torrent, oxford nanopore
selection : str
Library selection. Some possible inputs: cdna, chip, dnase, pcr
source : str
Library source. Some possible inputs: genomic, metagenomic,
transcriptomic
strategy : str
Library Preparation strategy. Some possible inputs: wgs, amplicon,
rna seq
title : str
Title of the experiment associated with the run
suppress_validation: bool
Defaults to False. If this is set to True, the user input format
checks will be skipped.
Setting this to True may cause the program to behave in unexpected
        ways, but allows the user to search queries that do not pass the
format check.
Methods
-------
get_df()
Returns the dataframe storing this search result.
search()
Executes the search.
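    Examples
    --------
    A minimal, illustrative sketch. QuerySearch itself does not query any
    database; a subclass such as SraSearch (defined below) is used, and
    search() performs a live request:
    >>> instance = SraSearch(verbosity=2, return_max=20, query="covid")
    >>> instance.search()  # doctest: +SKIP
    >>> instance.show_result_statistics()  # doctest: +SKIP
    >>> df = instance.get_df()  # doctest: +SKIP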
"""
def __init__(
self,
verbosity=2,
return_max=20,
query=None,
accession=None,
organism=None,
layout=None,
mbases=None,
publication_date=None,
platform=None,
selection=None,
source=None,
strategy=None,
title=None,
suppress_validation=False,
):
try:
int_verbosity = int(verbosity)
if int_verbosity not in range(4):
raise ValueError
except (TypeError, ValueError):
raise IncorrectFieldException(
f"Incorrect verbosity format: {verbosity}\n"
"Verbosity must be an integer between 0 to 3 inclusive."
)
try:
int_return_max = int(return_max)
if int_return_max <= 0:
raise ValueError
except (TypeError, ValueError):
raise IncorrectFieldException(
f"Incorrect return_max format: {return_max}\n"
"return_max must be a positive integer."
)
self.verbosity = int_verbosity
self.return_max = int_return_max
self.fields = {
"query": query,
"accession": accession,
"organism": organism,
"layout": layout,
"mbases": mbases,
"publication_date": publication_date,
"platform": platform,
"selection": selection,
"source": source,
"strategy": strategy,
"title": title,
}
for k in self.fields:
            if isinstance(self.fields[k], list):
self.fields[k] = " ".join(self.fields[k])
self.df = pd.DataFrame()
# Verify that not all query fields are empty
if not any(self.fields.values()):
raise MissingQueryException()
if not suppress_validation:
self._validate_fields()
self.stats = {
"study": "-",
"experiment": "-",
"run": "-",
"sample": "-",
"Date range": "-",
"Organisms": "-",
"Library strategy": "-",
"Library source": "-",
"Library selection": "-",
"Library layout": "-",
"Platform": "-",
"count_mean": "-",
"count_median": "-",
"count_stdev": "-",
}
self.plot_objects = {}
def _input_multi_regex_checker(self, regex_matcher, input_query, error_message):
"""Checks if the user input match exactly 1 of the possible regex.
This is a helper method for _validate_fields. It takes as input a
dictionary of regex expression : accepted string by API pairs, and
an input string, and verifies that the input string matches exactly
one of the regex expressions.
Matching multiple expressions indicates the input string is
ambiguous, while matching none of the expressions suggests the
input will likely produce no results or an error.
Once matched, this method formats the user input so that it
can be accepted by the API to be queried, as ENA especially expect
case-sensitive exact matches for search queries.
Parameters
----------
regex_matcher : dict
dictionary of regex expression : accepted string by API pairs.
input_query : str
input string for a particular query field.
error_message : str
error message to be shown if input_query does not match any of
the regex expressions in regex_matcher.
Returns
-------
tuple
tuple pair of (input_query, message).
message is "" if no format error has been identified, error
message otherwise.
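        Examples
        --------
        An illustrative, simplified matcher (not one of the full tables
        defined in _validate_fields):
        >>> qs = QuerySearch(query="covid", suppress_validation=True)
        >>> matcher = {".*illumina.*": "ILLUMINA"}
        >>> qs._input_multi_regex_checker(matcher, "Illumina HiSeq", "error")
        ('ILLUMINA', '')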
"""
matched_strings = []
for regex_expression in regex_matcher:
if re.match(regex_expression, input_query, re.IGNORECASE):
matched_strings.append(regex_matcher[regex_expression])
if not matched_strings:
return input_query, error_message
elif len(matched_strings) == 1:
return matched_strings[0], ""
else:
message = (
f"Multiple potential matches have been identified for {input_query}:\n"
f"{matched_strings}\n"
f"Please check your input.\n\n"
)
return input_query, message
def _validate_fields(self):
"""Verifies that user input format is correct.
This helper function tries to match the input query strings from
the user to the exact string that is accepted by both SRA and ENA
Note: as of the implementation of this method, ENA does not have
a documentation page listing the accepted values for query
        parameters. The list of parameters below was collected from ENA's
advanced search page: https://www.ebi.ac.uk/ena/browser/advanced-search
Updating new values:
        If any new values are accepted by ENA, they should appear under
the corresponding parameter in the page. To update this method,
think of a regex that captures what the user may type for the new
value, and include it in the respective xxx_matcher dictionary
below as a regex:value key value pair.
Eg: if a new sequencing platform, Pokemon, is added to ENA,
navigate to "Instrument Platform" parameter on ENA's advanced
search page and copy the corresponding phrase (eg "poKe_Mon").
Then add the key value pair ".*poke.*": "poKe_Mon" to
platform_matcher below.
        Unlike SRA, ENA requires supplied param values to be exact matches
to filter accordingly (eg "cDNA_oligo_dT"), which motivated this
feature.
Raises
------
IncorrectFieldException
If the input to any query field is in the wrong format
"""
message = ""
# verify layout
if self.fields["layout"] and str(self.fields["layout"]).upper() not in [
"SINGLE",
"PAIRED",
]:
message += (
f"Incorrect layout field format: {self.fields['layout']}\n"
"--layout must be either SINGLE or PAIRED\n\n"
)
# verify mbases
if self.fields["mbases"]:
try:
self.fields["mbases"] = int(self.fields["mbases"])
if self.fields["mbases"] <= 0:
raise ValueError
except (ValueError, TypeError):
message += (
f"Incorrect mbases format: {self.fields['mbases']}\n"
f"--mbases must be a positive integer\n\n"
)
# verify publication_date
date_regex = "(0[1-9]|[12][0-9]|3[01])-(0[1-9]|1[012])-(19|20)[0-9]{2}"
if self.fields["publication_date"] and not re.match(
f"^{date_regex}(:{date_regex})?$", self.fields["publication_date"]
):
message += (
f"Incorrect publication date format: {self.fields['publication_date']}\n"
f"Expected --publication-date format: dd-mm-yyyy or dd-mm-yyyy:dd-mm-yyyy, between 1900-2099\n\n"
)
# verify platform
platform_matcher = {
".*oxford.*|.*nanopore.*": "OXFORD_NANOPORE",
".*illumina.*": "ILLUMINA",
".*ion.*torrent.*": "ION_TORRENT",
".*capillary.*": "CAPILLARY",
".*pacbio.*|.*smrt.*": "PACBIO_SMRT",
".*abi.*solid.*": "ABI_SOLID",
".*bgi.*": "BGISEQ",
".*454.*": "LS454",
".*complete.*genomics.*": "COMPLETE_GENOMICS",
".*helicos.*": "HELICOS",
}
if self.fields["platform"]:
error_message = (
f"Incorrect platform: {self.fields['platform']}\n"
f"--platform must be one of the following: \n"
f"OXFORD_NANOPORE, ILLUMINA, ION_TORRENT, \n"
f"CAPILLARY, PACBIO_SMRT, ABI_SOLID, \n"
f"BGISEQ, LS454, COMPLETE_GENOMICS, HELICOS\n\n"
)
output = self._input_multi_regex_checker(
platform_matcher, self.fields["platform"], error_message
)
if output[1]:
message += output[1]
else:
self.fields["platform"] = output[0]
# verify selection
selection_matcher = {
".*methylcytidine.*": "5-methylcytidine antibody",
".*cage.*": "CAGE",
r".*chip\s*$": "ChIP",
".*chip.*seq.*": "ChIP-Seq",
".*dnase.*": "DNase",
".*hmpr.*": "HMPR",
".*hybrid.*": "Hybrid Selection",
r".*inverse.*rrna\s*$": "Inverse rRNA",
".*inverse.*rrna.*selection.*": "Inverse rRNA selection",
".*mbd2.*protein.*methyl.*cpg.*binding.*domain.*": "MBD2 protein methyl-CpG binding domain",
".*mda.*": "MDA",
".*mf.*": "MF",
".*mnase.*": "MNase",
".*msll.*": "MSLL",
r"^\s*oligo.*dt.*": "Oligo-dT",
r"^\s*pcr\s*$": "PCR",
".*poly[ -_]*a.*": "PolyA",
".*race.*": "RACE",
r".*random\s*$": "RANDOM",
".*random.*pcr.*": "RANDOM PCR",
".*rt[ -_]*pcr.*": "RT-PCR",
".*reduced.*representation.*": "Reduced Representation",
".*restriction.*digest.*": "Restriction Digest",
r".*cdna\s*$": "cDNA",
".*cdna.*oligo.*dt": "cDNA_oligo_dT.*", # ENA only
".*cdna.*random.*priming": "cDNA_randomPriming.*", # ENA only
".*other.*": "other",
".*padlock.*probes.*capture.*method.*": "padlock probes capture method",
".*repeat.*fractionation.*": "repeat fractionation",
".*size.*fractionation.*": "size fractionation",
".*unspecified.*": "unspecified",
}
if self.fields["selection"]:
error_message = (
f"Incorrect selection: {self.fields['selection']}\n"
f"--selection must be one of the following: \n"
f"5-methylcytidine antibody, CAGE, ChIP, ChIP-Seq, DNase, HMPR, Hybrid Selection, \n"
f"Inverse rRNA, Inverse rRNA selection, MBD2 protein methyl-CpG binding domain, \n"
f"MDA, MF, MNase, MSLL, Oligo-dT, PCR, PolyA, RACE, RANDOM, RANDOM PCR, RT-PCR, \n"
f"Reduced Representation, Restriction Digest, cDNA, cDNA_oligo_dT, cDNA_randomPriming \n"
f"other, padlock probes capture method, repeat fractionation, size fractionation, \n"
f"unspecified\n\n"
)
output = self._input_multi_regex_checker(
selection_matcher, self.fields["selection"], error_message
)
if output[1]:
message += output[1]
else:
self.fields["selection"] = output[0]
# verify source
source_matcher = {
r"^\s*genomic\s*$": "GENOMIC",
".*genomic.*single.*cell.*": "GENOMIC SINGLE CELL",
".*metagenomic.*": "METAGENOMIC",
".*metatranscriptomic.*": "METATRANSCRIPTOMIC",
".*other.*": "OTHER",
".*synthetic.*": "SYNTHETIC",
r"^\s*transcriptomic\s*$": "TRANSCRIPTOMIC",
".*transcriptomic.*single.*cell.*": "TRANSCRIPTOMIC SINGLE CELL",
".*viral.*rna.*": "VIRAL RNA",
}
if self.fields["source"]:
error_message = (
f"Incorrect source: {self.fields['source']}\n"
f"--source must be one of the following: \n"
f"GENOMIC, GENOMIC SINGLE CELL, METAGENOMIC, \n"
f"METATRANSCRIPTOMIC, OTHER, SYNTHETIC, \n"
f"TRANSCRIPTOMIC, TRANSCRIPTOMIC SINGLE CELL, VIRAL RNA\n\n"
)
output = self._input_multi_regex_checker(
source_matcher, self.fields["source"], error_message
)
if output[1]:
message += output[1]
else:
self.fields["source"] = output[0]
# verify strategy
strategy_matcher = {
".*amplicon.*": "AMPLICON",
".*atac.*": "ATAC-seq",
".*bisulfite.*": "Bisulfite-Seq",
r"^\s*clone\s*$": "CLONE",
".*cloneend.*": "CLONEEND",
".*cts.*": "CTS",
".*chia.*|.*pet.*": "ChIA-PET",
".*chip.*seq.*": "ChIP-Seq",
".*dnase.*|.*hypersensitivity.*": "DNase-Hypersensitivity",
r"^\s*est\s*$": "EST",
".*faire.*": "FAIRE-seq",
".*finishing.*": "FINISHING",
".*fl.*cdna.*": "FL-cDNA",
".*hi.*c.*": "Hi-C",
".*mbd.*": "MBD-Seq",
".*mnase.*": "MNase-Seq",
".*mre.*": "MRE-Seq",
".*medip.*": "MeDIP-Seq",
".*other.*": "OTHER",
".*poolclone.*": "POOLCLONE",
".*rad.*": "RAD-Seq",
".*rip.*": "RIP-Seq",
r"^\s*rna.*seq": "RNA-Seq",
".*selex.*": "SELEX",
".*synthetic.*|.*long.*read.*": "Synthetic-Long-Read",
".*targeted.*capture.*": "Targeted-Capture",
".*tethered.*chromatin.*conformation.*capture.*|.*tccc.*": "Tethered Chromatin Conformation Capture",
".*tn.*": "Tn-Seq",
".*validation.*": "VALIDATION",
".*wcs.*": "WCS",
".*wga.*": "WGA",
".*wgs.*": "WGS",
".*wxs.*": "WXS",
".*mirna.*": "miRNA-Seq",
".*ncrna.*": "ncRNA-Seq",
".*ssrna.*": "ssRNA-seq",
".*gbs.*": "GBS",
}
if self.fields["strategy"]:
error_message = (
f"Incorrect strategy: {self.fields['strategy']}\n"
f"--strategy must be one of the following: \n"
f"AMPLICON, ATAC-seq, Bisulfite-Seq, CLONE, CLONEEND, CTS, ChIA-PET, ChIP-Seq, \n"
f"DNase-Hypersensitivity, EST, FAIRE-seq, FINISHING, FL-cDNA, Hi-C, MBD-Seq, MNase-Seq,\n"
f"MRE-Seq, MeDIP-Seq, OTHER, POOLCLONE, RAD-Seq, RIP-Seq, RNA-Seq, SELEX, \n"
f"Synthetic-Long-Read, Targeted-Capture, Tethered Chromatin Conformation Capture, \n"
f"Tn-Seq, VALIDATION, WCS, WGA, WGS, WXS, miRNA-Seq, ncRNA-Seq, ssRNA-seq, GBS\n\n"
)
output = self._input_multi_regex_checker(
strategy_matcher, self.fields["strategy"], error_message
)
if output[1]:
message += output[1]
else:
self.fields["strategy"] = output[0]
if message:
raise IncorrectFieldException(message)
def _list_stat(self, stat_header):
stat = self.stats[stat_header]
        if not isinstance(stat, dict):
return f" {stat_header}: {stat}\n"
keys = sorted(stat.keys())
stat_breakdown = "\n"
for key in keys:
stat_breakdown += f"\t {key}: {stat[key]}\n"
return f" {stat_header}: {stat_breakdown}\n"
def show_result_statistics(self):
"""Shows search result statistics."""
if self.df.empty:
print(
"No results are found for the current search query, hence no statistics can be generated."
)
return
stats = (
"\n Statistics for the search query:\n"
+ " =================================\n"
+ f" Number of unique studies: {self.stats['study']}\n"
+ f" Number of unique experiments: {self.stats['experiment']}\n"
+ f" Number of unique runs: {self.stats['run']}\n"
+ f" Number of unique samples: {self.stats['sample']}\n"
+ f" Mean base count of samples: {self.stats['count_mean']:.3f}\n"
+ f" Median base count of samples: {self.stats['count_median']:.3f}\n"
+ f" Sample base count standard deviation: {self.stats['count_stdev']:.3f}\n"
)
# Statistics with categorical breakdowns:
categorical_stats = (
"Date range",
"Organisms",
"Platform",
"Library strategy",
"Library source",
"Library selection",
"Library layout",
)
for categorical_stat in categorical_stats:
stats += self._list_stat(categorical_stat)
print(stats)
def visualise_results(
self, graph_types=("all",), show=False, saveto="./search_plots/"
):
"""Generate graphs that visualise the search results.
This method will only work if the optional dependency, matplotlib,
is installed in the system.
Parameters
----------
graph_types : tuple
tuple containing strings representing types of graphs to
generate.
Possible strings:
all, daterange, organism, source, selection, platform,
basecount
saveto : str
directory name where the generated graphs are saved.
show : bool
Whether plotted graphs are immediately shown.
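        Examples
        --------
        A minimal sketch, assuming a populated search instance qs (the
        graph type shown is illustrative):
        >>> qs.visualise_results(graph_types=("organism",))  # doctest: +SKIP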
"""
if self.df.empty:
print(
"No results are found for the current search query, hence no graphs can be generated."
)
return
try:
import matplotlib.pyplot as plt
except ImportError:
print(
"The optional dependency, matplotlib, is not available on the system.\n"
"matplotlib is required to generate graphs to visualise search results.\n"
'You can install matplotlib by typing "pip install matplotlib" on the command line.\n'
)
return
plt.rcParams["figure.autolayout"] = True
if not os.path.isdir(saveto):
os.mkdir(saveto)
plots = [
("Base Count",),
("Publication Date",),
("Organism",),
("Library Source",),
("Library Selection",),
("Platform",),
("Organism", "Publication Date"),
("Library Source", "Publication Date"),
("Library Selection", "Publication Date"),
("Platform", "Publication Date"),
("Library Source", "Organism"),
("Library Selection", "Organism"),
("Platform", "Organism"),
("Library Selection", "Library Source"),
("Platform", "Library Source"),
("Platform", "Library Selection"),
]
plot_keys = {
"daterange": "Publication Date",
"organism": "Organism",
"source": "Library Source",
"selection": "Library Selection",
"platform": "Platform",
"basecount": "Base Count",
}
if "all" not in graph_types:
selected_plots = []
for graph_type in graph_types:
if graph_type not in plot_keys:
continue
for plot in plots:
if plot_keys[graph_type] in plot and plot not in selected_plots:
selected_plots.append(plot)
plots = selected_plots
too_many_organisms = False
if self.stats["graph_raw"]["Organism"].nunique() > 30:
print(
"Too many types of organisms to plot (>30). Showing only top 30 organisms."
)
too_many_organisms = True
for plot in plots:
self._plot_graph(plt, plot, show, saveto, too_many_organisms)
def search(self):
pass
def get_df(self):
"""Getter for the search result dataframe."""
return self.df
def get_plot_objects(self):
"""Get the plot objects for plots generated."""
return self.plot_objects
def _plot_graph(self, plt, axes, show, savedir, too_many_organisms):
"""Plots a graph based on data from self.stats
Parameters
----------
axes: tuple
tuple containing 1 or 2 strings corresponding to the statistics
            to plot. One string: histogram. Two strings: heat map.
savedir: str
directory to save to
show: bool
whether to call plt.show
"""
timestamp = time.strftime("%Y-%m-%d %H-%M-%S")
if (
"Publication Date" in axes
and self.stats["graph_raw"]["Publication Date"].nunique() > 30
):
self.stats["graph_raw"]["Publication Date"] = self.stats["graph_raw"][
"Publication Date"
].str[:-3]
if axes == ("Base Count",):
count = list(self.stats["count_data"])
if len(count) == 0:
return
title = "Histogram of Base Count"
plt.figure(figsize=(15, 10))
plt.hist(count, min(70, len(count)), color="#135c1c", log=True)
plt.xlabel("Base Count", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.xticks(rotation=90)
plt.title(title, fontsize=18)
plt.savefig(f"{savedir}{title} {timestamp}.svg")
self.plot_objects[axes] = plt
elif len(axes) == 1:
title = f"Histogram of {axes[0]}"
data = self.stats["graph_raw"][axes[0]].value_counts()
if too_many_organisms:
data = data[:30]
plt.figure(figsize=(15, 10))
plt.bar(
range(len(data.values)),
data.values,
tick_label=list(data.index),
color="#135c1c",
)
plt.xticks(rotation=90)
plt.title(title, fontsize=18)
plt.xlabel(axes[0], fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.savefig(f"{savedir}{title} {timestamp}.svg")
self.plot_objects[axes] = plt
elif len(axes) == 2:
title = f"Heatmap of {axes[0]} against {axes[1]}"
df = self.stats["graph_raw"][list(axes)]
a = df.groupby([axes[0]]).agg({i: "value_counts" for i in df.columns[1:]})
a = a.rename(columns={axes[1]: f"{axes[1]}_count"})
b = a.reset_index(level=list(axes))
piv = (
pd.pivot_table(
b,
values=f"{axes[1]}_count",
index=[axes[0]],
columns=[axes[1]],
fill_value=0,
aggfunc="sum",
margins=True,
)
.sort_values("All", ascending=False)
.drop("All", axis=1)
.sort_values("All", ascending=False, axis=1)
.drop("All")
)
if too_many_organisms:
if axes[0] == "Organism":
piv = piv[:30]
else:
piv = piv.iloc[:, :30]
fig, ax = plt.subplots(figsize=(15, 12))
im = ax.imshow(piv, cmap="Greens")
fig.colorbar(im, ax=ax)
ax.set_title(title, fontsize=18)
ax.set_xticks(range(len(piv.columns)))
ax.set_yticks(range(len(piv.index)))
ax.set_xticklabels(piv.columns, rotation=90)
ax.set_yticklabels(piv.index)
ax.set_ylabel(axes[0], fontsize=14)
ax.set_xlabel(axes[1], fontsize=14)
ax.get_figure().savefig(f"{savedir}{title} {timestamp}.svg")
self.plot_objects[axes] = (fig, ax)
if show:
plt.show()
class SraSearch(QuerySearch):
"""Subclass of QuerySearch that implements search by querying
NCBI Entrez API
Methods
-------
search()
sends the user query via requests to NCBI Entrez API and returns
search results as a pandas dataframe.
_format_query_string()
formats the input user query into a string
_format_request()
formats the request payload
_format_result(content)
formats the search query output.
See Also
--------
QuerySearch: Superclass of SraSearch
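    Examples
    --------
    An illustrative sketch (the query value is arbitrary and search()
    performs a live request to NCBI):
    >>> sra_search = SraSearch(verbosity=3, return_max=10, query="ribosome")
    >>> sra_search.search()  # doctest: +SKIP
    >>> sra_search.get_df().head()  # doctest: +SKIP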
"""
def __init__(
self,
verbosity=2,
return_max=20,
query=None,
accession=None,
organism=None,
layout=None,
mbases=None,
publication_date=None,
platform=None,
selection=None,
source=None,
strategy=None,
title=None,
suppress_validation=False,
):
super().__init__(
verbosity,
return_max,
query,
accession,
organism,
layout,
mbases,
publication_date,
platform,
selection,
source,
strategy,
title,
suppress_validation,
)
self.entries = {}
self.number_entries = 0
def search(self):
# Step 1: retrieves the list of uids that satisfies the input
# search query
payload = self._format_request()
try:
r = requests_3_retries().get(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
params=payload,
timeout=SEARCH_REQUEST_TIMEOUT,
)
r.raise_for_status()
uids = r.json()["esearchresult"]["idlist"]
# Step 2: retrieves the detailed information for each uid
# returned, in groups of SRA_SEARCH_GROUP_SIZE.
if not uids:
print(
f"No results found for the following search query: \n {self.fields}"
)
return # If no queries found, return nothing
pbar = tqdm(total=len(uids))
for i in range(0, len(uids), SRA_SEARCH_GROUP_SIZE):
current_uids = ",".join(
uids[i : min(i + SRA_SEARCH_GROUP_SIZE, len(uids))]
)
pbar.update(min(SRA_SEARCH_GROUP_SIZE, len(uids) - i))
payload2 = {"db": "sra", "retmode": "xml", "id": current_uids}
r = requests_3_retries().get(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
params=payload2,
timeout=SEARCH_REQUEST_TIMEOUT,
stream=True,
)
r.raise_for_status()
r.raw.decode_content = True
self._format_response(r.raw)
pbar.close()
self._format_result()
if self.verbosity >= 2:
self.df["pmid"] = list(uids)
except requests.exceptions.Timeout:
sys.exit(f"Connection to the server has timed out. Please retry.")
except requests.exceptions.HTTPError:
sys.exit(
f"HTTPError: This is likely caused by an invalid search query: "
f"\nURL queried: {r.url} \nUser query: {self.fields}"
)
def _format_query_string(self):
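        # Joins each populated field into a single Entrez search term,
        # e.g. (illustrative): 'covid AND PAIRED[Layout] AND 100[Mbases]'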
term = ""
if self.fields["query"]:
term += self.fields["query"] + " AND "
if self.fields["accession"]:
term += self.fields["accession"] + "[Accession] AND "
if self.fields["organism"]:
term += self.fields["organism"] + "[Organism] AND "
if self.fields["layout"]:
term += self.fields["layout"] + "[Layout] AND "
if self.fields["mbases"]:
term += str(self.fields["mbases"]) + "[Mbases] AND "
if self.fields["publication_date"]:
term += self.fields["publication_date"].replace("-", "/") + "[PDAT] AND "
if self.fields["platform"]:
term += self.fields["platform"] + "[Platform] AND "
if self.fields["selection"]:
term += self.fields["selection"] + "[Selection] AND "
if self.fields["source"]:
term += self.fields["source"] + "[Source] AND "
if self.fields["strategy"]:
term += self.fields["strategy"] + "[Strategy] AND "
if self.fields["title"]:
term += self.fields["title"] + "[Title] AND "
return term[:-5] # Removing trailing " AND "
def _format_request(self):
payload = {
"db": "sra",
"term": self._format_query_string(),
"retmode": "json",
"retmax": self.return_max,
}
return payload
def _format_response(self, content):
field_categories = [
"EXPERIMENT",
"SUBMISSION",
"ORGANISATION",
"STUDY",
"SAMPLE",
"Pool",
"RUN_SET",
]
for event, elem in Et.iterparse(content):
if elem.tag == "EXPERIMENT_PACKAGE":
self.number_entries += 1
elif elem.tag in field_categories:
self._parse_entry(elem)
for field in self.entries:
if len(self.entries[field]) < self.number_entries:
self.entries[field] += [""] * (
self.number_entries - len(self.entries[field])
)
def _format_result(self):
self.df = pd.DataFrame.from_dict(self.entries).replace(
r"^\s*$", "N/A", regex=True
)
self.entries.clear()
if self.df.empty:
return
# Tabulate statistics
self._update_stats()
columns = list(self.df.columns)
important_columns = [
"study_accession",
"experiment_accession",
"experiment_title",
"design_description",
"sample_taxon_id",
"sample_scientific_name",
"experiment_library_strategy",
"experiment_library_source",
"experiment_library_selection",
"sample_accession",
"sample_alias",
"experiment_instrument_model",
"pool_member_spots",
"run_1_size",
"run_1_accession",
"run_1_total_spots",
"run_1_total_bases",
]
temp_cols = []
for col in important_columns:
if col in columns:
temp_cols.append(col)
columns.remove(col)
important_columns = temp_cols
if self.verbosity <= 1:
temp_dfs = []
for col in self.df.columns:
if re.match("run_[0-9]+_accession", col):
temp_df = self.df[[col, "experiment_title"]]
temp_df = temp_df[temp_df[col] != "N/A"].rename(
columns={
col: "run_accession",
"experiment_title": "experiment_title",
}
)
temp_dfs.append(temp_df)
run_dataframe = pd.concat(temp_dfs)
            run_dataframe = run_dataframe.sort_values(
                by=["run_accession"], kind="mergesort"
            )
self.df = run_dataframe
if self.verbosity == 0:
self.df = self.df[["run_accession"]]
elif self.verbosity == 1:
pass # df has already been formatted above
elif self.verbosity == 2:
self.df = self.df[important_columns]
elif self.verbosity == 3:
self.df = self.df[important_columns + sorted(columns)]
        self.df = self.df.dropna(how="all")
def _parse_entry(self, entry_root):
"""Parses a subset of the XML tree from request stream
Parameters
----------
entry_root: ElementTree.Element
            root element of the XML tree from the requests stream
"""
field_header = entry_root.tag.lower()
run_count = 0
# root element attributes
for k, v in entry_root.attrib.items():
self._update_entry(f"{field_header}_{k}".lower(), v)
for child in entry_root:
# "*_REF" tags contain duplicate information that can be found
# somewhere in the xml entry and are skipped
if child.tag.endswith("_REF"):
continue
# IDENTIFIERS contain two types of children tags:
# PRIMARY_ID, which repeats the accession number and
# EXTERNAL_ID tags, each containing an alternative ID that is
# typically used in another database (GEO, ENA etc)
# IDs are numbered from 1 to differentiate between them.
elif child.tag == "IDENTIFIERS":
id_index = 1
for identifier in child:
if identifier.tag == "EXTERNAL_ID":
self._update_entry(
f"{field_header}_external_id_{id_index}",
identifier.text,
)
self._update_entry(
f"{field_header}_external_id_{id_index}_namespace",
identifier.get("namespace"),
)
# "*_LINKS" tags contain 0 or more "*_LINK" children tags,
# each containing information (values) regarding the link
# Links are numbered from 1 to differentiate between multiple
# links.
elif child.tag.endswith("_LINKS"):
link_index = 1
for link in child:
# Link type. Eg: URL_link, Xref_link
self._update_entry(
f"{link.tag}_{link_index}_type".lower(),
link[0].tag,
)
# Link values in the form of tag: value.
# Eg: label: GEO sample
link_value_index = 1
for link_value in link[0]:
self._update_entry(
f"{link.tag}_{link_index}_value_{link_value_index}".lower(),
f"{link_value.tag}: {link_value.text}",
)
link_value_index += 1
link_index += 1
# "*_ATTRIBUTES" tags contain tag - value pairs providing
# additional information for the Experiment/Sample/Study
# Attributes are numbered from 1 to differentiate between
# multiple attributes.
elif child.tag.endswith("_ATTRIBUTES"):
attribute_index = 1
for attribute in child:
for val in attribute:
self._update_entry(
f"{child.tag}_{attribute_index}_{val.tag}".lower(),
val.text,
)
attribute_index += 1
# Differentiating between sample title and experiment title.
elif child.tag == "TITLE":
self._update_entry(f"{field_header}_title", child.text)
            # Parsing platform information
elif child.tag == "PLATFORM":
platform = child[0]
self._update_entry("experiment_platform", platform.tag)
self._update_entry(
"experiment_instrument_model",
platform[0].text,
)
# Parsing individual run information
elif child.tag == "RUN":
run_count += 1
# run attributes
for k, v in child.attrib.items():
self._update_entry(f"run_{run_count}_{k}".lower(), v)
for elem in child:
if elem.tag == "SRAFiles":
srafile_index = 1
for srafile in elem:
for k, v in srafile.attrib.items():
self._update_entry(
f"run_{run_count}_srafile_{srafile_index}_{k}".lower(),
v,
)
alternatives_index = 1
for alternatives in srafile: