In [6]:
! pip list

Package                            Version
---------------------------------- -------------------
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 2.0.3
anaconda-project                   0.9.1
anyio                              2.2.0
appdirs                            1.4.4
applaunchservices                  0.2.1
appnope                            0.1.2
appscript                          1.1.2
argh                               0.26.2
argon2-cffi                        20.1.0
asn1crypto                         1.4.0
astroid                            2.5
astropy                            4.2.1
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              21.4.0
autograd                           1.6.2
autopep8                           1.5.6
Babel                              2.9.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.4

In [1]:
! pip install -r requirements.txt



In [1]:
from sklearn.datasets import make_classification
import time
from Bio import Entrez
from bioinformatics_toolbox import DNASequence, RNASequence, AminoAcidSequence
from bio_files_processor import OpenFasta
from custom_random_forest import RandomForestClassifierCustom


40.0


## RandomForestClassifierCustom

In [13]:
# Synthetic classification data used for the timing cells.
# A fixed seed makes the generated samples identical on every run.
X, y = make_classification(n_samples=1_000_000, random_state=42)

In [14]:
# Single forest instance reused by the fit/predict timing cells.
random_forest = RandomForestClassifierCustom(
    max_depth=20,
    n_estimators=50,
    max_features=2,
    random_state=42,
)

In [15]:
%%time
fit_1_job = random_forest.fit(X, y, n_jobs = 1)

CPU times: user 346 ms, sys: 900 ms, total: 1.25 s
Wall time: 5min 54s


In [16]:
%%time
fit_2_jobs = random_forest.fit(X, y, n_jobs = 2)

CPU times: user 964 ms, sys: 2.82 s, total: 3.79 s
Wall time: 3min 28s


In [17]:
%%time
# Predict on the training matrix X with n_jobs=1; the result is compared
# against the n_jobs=2 predictions in a later cell.
predict_1_job = random_forest.predict(X, n_jobs=1)

CPU times: user 1.3 s, sys: 2.69 s, total: 3.99 s
Wall time: 15.3 s


In [18]:
%%time
# Same prediction call as the n_jobs=1 cell, but with n_jobs=2.
predict_2_jobs = random_forest.predict(X, n_jobs=2)

CPU times: user 1.58 s, sys: 3.76 s, total: 5.35 s
Wall time: 10.9 s


In [19]:
# Check that the n_jobs=1 and n_jobs=2 predictions agree element by element.
# The explicit length check is needed because zip() stops at the shorter input
# and would otherwise hide a size mismatch. The loop names also avoid reusing
# `y`, which already holds the labels.
result = len(predict_1_job) == len(predict_2_jobs) and all(
    single == parallel
    for single, parallel in zip(predict_1_job, predict_2_jobs)
)
print(result)

True


## AminoAcidSequence/DNASequence/RNASequence

##### DNASequence / RNASequence: GC-content

In [20]:
# Build one DNA and one RNA sequence, then report the GC-content of each.
dna_sequence = DNASequence("ATTTTCGC", "DNA")
rna_sequence = RNASequence("AUCGUCUC", "RNA")

dna_gc = dna_sequence.gc_content()
print(f"DNA sequence GC-content: {dna_gc}")

rna_gc = rna_sequence.gc_content()
print(f"RNA sequence GC-content: {rna_gc}")

DNA sequence GC-content: 37.5
RNA sequence GC-content: 50.0


##### RNASequence

In [23]:
# Translate a 21-nucleotide RNA sequence. The printed result below has seven
# symbols, the last one being "*" for the trailing UAA codon.
rna_sequence = RNASequence("AUGAAAUGGCGCCCCACCUAA", "RNA")
protein = rna_sequence.translation()
print("Protein sequence:", protein)

Protein sequence: MKWRPT*


##### AminoAcidSequence

In [25]:
# Compute the hydrophobicity of a short peptide; the value returned by
# compute_hydrophobicity() is printed followed by a "%" sign.
amino_acid_sequence = AminoAcidSequence("ACGCGIILKLV", "protein")
hydrophobicity = amino_acid_sequence.compute_hydrophobicity()
print(f"Amino Acid hydrophobicity: {hydrophobicity}%")

Amino Acid hydrophobicity: 54.545%


## OpenFasta

In [5]:
# Read and print five records, one read_record() call per record.
with OpenFasta('data/test.fasta') as fasta_file:
    for record_number in range(5):
        print(fasta_file.read_record())



0123 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATACTTTGAAAGCACCGCATCCCGGACTTCTTTGAAAGCACCGCATCCCTGAAAGCACCGCATCTTTGAAAGCACCGCATCCCCCCGTCCGATC...

4567 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
TTGGCTTCTTAGAGGGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTTTGGCTTCTTAGAGGGATGCCCTTAGATGTTCTGGGCCGCACGCGC...

1111 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
TTGAAGAGCTGTAGCTTAACCTTCGGGAGGGCGAGTCGTAAGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCGCTGACGAGTGGG...

8910 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
TTGAAGAGTTTGATAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGGGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCGAAGT...

1112 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-)
CAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCC...



In [6]:
# Read everything at once with read_records(); the output below shows a list
# containing the same five records that the previous cell printed one by one.
with OpenFasta('data/test.fasta') as fasta_file:
    records = fasta_file.read_records()
    print(records)

[0123 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATACTTTGAAAGCACCGCATCCCGGACTTCTTTGAAAGCACCGCATCCCTGAAAGCACCGCATCTTTGAAAGCACCGCATCCCCCCGTCCGATC...
, 4567 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
TTGGCTTCTTAGAGGGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTTTGGCTTCTTAGAGGGATGCCCTTAGATGTTCTGGGCCGCACGCGC...
, 1111 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
TTGAAGAGCTGTAGCTTAACCTTCGGGAGGGCGAGTCGTAAGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCGCTGACGAGTGGG...
, 8910 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
TTGAAGAGTTTGATAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGGGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCGAAGT...
, 1112 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-)
CAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCC...
]


## Tests

In [2]:
# Run the pytest test suite as a module from inside the notebook.
%run -m pytest

platform darwin -- Python 3.8.8, pytest-8.2.0, pluggy-1.5.0
rootdir: /Users/nikitazherko/Desktop/BI/learning/Python/class_18
plugins: anyio-2.2.0
collected 8 items

test_bioinformatics_toolbox.py ........                                  [100%]

