In [None]:
import ray
import numpy as np
import pandas as pd
import joblib
import boto3

from collections import Counter
from ray.util.joblib import register_ray
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

from cloud_data_cockpit import DataCockpit

In [None]:
# Ray instantiation
# ignore_reinit_error=True keeps this cell re-runnable: without it, executing
# the cell a second time in the same kernel raises because Ray is already
# initialized.
ray.init(ignore_reinit_error=True)
# Register Ray as a joblib backend so joblib-based code can run on Ray.
register_ray()

In [None]:
# Replace by your ID
# The ID becomes part of an S3 bucket name in the next cell, so surrounding
# whitespace is removed and an empty value is rejected here rather than
# surfacing later as an invalid-bucket-name error.
my_id = input("Enter your ID: ").strip()
if not my_id:
    raise ValueError("An ID is required to name your S3 bucket.")
print("Your ID is", my_id)

In [None]:
# Push random FASTA file to S3
s3 = boto3.Session().client('s3')
bucket_name = f"scipy-tutorial-{my_id}"

# Outside us-east-1, S3 requires the target region to be passed explicitly as
# a LocationConstraint; in us-east-1 the constraint must be omitted.
region = s3.meta.region_name
create_kwargs = {"Bucket": bucket_name}
if region and region != "us-east-1":
    create_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}

try:
    s3.create_bucket(**create_kwargs)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # The cell was run before: reuse the bucket created by the earlier run.
    pass

# Server-side copy of the shared sample file into the personal bucket.
s3.copy_object(
    CopySource={'Bucket': "scipy-tutorial-data", 'Key': 'random_100mb.fasta'},
    Bucket=bucket_name,
    Key='random_100mb.fasta'
)

## Loading and Partitioning FASTA Sequences with DataCockpit

In this cell, we will initialize the data loader and prepare a FASTA file for distributed processing with Ray. You should:

1. **Specify a FASTA file path**  
   - Point to the FASTA file you want to process (e.g. `sequences.fasta`).  

2. **Define the number of _chunks_**  
   - Choose how many partitions (_chunks_) the sequence data should be split into.  
   - Proper chunking allows Ray to balance the workload across workers.

3. **Partition the FASTA file**  
   - Use DataCockpit to read and split the file into the defined number of _chunks_.  

4. **Run the rest of the notebook with Ray**  
   - After partitioning, Ray will manage parallel sequence processing.  
   - Ensure your Ray cluster is initialized before executing downstream analysis.


In [None]:
# Instantiate the DataCockpit loader; the data slices processed later in the
# notebook are obtained from this object.
# NOTE(review): presumably this is where the FASTA file and the number of
# chunks are selected — confirm against the cloud_data_cockpit documentation.
data_loader = DataCockpit()

In [None]:
# Fetch the data slices from the loader. Each slice is later handed to its own
# Ray task, which reads the slice's raw bytes through its .get() method.
slices = data_loader.get_data_slices()

In [None]:
# Ray remote function definition
@ray.remote
def process_fasta_slice(slice_id, data_slice):
    """Parse one slice of a FASTA file and compute per-record composition stats.

    Args:
        slice_id: Position of the slice in the partitioned input. It is not
            used by the computation itself.
        data_slice: Object whose ``get()`` method returns the slice content as
            UTF-8 encoded bytes.

    Returns:
        list[dict]: One dict per record that has a header and a non-empty
        sequence, with the keys ``"header"``, ``"species"``, ``"length"``,
        ``"gc_percent"`` and one key per letter of ``ACDEFGHIKLMNPQRSTVWY``
        holding that letter's relative frequency in the sequence.

    Notes:
        Sequence lines that appear before the first ``>`` header in the slice
        are dropped, and records with an empty sequence are skipped.
        Residues are counted case-insensitively.
    """
    residue_alphabet = "ACDEFGHIKLMNPQRSTVWY"
    text = data_slice.get().decode("utf-8")
    records = []
    header = None
    seq_parts = []

    def flush():
        """Append the stats row for the record accumulated so far, if any."""
        # No header yet: the accumulated lines do not belong to a record
        # that starts inside this slice.
        if header is None:
            return
        # FASTA permits lower-case residue codes; normalise the case so that
        # the G/C and per-letter counts below do not silently miss them.
        seq = "".join(seq_parts).upper()
        length = len(seq)
        # Skip empty sequences (also avoids dividing by zero below).
        if length == 0:
            return
        # Percentage of characters that are G or C.
        gc_percent = (seq.count("G") + seq.count("C")) / length * 100
        # Relative frequency of each letter of the alphabet; letters that are
        # absent from the sequence get 0.
        aa_counts = Counter(seq)
        aa_freq = {aa: aa_counts[aa] / length for aa in residue_alphabet}
        # Species is the text after the first underscore, or the whole header
        # when it contains no underscore.
        species = header.split("_", 1)[1] if "_" in header else header
        records.append({
            "header": header,
            "species": species,
            "length": length,
            "gc_percent": gc_percent,
            **aa_freq,
        })

    for line in text.splitlines():
        if line.startswith(">"):
            # A new header closes the previous record.
            flush()
            header = line[1:].strip()
            seq_parts = []
        else:
            seq_parts.append(line.strip())
    # Flush the last record, which has no following header to trigger it.
    flush()
    return records

In [None]:
# Call for ray parallel computation
# One remote task per slice; the enumerate index is passed as slice_id.
futures = [
    process_fasta_slice.remote(i, sl)
    for i, sl in enumerate(slices)
]

# Block until every task has finished, then flatten the per-slice record lists
# into a single list of records.
nested = ray.get(futures)
flat = [rec for part in nested for rec in part]

# Without this guard an empty result produces a column-less DataFrame and the
# groupby below fails with an opaque KeyError: 'species'.
if not flat:
    raise ValueError("No FASTA records were parsed from the selected slices.")

df = pd.DataFrame(flat)
# Mean of every numeric column per species, rounded to three decimals.
grouped = df.groupby("species").mean(numeric_only=True).round(3)

print("Average composition by species (length, gc_percent, A…Y):")
print(grouped)

In [None]:
# Finish the environment: shut down the Ray runtime that was started with
# ray.init() near the top of the notebook.
ray.shutdown()

In [None]:
# Finding the optimal chunk size (feature still in development)
def benchmark(slices):
    """Fetch every slice once, in order, discarding the data; returns None."""
    for data_slice in slices:
        data_slice.get()

# Create a new loader with the benchmark function attached; this rebinds the
# data_loader name used earlier in the notebook.
# NOTE(review): how DataCockpit uses benchmarking_fn is not visible here —
# presumably it times candidate chunk sizes with it; confirm.
data_loader = DataCockpit(benchmarking_fn=benchmark)