# Step 1: RNA STAR
Here we perform sequence alignments on the input FASTQ files. This produces `.sam` files.

# Step 2: sambamba
Here we convert the `.sam` files to `.bam` files and sort them.

In [8]:
!# password.txt is a text file with your password.
!# If this gives an error (probably because you don't have sudo permissions),
!# try running apt install -y sambamba instead.
!sudo -S apt install -y sambamba < password.txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libllvm11 libphobos2-ldc-shared98
The following NEW packages will be installed:
  libllvm11 libphobos2-ldc-shared98 sambamba
0 upgraded, 3 newly installed, 0 to remove and 1 not upgraded.
Need to get 21.8 MB of archives.
After this operation, 92.9 MB of additional disk space will be used.
Get:1 http://mirrors.ocf.berkeley.edu/ubuntu jammy/universe amd64 libllvm11 amd64 1:11.1.0-6 [19.6 MB]
Get:2 http://mirrors.ocf.berkeley.edu/ubuntu jammy/universe amd64 libphobos2-ldc-shared98 amd64 1:1.28.0-1ubuntu1 [1,487 kB]
Get:3 http://mirrors.ocf.berkeley.edu/ubuntu jammy/universe amd64 sambamba amd64 0.8.2+dfsg-2 [695 kB]
Fetched 21.8 MB in 1s (21.4 MB/s)m[33m

7[0;23r8[1ASelecting previously unselected package libllvm11:amd64.
(Reading database ... 414490 files and directories currently installed.)Reading database ... 65%(Reading database 

In [2]:
!counter=1

!for sam_file in ./sam_files/*.sam; do
!    # Process each .bam file
!    echo "Processing $sam_file"
!    output_file="sorted_${counter}"
!
!    # Convert each sam file in ./sam_files to a bam file, and sort the bam files by name. The bam files will be  
!    ./sambamba view -S --format=bam $sam_file | ./sambamba sort -o "./sorted_files/sorted${counter}.bam" /dev/stdin 

!    ((counter++))
!    # Add your command to operate on the bam file here
!done

Processing ./sam_files/*.sam


bash: line 10: ./sambamba: No such file or directory
bash: line 10: ./sambamba: No such file or directory


# Step 3: htseq
Here we get sequence counts, and store them in CSV files.

In [None]:
# Getting a list of all bam files in ./sorted_files
bam_files=$(find ./sorted_files -type f -name "*.bam")

# Run htseq-count with all the BAM files
htseq-count --format=bam --order=name --stranded=yes $bam_files homo_sapien.gtf > output.csv

In [None]:
import htseq
import os
import subprocess

# Base directory
project_base = '/content/drive/MyDrive/csci597k_group_project/results'

# Define control and treatment directories
directories = {
    "Control_01": os.path.join(project_base, '00MatrigelCoating'),
    "Control_02": os.path.join(project_base, '00PLLCoating'),
    "Treatment_01": os.path.join(project_base, '01argin'),
    "Treatment_02": os.path.join(project_base, '02PanLaminin'),
    "Treatment_03": os.path.join(project_base, '03Laminin211'),
    "Treatment_04": os.path.join(project_base, '04CollagenI'),
}

# Path to GTF file
gtf_file = os.path.join(project_base, 'Homo_sapiens.GRCh38.133.gtf')

# Loop through each directory and process the files
for label, directory in directories.items():
    print(f"Processing directory: {directory}")
    for root, dirs, files in os.walk(directory):
        # this should in theory run once because it will be the first file
        for file in files:
            # Check for the Aligned.out.sam file
            if file == "Aligned.out.sam":
                # Construct full path to the SAM file
                sam_file_path = os.path.join(root, file)

                # Define the output file path
                output_file = os.path.join(root, f"{label}_htseq_counts.txt")

                # Construct the htseq-count command
                command = [
                    "htseq-count",
                    "-f", "sam",
                    "-r", "name",
                    "-s", "yes",  # Change to 'yes' or 'reverse' based on your data
                    sam_file_path,
                    gtf_file
                ]

                # Run htseq-count and save the output to a file
                print(f"Running htseq-count for: {sam_file_path}")
                with open(output_file, "w") as out:
                    subprocess.run(command, stdout=out, stderr=subprocess.PIPE)

                print(f"htseq-count completed for {label}. Results saved to {output_file}")