In [None]:
#@title Intro
%%html
<!-- Title banner: university logo, course name, project title. -->
<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 20px; background-color: #aadcfe; color: white; border-radius: 10px; margin: 20px auto; text-align: center; width: 80%;">
    <img src="https://upload.wikimedia.org/wikipedia/fa/a/a9/Sharif_logo.svg" alt="Sharif University Logo" width="120">
    <h2 style="font-size: 2em; margin: 10px;">Introduction to Bioinformatics</h2>
    <h3 style="font-size: 1.8em; margin: 10px;">Final Project</h3>
    <h3 style="font-size: 1.5em; margin: 10px; color: #ff5858;">Transcriptomic Insights into Skin Cancer Biomarkers</h3>
</div>


<!-- Course metadata: instructor, course, semester, authors. -->
<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; font-size: 1.2em; line-height: 1.6; padding: 10px; margin: 20px auto; text-align: center; width: 80%;">
    <strong>Instructor:</strong> Dr. Ali Sharifi Zarchi <br>
    <strong>Course:</strong> Introduction to Bioinformatics <br>
    <strong>Semester:</strong> Fall 2024 <br>
    <strong>Authors:</strong> Sahand Akramipour / Yousef Miryousefi <br>
</div>


<!-- Abstract heading, followed by the abstract body in a highlighted box. -->
<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; font-size: 1.2em; line-height: 1.6; padding: 10px; margin: 20px auto; text-align: center; width: 80%;">
    <strong>Abstract</strong>
</div>
<div style="background-color: #f5f5f5; padding: 20px; border-left: 5px solid #00274d; margin: 20px auto; text-align: justify; width: 80%;">
    This research article investigates the genetic mechanisms underlying skin cancer using a bioinformatics approach. The authors analyzed transcriptome data from the European Nucleotide Archive, identifying 19 differentially expressed genes associated with skin cancer. They employed bioinformatics tools like DESeq2, GSEA, and Cytoscape to analyze gene expression, pathway enrichment, and gene networks. Key genes involved in pathways such as IL6_JAK_STAT3_SIGNALING and ANGIOGENESIS were highlighted as potential therapeutic targets. The study's limitations, including sample size and data source biases, were acknowledged.
</div>


# Imports

In [None]:
import subprocess
import zipfile
import os
from google.colab import files
import re
import pandas as pd

# Fetch Data
In this part we obtain transcriptome sequencing data of skin cancer and adjacent normal tissues from the European Nucleotide Archive (ENA). The data is in raw `fastq.gz` format, which stores each read's nucleotide sequence together with its corresponding quality scores. The data used in the study is associated with Project ID PRJNA546533 and consists of 12 datasets.

In [None]:
#@title Download Data
!wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/008/SRR9203418/SRR9203418.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/005/SRR9203425/SRR9203425.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/001/SRR9203421/SRR9203421.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/004/SRR9203424/SRR9203424.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/009/SRR9203419/SRR9203419.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/006/SRR9203426/SRR9203426.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/003/SRR9203423/SRR9203423.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/002/SRR9203422/SRR9203422.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/007/SRR9203427/SRR9203427.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/007/SRR9203417/SRR9203417.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/006/SRR9203416/SRR9203416.fastq.gz
# !wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/000/SRR9203420/SRR9203420.fastq.gz

--2025-02-03 20:34:00--  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR920/008/SRR9203418/SRR9203418.fastq.gz
           => ‘SRR9203418.fastq.gz’
Resolving ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)... 193.62.193.165
Connecting to ftp.sra.ebi.ac.uk (ftp.sra.ebi.ac.uk)|193.62.193.165|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /vol1/fastq/SRR920/008/SRR9203418 ... done.
==> SIZE SRR9203418.fastq.gz ... 765921990
==> PASV ... done.    ==> RETR SRR9203418.fastq.gz ... done.
Length: 765921990 (730M) (unauthoritative)


In [None]:
# List the working directory to check which files are present after the download.
!ls

# Data Quality Control
In this part we use the FastQC tool to assess the quality of the raw data. This step evaluates parameters like total base quality, per-tile quality, GC content, and sequence length distribution.

In [None]:
#@title Install `Fastqc`
!wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.12.1.zip
!unzip fastqc_v0.12.1.zip
!chmod +x FastQC/nfastqc
!mkdir -p ./fastqc_results
!chmod +w ./fastqc_results

In [None]:
#@title Run `FastQC` Analysis
# All run archives of the project; only those present on disk are analysed.
fastq_files = [
    "SRR9203419.fastq.gz", "SRR9203423.fastq.gz", "SRR9203427.fastq.gz",
    "SRR9203416.fastq.gz", "SRR9203420.fastq.gz", "SRR9203424.fastq.gz",
    "SRR9203417.fastq.gz", "SRR9203421.fastq.gz", "SRR9203425.fastq.gz",
    "SRR9203418.fastq.gz", "SRR9203422.fastq.gz", "SRR9203426.fastq.gz"
]

for fastq_path in fastq_files:
    # Files that were never downloaded are reported and skipped instead of
    # being handed to FastQC, so the log shows which runs were really analysed.
    if not os.path.exists(fastq_path):
        print(f"Skipping {fastq_path}: file not found (has it been downloaded?)")
        continue

    result = subprocess.run(["./FastQC/fastqc", fastq_path, "-o", "./fastqc_results"])
    print(result)
    if result.returncode != 0:
        print(f"WARNING: FastQC exited with code {result.returncode} for {fastq_path}")

CompletedProcess(args=['./FastQC/fastqc', 'SRR9203419.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203423.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203427.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203416.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203420.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203424.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203417.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203421.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR9203425.fastq.gz', '-o', './fastqc_results'], returncode=0)
CompletedProcess(args=['./FastQC/fastqc', 'SRR

In [None]:
#@title Download Analysis
output_dir = "./fastqc_results"
zip_file = "fastqc_results.zip"

# Gather every file below the results directory first, then store each one in
# the archive under its bare file name (no directory component).
report_paths = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(output_dir)
    for name in names
]

with zipfile.ZipFile(zip_file, 'w') as archive:
    for report_path in report_paths:
        archive.write(report_path, arcname=os.path.basename(report_path))

print(f"FastQC results have been zipped into {zip_file}")
# Hand the archive to the browser through the Colab download helper.
files.download(zip_file)

FastQC results have been zipped into fastqc_results.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title Report
# Collect the first "#Measure / Value" table of every FastQC result archive
# found in `output_dir` (set in the "Download Analysis" cell) into one table,
# one row per run accession.
run_ids = []
stat_records = []

# sorted(): os.listdir order is arbitrary; sorting makes the row order stable.
for archive_name in sorted(os.listdir(output_dir)):
    if not archive_name.endswith('.zip'):
        continue
    with zipfile.ZipFile(os.path.join(output_dir, archive_name), 'r') as zip_ref:
        for member in zip_ref.namelist():
            if not member.endswith('fastqc_data.txt'):
                continue
            content = zip_ref.read(member).decode('utf-8')
            match = re.search(r'#Measure\s+Value(.*?)>>END_MODULE', content, re.DOTALL)
            if match is None:
                # Report and move on instead of failing on `None.group(1)`.
                print(f"Skipping {member}: no '#Measure / Value' table found")
                continue

            # Each table line is "<measure>\t<value>"; lines without a tab are ignored.
            fields = (line.split('\t') for line in match.group(1).strip().split('\n'))
            stat_records.append(
                {parts[0].strip(): parts[1].strip() for parts in fields if len(parts) >= 2}
            )
            # The member path starts with the run accession followed by "_".
            run_ids.append(member.split('_')[0])

if not stat_records:
    print(f"No FastQC archives containing fastqc_data.txt were found in {output_dir}")

# Build the frame once from all records; errors='ignore' keeps the cell usable
# when a column is absent (for example when nothing was parsed).
(
    pd.DataFrame(stat_records, index=run_ids)
    .rename_axis(['Run Accession'])
    .drop(columns=['Filename', 'File type', 'Encoding'], errors='ignore')
)

NameError: name 'zip_dir' is not defined

# Mapping and Alignment of Transcriptome Data
In this step we align the high-quality data to the human reference genome GRCh38/hg38, which can be obtained from the ENSEMBL database. We use BOWTIE2 software to align reads to the reference genome. This process generates a Sequence Alignment Map (SAM) file, which is then converted to a Binary Alignment Map (BAM) format using SAMTools for efficient storage and processing. The BAM alignment files are sorted to optimize memory usage for subsequent analyses.

In [None]:
#@title Download reference genome
!wget ftp://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
!gunzip Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz

In [None]:
#@title Build BOWTIE2 index
!sudo apt -qq install bowtie2
!wget -P genome_index/ https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip
!unzip genome_index/GRCh38_noalt_as.zip.1 -d genome_index/

In [None]:
#@title Align reads (single-end)
%%bash
# Align every FASTQ archive in the working directory against the bowtie2 index
# under genome_index/, writing one <sample>_aligned.sam per input.

# With nullglob an unmatched pattern expands to nothing, so the loop is skipped
# instead of passing the literal "*.fastq.gz" to bowtie2.
shopt -s nullglob

for fastq in *.fastq.gz; do
    # Quote every expansion so unusual file names are not word-split.
    sample_name=$(basename "$fastq" .fastq.gz)
    bowtie2 -x genome_index/GRCh38_noalt_as/GRCh38_noalt_as -U "$fastq" -S "${sample_name}_aligned.sam"
done

In [None]:
#@title Convert SAM to BAM using SAMTools
%%bash
# For every SAM file in the working directory: convert it to BAM, sort the BAM,
# and index the sorted BAM.
sudo apt -qq install samtools

# With nullglob an unmatched pattern expands to nothing, so the loop is skipped
# instead of running samtools on the literal "*.sam".
shopt -s nullglob

for samfile in *.sam; do
    prefix="${samfile%.sam}"
    # -S is omitted: current samtools detects SAM input on its own and only
    # accepts the flag for backward compatibility.
    samtools view -b "$samfile" > "${prefix}.bam"
    samtools sort "${prefix}.bam" -o "${prefix}_sorted.bam"
    samtools index "${prefix}_sorted.bam"
done