<a href="https://colab.research.google.com/github/rishidash12/gudu-code/blob/main/code_for_calculating_g_quadruplex_in_promoter_sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

def find_g_quadruplexes(sequence):
    """Locate putative G-quadruplex motifs in a DNA sequence.

    The input is upper-cased and every character other than A, T, G, C
    or N is dropped before scanning, so the reported coordinates refer
    to that cleaned sequence.

    Args:
        sequence: DNA sequence as a string (any case; whitespace allowed).

    Returns:
        A list of dicts, one per non-overlapping regex match, with keys
        'start' (1-based), 'end' (1-based, inclusive), 'sequence' and
        'length'.
    """
    cleaned = re.sub(r'[^ATGCN]', '', sequence.upper())

    # Four runs of >=3 G separated by three loops of 1-7 bases each.
    g4_pattern = r'(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})'

    return [
        {
            'start': hit.start() + 1,  # shift 0-based offset to 1-based
            'end': hit.end(),          # exclusive 0-based end == inclusive 1-based end
            'sequence': hit.group(),
            'length': hit.end() - hit.start(),
        }
        for hit in re.finditer(g4_pattern, cleaned)
    ]

# Demo input: placeholder promoter sequence; swap in the real one before use.
promoter_sequence = """
ATCGGGTGGTGAGGGTAGGGAGGGTAGCGGGTAGCGTAGCGTAGCTAGCGGGTTTGGGAGGGTAGGAGGGTGTT
"""

# Scan the promoter for putative G-quadruplex motifs.
g4_hits = find_g_quadruplexes(promoter_sequence)

# Report the motif count, followed by one numbered entry per motif.
print(f"Total G-quadruplex motifs found: {len(g4_hits)}\n")
for i, hit in enumerate(g4_hits, start=1):
    print(f"G4-{i}: Start={hit['start']} End={hit['end']} Length={hit['length']}\nSequence: {hit['sequence']}\n")


Total G-quadruplex motifs found: 2

G4-1: Start=4 End=31 Length=28
Sequence: GGGTGGTGAGGGTAGGGAGGGTAGCGGG

G4-2: Start=50 End=70 Length=21
Sequence: GGGTTTGGGAGGGTAGGAGGG



In [None]:
# Import required module
import re  # Regular expressions module for pattern matching

# Regex-based scanner for putative G-quadruplex motifs
def find_g_quadruplexes(sequence):
    """Return every putative G-quadruplex motif found in ``sequence``.

    The sequence is upper-cased and stripped of any character that is
    not A, T, G, C or N before matching; positions are therefore
    relative to the cleaned sequence.

    Args:
        sequence: DNA sequence string (case-insensitive, may span lines).

    Returns:
        List of dicts with keys 'start' (1-based), 'end' (1-based,
        inclusive), 'sequence' (matched text) and 'length'. Matches are
        the non-overlapping ones produced by a left-to-right regex scan.
    """
    cleaned = re.sub(r'[^ATGCN]', '', sequence.upper())

    # Motif: four G-runs (>=3 G each) joined by three loops of 1-7 bases.
    g4_pattern = r'(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})'
    scanner = re.compile(g4_pattern)

    results = []
    for match in scanner.finditer(cleaned):
        begin, stop = match.span()  # 0-based, half-open interval
        record = {
            'start': begin + 1,     # 1-based start
            'end': stop,            # half-open end equals inclusive 1-based end
            'sequence': match.group(),
            'length': stop - begin,
        }
        results.append(record)

    return results

# Function to get the reverse complement of a DNA sequence
def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    Both upper- and lower-case A/T/C/G are complemented (case is
    preserved). Any other character, such as N or whitespace, is left
    unchanged and simply ends up in its reversed position.

    Args:
        seq: DNA sequence as a string.

    Returns:
        The complemented sequence, reversed.
    """
    # Lower-case bases must be mapped too: find_g_quadruplexes upper-cases
    # its input, so an uncomplemented lower-case base would otherwise be
    # scanned as if it belonged to the opposite strand.
    complement = str.maketrans("ATCGatcg", "TAGCtagc")
    # Translate to complement and reverse the string
    return seq.translate(complement)[::-1]

# === MAIN SECTION ===

# Input: paste the promoter sequence between the triple quotes
# (line breaks are fine; they are stripped during scanning).
promoter_sequence = """
ATCGGGTGGTGAGGGTAGGGAGGGTAGCGGGTAGCGTAGCGTAGCTAGCGGGTTTGGGAGGGTAGGAGGGTGTT
"""

# Build the opposite strand first, then scan both strands.
reverse_seq = reverse_complement(promoter_sequence)
forward_hits = find_g_quadruplexes(promoter_sequence)
reverse_hits = find_g_quadruplexes(reverse_seq)

# === OUTPUT RESULTS ===

# Forward-strand report: count line, then one numbered entry per motif.
print(f"\n🔹 Forward Strand: Total G-quadruplex motifs found: {len(forward_hits)}")
for i, hit in enumerate(forward_hits, start=1):
    print(f"G4-FWD-{i}: Start={hit['start']} End={hit['end']} Length={hit['length']}\nSequence: {hit['sequence']}\n")

# Reverse-strand report: positions here index into the reverse-complemented
# sequence, not the forward strand.
print(f"\n🔹 Reverse Strand (complement): Total G-quadruplex motifs found: {len(reverse_hits)}")
for i, hit in enumerate(reverse_hits, start=1):
    print(f"G4-REV-{i}: Start={hit['start']} End={hit['end']} Length={hit['length']}\nSequence: {hit['sequence']}\n")



🔹 Forward Strand: Total G-quadruplex motifs found: 2
G4-FWD-1: Start=4 End=31 Length=28
Sequence: GGGTGGTGAGGGTAGGGAGGGTAGCGGG

G4-FWD-2: Start=50 End=70 Length=21
Sequence: GGGTTTGGGAGGGTAGGAGGG


🔹 Reverse Strand (complement): Total G-quadruplex motifs found: 0


In [5]:
import re
from Bio import SeqIO

# Detect putative G-quadruplex motifs with a regular expression
def find_g_quadruplexes(sequence):
    """Scan a DNA sequence for putative G-quadruplex motifs.

    Characters other than A, T, G, C and N are removed after
    upper-casing, and all coordinates are expressed on that cleaned
    sequence.

    Args:
        sequence: DNA sequence string.

    Returns:
        A list with one dict per non-overlapping match, holding 'start'
        (1-based), 'end' (1-based, inclusive), 'sequence' and 'length'.
    """
    cleaned = re.sub(r'[^ATGCN]', '', sequence.upper())
    # Four G-runs of length >=3 separated by loops of 1-7 bases.
    g4_pattern = r'(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})[ATCGN]{1,7}(G{3,})'

    def describe(hit):
        # Convert one regex match into the reported record.
        first, last = hit.start(), hit.end()
        return {
            'start': first + 1,
            'end': last,
            'sequence': hit.group(),
            'length': last - first,
        }

    return [describe(hit) for hit in re.finditer(g4_pattern, cleaned)]

# Function to generate reverse complement
def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    Upper- and lower-case A/T/C/G are complemented with case preserved;
    every other character (e.g. N) passes through unchanged and is only
    reversed in position.

    Args:
        seq: DNA sequence as a string.

    Returns:
        The complemented sequence, reversed.
    """
    # Include lower-case bases: FASTA records may be lower-case or
    # soft-masked, and find_g_quadruplexes upper-cases its input, so an
    # uncomplemented lower-case base would be scanned as the wrong base.
    complement = str.maketrans("ATCGatcg", "TAGCtagc")
    return seq.translate(complement)[::-1]

# Read all records from a FASTA-formatted file (.txt or .fasta)
def load_sequences(file_path):
    """Parse ``file_path`` as FASTA and return its records as a list.

    Args:
        file_path: Path to the FASTA-formatted file.

    Returns:
        A list of the records yielded by ``SeqIO.parse``.
    """
    fasta_records = SeqIO.parse(file_path, "fasta")
    return [entry for entry in fasta_records]

# === MAIN CODE ===

# Provide your .txt file (FASTA format).
# Raw string: the path contains backslashes (\H, \D) that are invalid escape
# sequences in a normal string literal; the resulting value is unchanged.
file_path = r"R:\HDD Rishikesh(WD 1TB)\Dr. Rashmi\H_Sapiens_raw.txt/H_Sapiens_raw.txt"  # Replace with your actual file name

# Read all promoter sequences
records = load_sequences(file_path)

# Process each promoter: scan the forward strand, then its reverse complement
for record in records:
    print(f"\n🧬 Analyzing: {record.id}")
    sequence = str(record.seq)

    # Forward strand
    forward_hits = find_g_quadruplexes(sequence)
    print(f"🔹 Forward Strand: {len(forward_hits)} G-quadruplex(es) found")
    for i, hit in enumerate(forward_hits, 1):
        print(f"  FWD-{i}: Start={hit['start']} End={hit['end']} Length={hit['length']}\n  Seq: {hit['sequence']}")

    # Reverse strand: reported positions index into the reverse-complemented
    # sequence, not the forward strand.
    reverse_seq = reverse_complement(sequence)
    reverse_hits = find_g_quadruplexes(reverse_seq)
    print(f"🔸 Reverse Strand: {len(reverse_hits)} G-quadruplex(es) found")
    for i, hit in enumerate(reverse_hits, 1):
        print(f"  REV-{i}: Start={hit['start']} End={hit['end']} Length={hit['length']}\n  Seq: {hit['sequence']}")



🧬 Analyzing: FP000621
🔹 Forward Strand: 1 G-quadruplex(es) found
  FWD-1: Start=749 End=780 Length=32
  Seq: GGGGCGGAAGGGGTTCGGGGTGGCTGAGGGGG
🔸 Reverse Strand: 0 G-quadruplex(es) found

🧬 Analyzing: FP009794
🔹 Forward Strand: 2 G-quadruplex(es) found
  FWD-1: Start=78 End=102 Length=25
  Seq: GGGAAAGCGGGGTGGGGATGTGGGG
  FWD-2: Start=894 End=922 Length=29
  Seq: GGGGGTTGCGGGAGACGGGGAGGTTGGGG
🔸 Reverse Strand: 0 G-quadruplex(es) found

🧬 Analyzing: FP003724
🔹 Forward Strand: 1 G-quadruplex(es) found
  FWD-1: Start=756 End=778 Length=23
  Seq: GGGGCTTGGGGTGGGCTTCAGGG
🔸 Reverse Strand: 1 G-quadruplex(es) found
  REV-1: Start=729 End=760 Length=32
  Seq: GGGGGGGCTTAGGGACCAGCGGGAGGCGGGGG

🧬 Analyzing: FP004791
🔹 Forward Strand: 2 G-quadruplex(es) found
  FWD-1: Start=158 End=174 Length=17
  Seq: GGGAGGGAGGGCAGGGG
  FWD-2: Start=670 End=701 Length=32
  Seq: GGGGAGAGCACGGGACCCGGTGGGGGAGGGGG
🔸 Reverse Strand: 3 G-quadruplex(es) found
  REV-1: Start=162 End=187 Length=26
  Seq: GGGGCGGGGGGGGGGG

In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [4]:
from google.colab import files

# Open Colab's upload dialog. The argument is used as the destination
# directory: the uploaded file is saved inside it under its own name.
# Raw string: the backslashes (\H, \D) are invalid escape sequences in a
# normal string literal; the resulting value is unchanged.
uploaded = files.upload(r"R:\HDD Rishikesh(WD 1TB)\Dr. Rashmi\H_Sapiens_raw.txt")


Saving H_Sapiens_raw.txt to R:\HDD Rishikesh(WD 1TB)\Dr. Rashmi\H_Sapiens_raw.txt/H_Sapiens_raw.txt
