<a href="https://colab.research.google.com/github/sewali-art/peptigram/blob/main/Sewali_Pepti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO
import json

# Create sample FASTA file
sample_fasta = """>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens OX=9606 GN=INS PE=1 SV=1
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAED
LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
>sp|P01317|INS_MOUSE Insulin OS=Mus musculus OX=10090 GN=Ins2 PE=1 SV=1
MALWIRSLPLLALLVLWEPKPTQAFVKQHLCGPHLVEALYLVCGERGFFYTPMSRREVED
PQVAQLELGGGPGAGDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN"""

# Write sample FASTA file
with open('sample_proteins.fasta', 'w') as f:
    f.write(sample_fasta)

# Create sample CSV file
sample_csv = """UniProt id,Peptide,Start position,End position,Intensity
P01308,FVNQHLCGSHLVEAL,15,30,1000
P01308,GERGFFYTPK,35,45,800
P01317,FVKQHLCGPHLVEAL,15,30,1200
P01317,GERGFFYTPM,35,45,900"""

# Write sample CSV file
with open('sample_peptides.csv', 'w') as f:
    f.write(sample_csv)

print("Sample files created: 'sample_proteins.fasta' and 'sample_peptides.csv'")
print("\nYou can download these files and then upload them back to test the system.")
print("\nOr you can use your own FASTA and CSV files with similar format.")

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None
        self.coverage_data = {}
        self.plot = None

    def load_fasta_file(self, fasta_file):
        """Load protein sequences from FASTA file"""
        self.protein_sequences = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            self.protein_sequences[record.id] = str(record.seq)
        return list(self.protein_sequences.keys())

    def load_peptides_file(self, csv_file):
        """Load peptide data from CSV file"""
        self.peptide_data = pd.read_csv(csv_file)
        required_cols = ['UniProt id', 'Peptide', 'Start position', 'End position']
        if not all(col in self.peptide_data.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

    def generate_plot(self, selected_proteins=None):
        """Generate coverage plot for selected proteins"""
        if selected_proteins is None:
            selected_proteins = list(self.protein_sequences.keys())

        fig = go.Figure()
        y_offset = 0

        for protein_id in selected_proteins:
            if protein_id not in self.protein_sequences:
                continue

            protein_seq = self.protein_sequences[protein_id]
            protein_length = len(protein_seq)
            coverage = np.zeros(protein_length)

            # Filter peptides for this protein
            protein_peptides = self.peptide_data[
                self.peptide_data['UniProt id'] == protein_id
            ]

            # Calculate coverage
            for _, row in protein_peptides.iterrows():
                start = int(row['Start position'])
                end = int(row['End position'])
                intensity = float(row['Intensity'])
                coverage[start:end] += intensity

                # Add peptide marker
                fig.add_trace(go.Scatter(
                    x=[(start + end) / 2],
                    y=[y_offset + coverage[start:end].max() + 0.1],
                    mode='markers',
                    marker=dict(size=8, color='red'),
                    name=f"{protein_id} Peptide",
                    text=f"Peptide: {row['Peptide']}<br>Position: {start}-{end}<br>Intensity: {intensity:.2f}",
                    hoverinfo='text',
                    showlegend=False
                ))

            # Add coverage trace
            fig.add_trace(go.Bar(
                x=list(range(protein_length)),
                y=coverage + y_offset,
                name=f"{protein_id} Coverage",
                hovertemplate="Position: %{x}<br>Coverage: %{y:.2f}<extra></extra>"
            ))

            y_offset += coverage.max() + 1

        fig.update_layout(
            title="Peptide Coverage Map",
            xaxis_title="Protein Position",
            yaxis_title="Coverage Depth",
            showlegend=True,
            hovermode='closest',
            template='plotly_white',
            height=200 + (300 * len(selected_proteins))
        )

        self.plot = fig
        return fig

# Create Peptigram instance
analyzer = PeptigramAnalyzer()

# Function to handle file uploads
def process_files():
    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()
    fasta_file = list(uploaded_fasta.keys())[0]

    print("\nPlease upload your peptides CSV file...")
    uploaded_csv = files.upload()
    csv_file = list(uploaded_csv.keys())[0]

    # Load files
    print("\nProcessing files...")
    proteins = analyzer.load_fasta_file(fasta_file)
    analyzer.load_peptides_file(csv_file)

    # Create protein selection widget
    protein_select = widgets.SelectMultiple(
        options=proteins,
        description='Select proteins:',
        layout={'width': 'max-content'}
    )

    # Create plot button
    plot_button = widgets.Button(
        description='Generate Plot',
        button_style='primary'
    )

    def on_plot_click(b):
        if not protein_select.value:
            print("Please select at least one protein")
            return
        fig = analyzer.generate_plot(protein_select.value)
        fig.show()

    plot_button.on_click(on_plot_click)

    # Display widgets
    display(widgets.VBox([protein_select, plot_button]))

# Run the interface
print("First, download the sample files if you want to test the system.")
print("Then, run process_files() to start the upload process.")
print("\nTo begin, type: process_files()")

ModuleNotFoundError: No module named 'Bio'

In [4]:
# Install required packages
!pip install biopython plotly pandas numpy ipywidgets

from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO
import json

# Create sample FASTA file
sample_fasta = """>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens OX=9606 GN=INS PE=1 SV=1
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAED
LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
>sp|P01317|INS_MOUSE Insulin OS=Mus musculus OX=10090 GN=Ins2 PE=1 SV=1
MALWIRSLPLLALLVLWEPKPTQAFVKQHLCGPHLVEALYLVCGERGFFYTPMSRREVED
PQVAQLELGGGPGAGDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN"""

# Write sample FASTA file
with open('sample_proteins.fasta', 'w') as f:
    f.write(sample_fasta)

# Create sample CSV file
sample_csv = """UniProt id,Peptide,Start position,End position,Intensity
P01308,FVNQHLCGSHLVEAL,15,30,1000
P01308,GERGFFYTPK,35,45,800
P01317,FVKQHLCGPHLVEAL,15,30,1200
P01317,GERGFFYTPM,35,45,900"""

# Write sample CSV file
with open('sample_peptides.csv', 'w') as f:
    f.write(sample_csv)

print("Sample files created: 'sample_proteins.fasta' and 'sample_peptides.csv'")
print("\nYou can download these files and then upload them back to test the system.")
print("\nOr you can use your own FASTA and CSV files with similar format.")

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None
        self.coverage_data = {}
        self.plot = None

    def load_fasta_file(self, fasta_file):
        """Load protein sequences from FASTA file"""
        self.protein_sequences = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            self.protein_sequences[record.id] = str(record.seq)
        return list(self.protein_sequences.keys())

    def load_peptides_file(self, csv_file):
        """Load peptide data from CSV file"""
        self.peptide_data = pd.read_csv(csv_file)
        required_cols = ['UniProt id', 'Peptide', 'Start position', 'End position']
        if not all(col in self.peptide_data.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

    def generate_plot(self, selected_proteins=None):
        """Generate coverage plot for selected proteins"""
        if selected_proteins is None:
            selected_proteins = list(self.protein_sequences.keys())

        fig = go.Figure()
        y_offset = 0

        for protein_id in selected_proteins:
            if protein_id not in self.protein_sequences:
                continue

            protein_seq = self.protein_sequences[protein_id]
            protein_length = len(protein_seq)
            coverage = np.zeros(protein_length)

            # Filter peptides for this protein
            protein_peptides = self.peptide_data[
                self.peptide_data['UniProt id'] == protein_id
            ]

            # Calculate coverage
            for _, row in protein_peptides.iterrows():
                start = int(row['Start position'])
                end = int(row['End position'])
                intensity = float(row['Intensity'])
                coverage[start:end] += intensity

                # Add peptide marker
                fig.add_trace(go.Scatter(
                    x=[(start + end) / 2],
                    y=[y_offset + coverage[start:end].max() + 0.1],
                    mode='markers',
                    marker=dict(size=8, color='red'),
                    name=f"{protein_id} Peptide",
                    text=f"Peptide: {row['Peptide']}<br>Position: {start}-{end}<br>Intensity: {intensity:.2f}",
                    hoverinfo='text',
                    showlegend=False
                ))

            # Add coverage trace
            fig.add_trace(go.Bar(
                x=list(range(protein_length)),
                y=coverage + y_offset,
                name=f"{protein_id} Coverage",
                hovertemplate="Position: %{x}<br>Coverage: %{y:.2f}<extra></extra>"
            ))

            y_offset += coverage.max() + 1

        fig.update_layout(
            title="Peptide Coverage Map",
            xaxis_title="Protein Position",
            yaxis_title="Coverage Depth",
            showlegend=True,
            hovermode='closest',
            template='plotly_white',
            height=200 + (300 * len(selected_proteins))
        )

        self.plot = fig
        return fig

# Create Peptigram instance
analyzer = PeptigramAnalyzer()

# Function to handle file uploads
def process_files():
    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()
    fasta_file = list(uploaded_fasta.keys())[0]

    print("\nPlease upload your peptides CSV file...")
    uploaded_csv = files.upload()
    csv_file = list(uploaded_csv.keys())[0]

    # Load files
    print("\nProcessing files...")
    proteins = analyzer.load_fasta_file(fasta_file)
    analyzer.load_peptides_file(csv_file)

    # Create protein selection widget
    protein_select = widgets.SelectMultiple(
        options=proteins,
        description='Select proteins:',
        layout={'width': 'max-content'}
    )

    # Create plot button
    plot_button = widgets.Button(
        description='Generate Plot',
        button_style='primary'
    )

    def on_plot_click(b):
        if not protein_select.value:
            print("Please select at least one protein")
            return
        fig = analyzer.generate_plot(protein_select.value)
        fig.show()

    plot_button.on_click(on_plot_click)

    # Display widgets
    display(widgets.VBox([protein_select, plot_button]))

# Run the interface
print("First, download the sample files if you want to test the system.")
print("Then, run process_files() to start the upload process.")
print("\nTo begin, type: process_files()")

Sample files created: 'sample_proteins.fasta' and 'sample_peptides.csv'

You can download these files and then upload them back to test the system.

Or you can use your own FASTA and CSV files with similar format.
First, download the sample files if you want to test the system.
Then, run process_files() to start the upload process.

To begin, type: process_files()


In [6]:
type: process_files()


Please upload your FASTA file...


Saving P35527.fasta.txt to P35527.fasta.txt

Please upload your peptides CSV file...


Saving peptigram_test_file (1).csv to peptigram_test_file (1) (1).csv

Processing files...


VBox(children=(SelectMultiple(description='Select proteins:', layout=Layout(width='max-content'), options=('sp…

In [7]:


from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO
import json
import logging
import sys
from typing import Dict, List, Tuple, Optional

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None

    def load_fasta_file(self, fasta_file):
        """Load protein sequences from FASTA file with debug info"""
        try:
            content = fasta_file.read().decode('utf-8')
            print("\nDebug - FASTA content first 100 chars:", content[:100])

            for record in SeqIO.parse(StringIO(content), "fasta"):
                # Extract UniProt ID from header
                if '|' in record.id:
                    uniprot_id = record.id.split('|')[1]
                else:
                    uniprot_id = record.id

                self.protein_sequences[uniprot_id] = str(record.seq)
                print(f"Debug - Loaded protein: {uniprot_id}, Length: {len(str(record.seq))}")

            print("\nDebug - Loaded proteins:", list(self.protein_sequences.keys()))
            return list(self.protein_sequences.keys())

        except Exception as e:
            print(f"Debug - Error in load_fasta_file: {str(e)}")
            raise

    def load_peptides_file(self, csv_file):
        """Load peptide data from CSV file with debug info"""
        try:
            content = csv_file.read().decode('utf-8')
            print("\nDebug - CSV content first 100 chars:", content[:100])

            self.peptide_data = pd.read_csv(StringIO(content))
            print("\nDebug - CSV columns:", list(self.peptide_data.columns))
            print("Debug - First few rows of peptide data:")
            print(self.peptide_data.head())

            # Clean up column names
            self.peptide_data.columns = self.peptide_data.columns.str.strip()

            # Print unique protein IDs in peptide data
            unique_ids = self.peptide_data['UniProt id'].unique()
            print("\nDebug - Unique protein IDs in peptide data:", list(unique_ids))

        except Exception as e:
            print(f"Debug - Error in load_peptides_file: {str(e)}")
            raise

    def generate_plot(self, selected_proteins):
        """Generate coverage plot with detailed debug info"""
        try:
            if not selected_proteins:
                print("Debug - No proteins selected")
                return

            print(f"\nDebug - Generating plot for proteins: {selected_proteins}")

            fig = go.Figure()
            y_offset = 0

            for protein_id in selected_proteins:
                print(f"\nDebug - Processing protein: {protein_id}")

                if protein_id not in self.protein_sequences:
                    print(f"Debug - Protein {protein_id} not found in FASTA data")
                    print(f"Debug - Available proteins: {list(self.protein_sequences.keys())}")
                    continue

                protein_seq = self.protein_sequences[protein_id]
                protein_length = len(protein_seq)
                coverage = np.zeros(protein_length)

                # Filter peptides for this protein
                protein_peptides = self.peptide_data[
                    self.peptide_data['UniProt id'] == protein_id
                ]

                print(f"Debug - Found {len(protein_peptides)} peptides for {protein_id}")

                if len(protein_peptides) == 0:
                    print(f"Debug - No peptides found for protein {protein_id}")
                    continue

                # Process peptides
                for idx, row in protein_peptides.iterrows():
                    try:
                        start = int(row['Start position'])
                        end = int(row['End position'])
                        intensity = 1.0  # Default intensity

                        if 'Intensity' in row:
                            intensity = float(row['Intensity'])

                        print(f"Debug - Processing peptide {idx}: {start}-{end}")

                        if start < 0 or end > protein_length:
                            print(f"Debug - Invalid positions: {start}-{end} for protein length {protein_length}")
                            continue

                        coverage[start:end] += intensity

                        # Add peptide marker
                        fig.add_trace(go.Scatter(
                            x=[(start + end) / 2],
                            y=[y_offset + coverage[start:end].max() + 0.1],
                            mode='markers',
                            marker=dict(size=8, color='red'),
                            name=f"{protein_id} Peptide",
                            text=f"Peptide: {row['Peptide']}<br>Position: {start}-{end}",
                            hoverinfo='text',
                            showlegend=False
                        ))

                    except Exception as e:
                        print(f"Debug - Error processing peptide: {str(e)}")
                        continue

                # Add coverage trace
                fig.add_trace(go.Bar(
                    x=list(range(protein_length)),
                    y=coverage + y_offset,
                    name=f"{protein_id} Coverage"
                ))

                y_offset += max(coverage) + 1

            if len(fig.data) == 0:
                print("Debug - No data to plot")
                return

            fig.update_layout(
                title="Peptide Coverage Map",
                xaxis_title="Protein Position",
                yaxis_title="Coverage Depth",
                height=400
            )

            return fig

        except Exception as e:
            print(f"Debug - Error in generate_plot: {str(e)}")
            raise

def process_files():
    """Process files with debug output"""
    analyzer = PeptigramAnalyzer()

    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()
    fasta_file = list(uploaded_fasta.keys())[0]

    print("\nPlease upload your peptides CSV file...")
    uploaded_csv = files.upload()
    csv_file = list(uploaded_csv.keys())[0]

    try:
        # Load files
        print("\nLoading FASTA file...")
        proteins = analyzer.load_fasta_file(uploaded_fasta[fasta_file])

        print("\nLoading peptide data...")
        analyzer.load_peptides_file(uploaded_csv[csv_file])

        # Create selection widget
        protein_select = widgets.SelectMultiple(
            options=proteins,
            description='Select proteins:',
            layout={'width': 'max-content'}
        )

        def on_plot_click(b):
            if not protein_select.value:
                print("Please select at least one protein")
                return

            print("\nGenerating plot...")
            try:
                fig = analyzer.generate_plot(protein_select.value)
                if fig is not None:
                    fig.show()
                else:
                    print("No plot generated - check debug output above")
            except Exception as e:
                print(f"Error generating plot: {str(e)}")

        plot_button = widgets.Button(
            description='Generate Plot',
            button_style='primary'
        )
        plot_button.on_click(on_plot_click)

        display(widgets.VBox([
            widgets.HTML("<b>Select proteins to visualize:</b>"),
            protein_select,
            plot_button
        ]))

    except Exception as e:
        print(f"Error processing files: {str(e)}")

print("To begin, type: process_files()")
print("\nMake sure your files follow this format:")
print("FASTA: >sp|UniProtID|Name")
print("CSV: Must have columns: 'UniProt id', 'Peptide', 'Start position', 'End position'")

To begin, type: process_files()

Make sure your files follow this format:
FASTA: >sp|UniProtID|Name
CSV: Must have columns: 'UniProt id', 'Peptide', 'Start position', 'End position'


In [8]:
process_files()

Please upload your FASTA file...


Saving P35527.fasta.txt to P35527.fasta (1).txt

Please upload your peptides CSV file...


Saving peptigram_test_file (1).csv to peptigram_test_file (1) (2).csv

Loading FASTA file...
Debug - Error in load_fasta_file: 'bytes' object has no attribute 'read'
Error processing files: 'bytes' object has no attribute 'read'


In [9]:

from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO, BytesIO
import json

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None

    def load_fasta_file(self, fasta_content):
        """Load protein sequences from FASTA content"""
        try:
            # Convert bytes to string if necessary
            if isinstance(fasta_content, bytes):
                fasta_content = fasta_content.decode('utf-8')

            print("\nDebug - FASTA content first 100 chars:", fasta_content[:100])

            # Parse FASTA using StringIO
            fasta_handle = StringIO(fasta_content)
            for record in SeqIO.parse(fasta_handle, "fasta"):
                # Extract UniProt ID from header
                if '|' in record.id:
                    uniprot_id = record.id.split('|')[1]
                else:
                    uniprot_id = record.id

                self.protein_sequences[uniprot_id] = str(record.seq)
                print(f"Debug - Loaded protein: {uniprot_id}, Length: {len(str(record.seq))}")

            print("\nDebug - Loaded proteins:", list(self.protein_sequences.keys()))
            return list(self.protein_sequences.keys())

        except Exception as e:
            print(f"Debug - Error in load_fasta_file: {str(e)}")
            raise

    def load_peptides_file(self, csv_content):
        """Load peptide data from CSV content"""
        try:
            # Convert bytes to string if necessary
            if isinstance(csv_content, bytes):
                csv_content = csv_content.decode('utf-8')

            print("\nDebug - CSV content first 100 chars:", csv_content[:100])

            # Parse CSV using StringIO
            self.peptide_data = pd.read_csv(StringIO(csv_content))
            print("\nDebug - CSV columns:", list(self.peptide_data.columns))
            print("Debug - First few rows of peptide data:")
            print(self.peptide_data.head())

            # Clean up column names
            self.peptide_data.columns = self.peptide_data.columns.str.strip()

            # Print unique protein IDs in peptide data
            unique_ids = self.peptide_data['UniProt id'].unique()
            print("\nDebug - Unique protein IDs in peptide data:", list(unique_ids))

        except Exception as e:
            print(f"Debug - Error in load_peptides_file: {str(e)}")
            raise

    def generate_plot(self, selected_proteins):
        """Generate coverage plot"""
        try:
            if not selected_proteins:
                print("Debug - No proteins selected")
                return

            print(f"\nDebug - Generating plot for proteins: {selected_proteins}")

            fig = go.Figure()
            y_offset = 0

            for protein_id in selected_proteins:
                print(f"\nDebug - Processing protein: {protein_id}")

                if protein_id not in self.protein_sequences:
                    print(f"Debug - Protein {protein_id} not found in FASTA data")
                    print(f"Debug - Available proteins: {list(self.protein_sequences.keys())}")
                    continue

                protein_seq = self.protein_sequences[protein_id]
                protein_length = len(protein_seq)
                coverage = np.zeros(protein_length)

                # Filter peptides for this protein
                protein_peptides = self.peptide_data[
                    self.peptide_data['UniProt id'] == protein_id
                ]

                print(f"Debug - Found {len(protein_peptides)} peptides for {protein_id}")

                if len(protein_peptides) == 0:
                    print(f"Debug - No peptides found for protein {protein_id}")
                    continue

                # Process peptides
                for idx, row in protein_peptides.iterrows():
                    try:
                        start = int(row['Start position'])
                        end = int(row['End position'])
                        intensity = 1.0  # Default intensity

                        if 'Intensity' in row:
                            intensity = float(row['Intensity'])

                        print(f"Debug - Processing peptide {idx}: {start}-{end}")

                        if start < 0 or end > protein_length:
                            print(f"Debug - Invalid positions: {start}-{end} for protein length {protein_length}")
                            continue

                        coverage[start:end] += intensity

                        # Add peptide marker
                        fig.add_trace(go.Scatter(
                            x=[(start + end) / 2],
                            y=[y_offset + coverage[start:end].max() + 0.1],
                            mode='markers',
                            marker=dict(size=8, color='red'),
                            name=f"{protein_id} Peptide",
                            text=f"Peptide: {row['Peptide']}<br>Position: {start}-{end}",
                            hoverinfo='text',
                            showlegend=False
                        ))

                    except Exception as e:
                        print(f"Debug - Error processing peptide: {str(e)}")
                        continue

                # Add coverage trace
                fig.add_trace(go.Bar(
                    x=list(range(protein_length)),
                    y=coverage + y_offset,
                    name=f"{protein_id} Coverage"
                ))

                y_offset += max(coverage) + 1

            if len(fig.data) == 0:
                print("Debug - No data to plot")
                return

            fig.update_layout(
                title="Peptide Coverage Map",
                xaxis_title="Protein Position",
                yaxis_title="Coverage Depth",
                height=400
            )

            return fig

        except Exception as e:
            print(f"Debug - Error in generate_plot: {str(e)}")
            raise

def process_files():
    """Process files with fixed file handling"""
    analyzer = PeptigramAnalyzer()

    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()

    if not uploaded_fasta:
        print("No FASTA file uploaded")
        return

    print("\nPlease upload your peptides CSV file...")
    uploaded_csv = files.upload()

    if not uploaded_csv:
        print("No CSV file uploaded")
        return

    try:
        # Get the first uploaded file's content
        fasta_content = list(uploaded_fasta.values())[0]
        csv_content = list(uploaded_csv.values())[0]

        # Load files
        print("\nLoading FASTA file...")
        proteins = analyzer.load_fasta_file(fasta_content)

        print("\nLoading peptide data...")
        analyzer.load_peptides_file(csv_content)

        # Create selection widget
        protein_select = widgets.SelectMultiple(
            options=proteins,
            description='Select proteins:',
            layout={'width': 'max-content'}
        )

        def on_plot_click(b):
            if not protein_select.value:
                print("Please select at least one protein")
                return

            print("\nGenerating plot...")
            try:
                fig = analyzer.generate_plot(protein_select.value)
                if fig is not None:
                    fig.show()
                else:
                    print("No plot generated - check debug output above")
            except Exception as e:
                print(f"Error generating plot: {str(e)}")

        plot_button = widgets.Button(
            description='Generate Plot',
            button_style='primary'
        )
        plot_button.on_click(on_plot_click)

        display(widgets.VBox([
            widgets.HTML("<b>Select proteins to visualize:</b>"),
            protein_select,
            plot_button
        ]))

    except Exception as e:
        print(f"Error processing files: {str(e)}")

# Create sample files for testing
sample_fasta = """>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAED
LQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"""

sample_csv = """UniProt id,Peptide,Start position,End position,Intensity
P01308,FVNQHLCGSHLVEAL,15,30,1000
P01308,GERGFFYTPK,35,45,800"""

print("To begin, type: process_files()")
print("\nMake sure your files follow this format:")
print("\nFASTA format example:")
print(sample_fasta[:100] + "...")
print("\nCSV format example:")
print(sample_csv)

To begin, type: process_files()

Make sure your files follow this format:

FASTA format example:
>sp|P01308|INS_HUMAN Insulin OS=Homo sapiens
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTR...

CSV format example:
UniProt id,Peptide,Start position,End position,Intensity
P01308,FVNQHLCGSHLVEAL,15,30,1000
P01308,GERGFFYTPK,35,45,800


In [10]:
process_files()


Please upload your FASTA file...


Saving P35527.fasta.txt to P35527.fasta (2).txt

Please upload your peptides CSV file...


Saving SmallP.xlsx to SmallP.xlsx

Loading FASTA file...

Debug - FASTA content first 100 chars: >sp|P35527|K1C9_HUMAN Keratin, type I cytoskeletal 9 OS=Homo sapiens OX=9606 GN=KRT9 PE=1 SV=3
MSCRQ
Debug - Loaded protein: P35527, Length: 623

Debug - Loaded proteins: ['P35527']

Loading peptide data...
Debug - Error in load_peptides_file: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte
Error processing files: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte


In [11]:

from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO, BytesIO
import chardet
import json

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None

    def load_fasta_file(self, fasta_content):
        """Load protein sequences from FASTA content"""
        try:
            # Convert bytes to string if necessary
            if isinstance(fasta_content, bytes):
                fasta_content = fasta_content.decode('utf-8')

            print("\nDebug - FASTA content first 100 chars:", fasta_content[:100])

            # Parse FASTA using StringIO
            fasta_handle = StringIO(fasta_content)
            for record in SeqIO.parse(fasta_handle, "fasta"):
                # Extract UniProt ID from header
                if '|' in record.id:
                    uniprot_id = record.id.split('|')[1]
                else:
                    uniprot_id = record.id

                self.protein_sequences[uniprot_id] = str(record.seq)
                print(f"Debug - Loaded protein: {uniprot_id}, Length: {len(str(record.seq))}")

            print("\nDebug - Loaded proteins:", list(self.protein_sequences.keys()))
            return list(self.protein_sequences.keys())

        except Exception as e:
            print(f"Debug - Error in load_fasta_file: {str(e)}")
            raise

    def load_peptides_file(self, csv_content):
        """Load peptide data from CSV content with enhanced encoding detection"""
        try:
            # Detect the file encoding
            result = chardet.detect(csv_content)
            encoding = result['encoding']
            confidence = result['confidence']
            print(f"\nDebug - Detected CSV encoding: {encoding} (confidence: {confidence})")

            # Try different encoding approaches if needed
            try:
                # First try the detected encoding
                csv_text = csv_content.decode(encoding)
            except UnicodeDecodeError:
                print("Debug - Failed with detected encoding, trying common alternatives...")
                encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
                for enc in encodings:
                    try:
                        csv_text = csv_content.decode(enc)
                        print(f"Debug - Successfully decoded with {enc}")
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise ValueError("Could not decode CSV file with any common encoding")

            print("\nDebug - CSV content first 100 chars:", csv_text[:100])

            # Remove any BOM if present
            if csv_text.startswith('\ufeff'):
                csv_text = csv_text[1:]

            # Parse CSV
            self.peptide_data = pd.read_csv(StringIO(csv_text))

            # Clean up column names
            self.peptide_data.columns = self.peptide_data.columns.str.strip()

            # Verify required columns
            required_cols = ['UniProt id', 'Peptide', 'Start position', 'End position']
            missing_cols = [col for col in required_cols if col not in self.peptide_data.columns]
            if missing_cols:
                print("Debug - Found columns:", list(self.peptide_data.columns))
                raise ValueError(f"Missing required columns: {missing_cols}")

            print("\nDebug - CSV columns:", list(self.peptide_data.columns))
            print("Debug - First few rows of peptide data:")
            print(self.peptide_data.head())

            # Clean up data
            self.peptide_data['UniProt id'] = self.peptide_data['UniProt id'].str.strip()

            # Print unique protein IDs in peptide data
            unique_ids = self.peptide_data['UniProt id'].unique()
            print("\nDebug - Unique protein IDs in peptide data:", list(unique_ids))

            # Verify protein ID matching
            matching_ids = set(unique_ids) & set(self.protein_sequences.keys())
            missing_ids = set(unique_ids) - set(self.protein_sequences.keys())

            if matching_ids:
                print(f"Debug - Found matching proteins: {list(matching_ids)}")
            if missing_ids:
                print(f"Debug - Warning: Some proteins in CSV not found in FASTA: {list(missing_ids)}")

        except Exception as e:
            print(f"Debug - Error in load_peptides_file: {str(e)}")
            raise

    def generate_plot(self, selected_proteins):
        """Generate coverage plot"""
        if not selected_proteins:
            print("Debug - No proteins selected")
            return

        print(f"\nDebug - Generating plot for proteins: {selected_proteins}")

        fig = go.Figure()
        y_offset = 0

        for protein_id in selected_proteins:
            if protein_id not in self.protein_sequences:
                print(f"Debug - Protein {protein_id} not found in FASTA data")
                continue

            protein_seq = self.protein_sequences[protein_id]
            protein_length = len(protein_seq)
            coverage = np.zeros(protein_length)

            # Filter peptides for this protein
            protein_peptides = self.peptide_data[
                self.peptide_data['UniProt id'] == protein_id
            ]

            if len(protein_peptides) == 0:
                print(f"Debug - No peptides found for protein {protein_id}")
                continue

            # Process peptides
            for idx, row in protein_peptides.iterrows():
                try:
                    start = int(row['Start position'])
                    end = int(row['End position'])

                    # Validate positions
                    if start < 0 or end > protein_length:
                        print(f"Debug - Invalid positions: {start}-{end} for protein length {protein_length}")
                        continue

                    intensity = 1.0
                    if 'Intensity' in row:
                        try:
                            intensity = float(row['Intensity'])
                        except ValueError:
                            print(f"Debug - Invalid intensity value for peptide {idx}, using default")

                    coverage[start:end] += intensity

                    # Add peptide marker
                    fig.add_trace(go.Scatter(
                        x=[(start + end) / 2],
                        y=[y_offset + coverage[start:end].max() + 0.1],
                        mode='markers',
                        marker=dict(size=8, color='red'),
                        name=f"{protein_id} Peptide",
                        text=f"Peptide: {row['Peptide']}<br>Position: {start}-{end}",
                        hoverinfo='text',
                        showlegend=False
                    ))
                except Exception as e:
                    print(f"Debug - Error processing peptide {idx}: {str(e)}")
                    continue

            # Add coverage trace
            fig.add_trace(go.Bar(
                x=list(range(protein_length)),
                y=coverage + y_offset,
                name=f"{protein_id} Coverage"
            ))

            y_offset += max(coverage) + 1

        if len(fig.data) == 0:
            print("Debug - No data to plot")
            return

        fig.update_layout(
            title="Peptide Coverage Map",
            xaxis_title="Protein Position",
            yaxis_title="Coverage Depth",
            height=400
        )

        return fig

def process_files():
    """Process files with enhanced encoding handling"""
    analyzer = PeptigramAnalyzer()

    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()

    if not uploaded_fasta:
        print("No FASTA file uploaded")
        return

    print("\nPlease upload your peptides CSV file...")
    uploaded_csv = files.upload()

    if not uploaded_csv:
        print("No CSV file uploaded")
        return

    try:
        # Get the first uploaded file's content
        fasta_content = list(uploaded_fasta.values())[0]
        csv_content = list(uploaded_csv.values())[0]

        # Load files
        print("\nLoading FASTA file...")
        proteins = analyzer.load_fasta_file(fasta_content)

        print("\nLoading peptide data...")
        analyzer.load_peptides_file(csv_content)

        # Create selection widget
        protein_select = widgets.SelectMultiple(
            options=proteins,
            description='Select proteins:',
            layout={'width': 'max-content'}
        )

        def on_plot_click(b):
            if not protein_select.value:
                print("Please select at least one protein")
                return

            print("\nGenerating plot...")
            try:
                fig = analyzer.generate_plot(protein_select.value)
                if fig is not None:
                    fig.show()
                else:
                    print("No plot generated - check debug output above")
            except Exception as e:
                print(f"Error generating plot: {str(e)}")

        plot_button = widgets.Button(
            description='Generate Plot',
            button_style='primary'
        )
        plot_button.on_click(on_plot_click)

        display(widgets.VBox([
            widgets.HTML("<b>Select proteins to visualize:</b>"),
            protein_select,
            plot_button
        ]))

    except Exception as e:
        print(f"Error processing files: {str(e)}")

print("To begin, type: process_files()")
print("\nMake sure your files follow this format:")
print("FASTA: >sp|UniProtID|Name")
print("CSV: Must have columns 'UniProt id', 'Peptide', 'Start position', 'End position'")

To begin, type: process_files()

Make sure your files follow this format:
FASTA: >sp|UniProtID|Name
CSV: Must have columns 'UniProt id', 'Peptide', 'Start position', 'End position'


In [12]:
 process_files()

Please upload your FASTA file...


Saving P35527.fasta.txt to P35527.fasta (3).txt

Please upload your peptides CSV file...


Saving SmallP.xlsx to SmallP (1).xlsx

Loading FASTA file...

Debug - FASTA content first 100 chars: >sp|P35527|K1C9_HUMAN Keratin, type I cytoskeletal 9 OS=Homo sapiens OX=9606 GN=KRT9 PE=1 SV=3
MSCRQ
Debug - Loaded protein: P35527, Length: 623

Debug - Loaded proteins: ['P35527']

Loading peptide data...

Debug - Detected CSV encoding: None (confidence: 0.0)
Debug - Error in load_peptides_file: decode() argument 'encoding' must be str, not None
Error processing files: decode() argument 'encoding' must be str, not None


In [13]:
# Install required packages
!pip install biopython plotly pandas numpy ipywidgets openpyxl

from google.colab import files
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from Bio import SeqIO
import re
from IPython.display import display, HTML
import ipywidgets as widgets
from io import StringIO, BytesIO

class PeptigramAnalyzer:
    def __init__(self):
        self.protein_sequences = {}
        self.peptide_data = None

    def load_fasta_file(self, fasta_content):
        """Load protein sequences from FASTA content"""
        try:
            if isinstance(fasta_content, bytes):
                fasta_content = fasta_content.decode('utf-8')

            print("\nDebug - FASTA content first 100 chars:", fasta_content[:100])

            fasta_handle = StringIO(fasta_content)
            for record in SeqIO.parse(fasta_handle, "fasta"):
                if '|' in record.id:
                    uniprot_id = record.id.split('|')[1]
                else:
                    uniprot_id = record.id

                self.protein_sequences[uniprot_id] = str(record.seq)
                print(f"Debug - Loaded protein: {uniprot_id}, Length: {len(str(record.seq))}")

            print("\nDebug - Loaded proteins:", list(self.protein_sequences.keys()))
            return list(self.protein_sequences.keys())

        except Exception as e:
            print(f"Debug - Error in load_fasta_file: {str(e)}")
            raise

    def load_excel_file(self, excel_content):
        """Load peptide data from Excel content"""
        try:
            excel_buffer = BytesIO(excel_content)

            # Read the Excel file
            xls = pd.ExcelFile(excel_buffer)
            print(f"\nDebug - Excel sheets found: {xls.sheet_names}")

            # Read the first sheet
            self.peptide_data = pd.read_excel(excel_buffer, sheet_name=0)

            # Clean up column names
            self.peptide_data.columns = self.peptide_data.columns.str.strip()
            print("\nDebug - Excel columns found:", list(self.peptide_data.columns))

            # Expected columns
            required_cols = ['uniprot id', 'peptide', 'start position', 'end position']
            intensity_cols = ['Intensity ArgC', 'Intensity LysC', 'Intensity ArgCLysC']

            # Convert column names to lowercase for matching
            col_map = {col: col.lower() for col in self.peptide_data.columns}
            self.peptide_data.columns = self.peptide_data.columns.str.lower()

            # Verify required columns
            missing_cols = [col for col in required_cols if col not in self.peptide_data.columns]
            if missing_cols:
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Calculate average intensity if intensity columns exist
            existing_intensity_cols = [col for col in intensity_cols if col.lower() in self.peptide_data.columns]
            if existing_intensity_cols:
                print(f"\nDebug - Found intensity columns: {existing_intensity_cols}")
                self.peptide_data['average_intensity'] = self.peptide_data[
                    [col.lower() for col in existing_intensity_cols]
                ].mean(axis=1)
            else:
                print("\nDebug - No intensity columns found, using default intensity of 1.0")
                self.peptide_data['average_intensity'] = 1.0

            # Clean up UniProt IDs
            self.peptide_data['uniprot id'] = self.peptide_data['uniprot id'].astype(str).str.strip()

            # Convert positions to numeric
            self.peptide_data['start position'] = pd.to_numeric(self.peptide_data['start position'], errors='coerce')
            self.peptide_data['end position'] = pd.to_numeric(self.peptide_data['end position'], errors='coerce')

            # Remove rows with invalid positions
            original_rows = len(self.peptide_data)
            self.peptide_data = self.peptide_data.dropna(subset=['start position', 'end position'])
            if len(self.peptide_data) < original_rows:
                print(f"\nDebug - Removed {original_rows - len(self.peptide_data)} rows with invalid positions")

            # Print sample of processed data
            print("\nDebug - First few rows of processed data:")
            print(self.peptide_data.head())

            # Verify protein ID matching
            unique_ids = self.peptide_data['uniprot id'].unique()
            matching_ids = set(unique_ids) & set(self.protein_sequences.keys())
            missing_ids = set(unique_ids) - set(self.protein_sequences.keys())

            print(f"\nDebug - Found {len(matching_ids)} matching proteins")
            if missing_ids:
                print(f"Debug - Warning: {len(missing_ids)} proteins in Excel not found in FASTA: {list(missing_ids)}")

        except Exception as e:
            print(f"Debug - Error in load_excel_file: {str(e)}")
            raise

    def generate_plot(self, selected_proteins):
        """Generate coverage plot using average intensity"""
        if not selected_proteins:
            print("Debug - No proteins selected")
            return

        fig = go.Figure()
        y_offset = 0

        for protein_id in selected_proteins:
            if protein_id not in self.protein_sequences:
                print(f"Debug - Protein {protein_id} not found in FASTA data")
                continue

            protein_seq = self.protein_sequences[protein_id]
            protein_length = len(protein_seq)
            coverage = np.zeros(protein_length)

            # Filter peptides for this protein
            protein_peptides = self.peptide_data[
                self.peptide_data['uniprot id'] == protein_id
            ]

            print(f"Debug - Processing {len(protein_peptides)} peptides for {protein_id}")

            if len(protein_peptides) == 0:
                continue

            # Process peptides
            for idx, row in protein_peptides.iterrows():
                try:
                    start = int(row['start position'])
                    end = int(row['end position'])
                    intensity = float(row['average_intensity'])

                    if start < 0 or end > protein_length:
                        print(f"Debug - Invalid positions: {start}-{end} for protein length {protein_length}")
                        continue

                    coverage[start:end] += intensity

                    # Add peptide marker
                    fig.add_trace(go.Scatter(
                        x=[(start + end) / 2],
                        y=[y_offset + coverage[start:end].max() + 0.1],
                        mode='markers',
                        marker=dict(size=8, color='red'),
                        name=f"{protein_id} Peptide",
                        text=f"Peptide: {row['peptide']}<br>"
                             f"Position: {start}-{end}<br>"
                             f"Avg Intensity: {intensity:.2f}",
                        hoverinfo='text',
                        showlegend=False
                    ))

                except Exception as e:
                    print(f"Debug - Error processing peptide {idx}: {str(e)}")
                    continue

            # Add coverage trace
            fig.add_trace(go.Bar(
                x=list(range(protein_length)),
                y=coverage + y_offset,
                name=f"{protein_id}"
            ))

            y_offset += max(coverage) + 1

        if len(fig.data) == 0:
            print("Debug - No data to plot")
            return

        fig.update_layout(
            title="Peptide Coverage Map",
            xaxis_title="Protein Position",
            yaxis_title="Coverage Depth",
            height=200 + (300 * len(selected_proteins)),
            showlegend=True,
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="left",
                x=0.01
            )
        )

        return fig

def process_files():
    """Process files with support for multiple intensity columns"""
    analyzer = PeptigramAnalyzer()

    print("Please upload your FASTA file...")
    uploaded_fasta = files.upload()

    if not uploaded_fasta:
        print("No FASTA file uploaded")
        return

    print("\nPlease upload your Excel file...")
    uploaded_excel = files.upload()

    if not uploaded_excel:
        print("No Excel file uploaded")
        return

    try:
        # Load files
        fasta_content = list(uploaded_fasta.values())[0]
        excel_content = list(uploaded_excel.values())[0]

        print("\nLoading FASTA file...")
        proteins = analyzer.load_fasta_file(fasta_content)

        print("\nLoading Excel data...")
        analyzer.load_excel_file(excel_content)

        # Create selection widget
        protein_select = widgets.SelectMultiple(
            options=proteins,
            description='Select proteins:',
            layout={'width': 'max-content'}
        )

        def on_plot_click(b):
            if not protein_select.value:
                print("Please select at least one protein")
                return

            print("\nGenerating plot...")
            try:
                fig = analyzer.generate_plot(protein_select.value)
                if fig is not None:
                    fig.show()
                else:
                    print("No plot generated - check debug output above")
            except Exception as e:
                print(f"Error generating plot: {str(e)}")

        plot_button = widgets.Button(
            description='Generate Plot',
            button_style='primary'
        )
        plot_button.on_click(on_plot_click)

        display(widgets.VBox([
            widgets.HTML("<b>Select proteins to visualize:</b>"),
            protein_select,
            plot_button
        ]))

    except Exception as e:
        print(f"Error processing files: {str(e)}")

print("To begin, type: process_files()")
print("\nMake sure your Excel file has these columns:")
print("- uniprot id")
print("- peptide")
print("- start position")
print("- end position")
print("- Intensity ArgC (optional)")
print("- Intensity LysC (optional)")
print("- Intensity ArgCLysC (optional)")

To begin, type: process_files()

Make sure your Excel file has these columns:
- uniprot id
- peptide
- start position
- end position
- Intensity ArgC (optional)
- Intensity LysC (optional)
- Intensity ArgCLysC (optional)


In [14]:
 process_files()


Please upload your FASTA file...


Saving P35527.fasta.txt to P35527.fasta (4).txt

Please upload your Excel file...


Saving SmallP.xlsx to SmallP (2).xlsx

Loading FASTA file...

Debug - FASTA content first 100 chars: >sp|P35527|K1C9_HUMAN Keratin, type I cytoskeletal 9 OS=Homo sapiens OX=9606 GN=KRT9 PE=1 SV=3
MSCRQ
Debug - Loaded protein: P35527, Length: 623

Debug - Loaded proteins: ['P35527']

Loading Excel data...

Debug - Excel sheets found: ['Sheet1']

Debug - Excel columns found: ['UniProt id', 'Peptide', 'Start position', 'End position', 'Intensity ArgC', 'Intensity LysC', 'Intensity ArgCLysC']

Debug - Found intensity columns: ['Intensity ArgC', 'Intensity LysC', 'Intensity ArgCLysC']

Debug - First few rows of processed data:
  uniprot id            peptide  start position  end position  intensity argc  \
0     Q2KIX7         SADSELCGPR              33            42    8.054333e+05   
1     Q9TUM6             TLAIAR             310           315    9.253000e+05   
2     P35527  FSSSSGYGGGSSRVCGR              47            63    9.838000e+05   
3     P04264            LQITAGR             38

VBox(children=(HTML(value='<b>Select proteins to visualize:</b>'), SelectMultiple(description='Select proteins…

Please select at least one protein

Generating plot...
Debug - Processing 1 peptides for P35527
