In [None]:
from __future__ import print_function, division
import os
import pandas as pd # For csv
import numpy as np

import requests
import json

### Data set-up

Mount Google Drive so that processed files can be written to and read back from it, which speeds up the workflow by avoiding repeated downloads.

In [None]:
# Mount Google Drive at /content/drive; the file paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Downloading data from the DisProt website
- The DisProt API had no clear way to download the full file of annotated disordered sequences.
- A custom function downloads the TSV file using a URL found by inspecting DisProt's source code.
- The release date and the constraints on the documented sequences can be changed.
- NB: The TSV format and `consensus = 'false'` must be kept for the rest of the processing steps to work.
- Project has been done using the following:
 - year = '2022_06'
 - ambiguous = 'true'
 - obsolete = 'false'
 - format = 'tsv'
 - consensus = 'false'

#### Creating a pandas dataframe containing accession IDs and disordered region locations.
- This is used to generate labels for our disordered sequences.

In [None]:
def get_disprot_dataset():
  """Download the DisProt region annotations as TSV and load them with pandas.

  The TSV is saved to Google Drive and then parsed from that saved copy.

  Returns:
    pandas.DataFrame parsed from the downloaded tab-separated file.

  Raises:
    requests.HTTPError: if DisProt answers with a 4xx/5xx status.
    requests.RequestException: on connection problems or timeout.
  """
  year = '2022_06'
  ambiguous = 'true'
  obsolete = 'false'
  # Named file_format so the builtin format() is not shadowed.
  file_format = 'tsv'
  # The rest of the processing only works if consensus is false. Assumes region file format.
  consensus = 'false'

  disprot_link = "https://disprot.org/api/search?release="+year+"&show_ambiguous="+ambiguous+"&show_obsolete="+obsolete+"&format="+file_format+"&namespace=all&get_consensus="+consensus
  tsv_path = '/content/drive/My Drive/Colab Notebooks/diss_files/downloaded_disprot_data.tsv'

  # Fetch and validate BEFORE opening the output file: opening with 'w'
  # truncates it, so a failed download would otherwise wipe a good cached copy.
  response = requests.get(disprot_link, timeout=120)
  # Fail loudly on an error status instead of saving the error body as data.
  response.raise_for_status()

  # Explicit encoding so the write matches pandas' default UTF-8 read.
  with open(tsv_path, 'w', encoding='utf-8') as outfile:
    outfile.write(response.text)

  # Data from DisProt TSV - https://disprot.org/download
  data_disprot = pd.read_csv(tsv_path, sep='\t')
  return data_disprot

def create_dataframe_of_idrs(data_disprot):
  """Collapse the DisProt region table into one row per accession.

  Args:
    data_disprot: DataFrame with 'acc', 'start' and 'end' columns, one row per
      annotated region. Any index is accepted.

  Returns:
    DataFrame with an 'acc' column and a 'disordered_regions' column. Each
    'disordered_regions' cell is the list of distinct (start, end) tuples for
    that accession, in order of first appearance.
  """
  # Dictionary for disordered region data from DisProt
  disorder_start_and_end = {}

  # Walk the three columns in lockstep by position. Looking values up with
  # data_disprot['start'][i] is label-based, so it raises KeyError (or reads
  # the wrong row) when the frame does not have a default 0..n-1 index.
  region_rows = zip(
      data_disprot['acc'],
      data_disprot['start'].to_numpy(),
      data_disprot['end'].to_numpy(),
  )
  for acc, s, e in region_rows:
    regions = disorder_start_and_end.setdefault(str(acc), [])
    # Skip repeated annotations of the same region.
    if (s, e) not in regions:
      regions.append((s, e))

  # Create new table for important DisProt data
  pandas_data = pd.DataFrame({
      'acc': list(disorder_start_and_end.keys()),
      'disordered_regions': list(disorder_start_and_end.values()),
  })
  return pandas_data


In [None]:
# Download the DisProt annotations, then group the regions by accession.
data_disprot = get_disprot_dataset()
pandas_data = create_dataframe_of_idrs(data_disprot)

In [None]:
# Show the accession / disordered-region table.
pandas_data

Unnamed: 0,acc,disordered_regions
0,P03265,"[(294, 334), (454, 464)]"
1,P49913,"[(134, 170)]"
2,P03045,"[(1, 107), (1, 22), (34, 47), (1, 36)]"
3,P00004,"[(1, 104), (2, 105)]"
4,P27695,"[(1, 42), (1, 36), (32, 43), (2, 40)]"
...,...,...
2414,A0A5P2U9X4,"[(350, 525), (460, 521), (417, 426), (450, 525)]"
2415,P40939,"[(637, 647)]"
2416,Q6CSX2,"[(562, 831)]"
2417,Q8IYT8,"[(168, 177)]"


#### Downloading the full protein sequences.
- Downloads these full sequences from UniProt.
 - If an empty string is returned, the protein has been deprecated and is not included in our dataset.
- Creates a dictionary mapping each accession ID to its full sequence.

The proteins in this dataframe are preprocessed to get their full sequences from UniProt.

In [None]:
def preprocess_sequences(pandas_data):
  """Fetch the full amino-acid sequence of every accession from UniProt.

  Args:
    pandas_data: DataFrame with an 'acc' column of UniProt accession IDs.

  Returns:
    dict mapping accession ID to its sequence string. Accessions for which
    UniProt does not return a FASTA record with a non-empty sequence are
    left out.

  Raises:
    requests.RequestException: on connection problems or timeout.
  """
  protSeqDict = {}
  # Iterate the column values directly: .loc[row] over range(len(...)) is a
  # label lookup and only works while the frame has a default 0..n-1 index.
  for acc in pandas_data['acc']:
    url = f'https://www.uniprot.org/uniprotkb/{str(acc)}.fasta'
    response = requests.get(url, timeout=60)
    uniprot_fasta = response.text

    # Only accept bodies that really are FASTA. Otherwise an error page would
    # lose its first line and the remainder would be stored as a "sequence".
    if not response.ok or not uniprot_fasta.startswith('>'):
      continue

    # Drop the header line and join the wrapped sequence lines; splitlines()
    # also copes with '\r\n' line endings.
    protein_sequence = ''.join(uniprot_fasta.splitlines()[1:])

    if protein_sequence == '':
      continue

    protSeqDict[acc] = protein_sequence
  return protSeqDict

# Performs one UniProt HTTP request per accession in pandas_data.
protein_sequences_n_ids = preprocess_sequences(pandas_data)

In [None]:
# Save preprocessed data
def write_sequences():
  """Dump the notebook-level protein_sequences_n_ids mapping to sequence_data.json on Drive."""
  json_path = '/content/drive/My Drive/Colab Notebooks/diss_files/sequence_data.json'
  with open(json_path, 'w') as handle:
    json.dump(protein_sequences_n_ids, handle)

# Quick access to preprocessed data, instead of downloading it each time Notebook is opened.
def read_sequences():
  """Load and return the contents of sequence_data.json on Drive."""
  json_path = '/content/drive/My Drive/Colab Notebooks/diss_files/sequence_data.json'
  with open(json_path, 'r') as handle:
    cached_sequences = json.load(handle)
  return cached_sequences

# write_sequences() is left commented out so that the cached file is not
# accidentally overwritten before preprocessing has been run.
#write_sequences()
protein_sequences_n_ids = read_sequences()

#### Removing protein data that is incompatible with my solution.
- Removing deprecated entries from the dataset. Their empty sequence string caused a failure when it was compared to a label that had the length of the deprecated sequence.
- Removing ambiguous sequences from dataset. My solution only handles the 20 known amino acid codes.

In [None]:
def cleaning_pandas_data(idr_table=None, sequences=None):
  """Drop accessions that have no usable protein sequence.

  A row is kept only when its accession has a non-empty downloaded sequence
  and that sequence consists exclusively of the 20 standard amino-acid letters.

  Args:
    idr_table: DataFrame with an 'acc' column. Defaults to the notebook-level
      `pandas_data`.
    sequences: dict mapping accession ID to sequence string. Defaults to the
      notebook-level `protein_sequences_n_ids`.

  Returns:
    A new DataFrame holding the surviving rows with a fresh 0..n-1 index.
    The input frame is not modified.
  """
  if idr_table is None:
    idr_table = pandas_data
  if sequences is None:
    sequences = protein_sequences_n_ids

  standard_residues = frozenset('ACDEFGHIKLMNPQRSTVWY')

  def has_usable_sequence(acc):
    seq = sequences.get(acc)
    # A missing/empty sequence means the entry was deprecated. The subset test
    # (rather than a blacklist of X/U/Z) also rejects other non-standard
    # codes such as B, J and O.
    return bool(seq) and standard_residues.issuperset(seq)

  # Build the mask in one pass instead of dropping rows one at a time, which
  # re-scanned the whole frame for every removed accession.
  keep_mask = np.array(
      [has_usable_sequence(acc) for acc in idr_table['acc']], dtype=bool)

  return idr_table[keep_mask].reset_index(drop=True)

def write_cleaned_pandas_data(pandas_df):
  """Write pandas_df as JSON to idr_pandas_table.json on Drive."""
  json_path = '/content/drive/My Drive/Colab Notebooks/diss_files/idr_pandas_table.json'
  pandas_df.to_json(json_path)

def read_cleaned_pandas_data():
  """Read idr_pandas_table.json from Drive and return it as a DataFrame."""
  json_path = '/content/drive/My Drive/Colab Notebooks/diss_files/idr_pandas_table.json'
  cleaned_table = pd.read_json(json_path)
  return cleaned_table

In [None]:
# Filter the table, then save the result to Drive as JSON.
cleaned_pandas_data = cleaning_pandas_data()
write_cleaned_pandas_data(cleaned_pandas_data)

In [None]:
# Load the table back from the JSON file on Drive.
fully_clean_pandas_data = read_cleaned_pandas_data()