In [1]:
import pandas as pd
import json
import re
import numpy as np
import ast

In [2]:
with open('derived_disorder.mjson') as json_file:      
    data = json_file.readlines()
    data = list(map(json.loads, data)) 

df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,acc,sequence,ncbi_taxon_id,organism,mobidb_consensus
0,P31994,MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAA...,9606,Homo sapiens (Human),"{'disorder': {'derived': [{'regions': [[46, 46..."
1,F2Z602,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,284590,Kluyveromyces lactis (strain ATCC 8585 / CBS 2...,"{'disorder': {'derived': [{'regions': [[2, 143..."
2,Q875M3,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,28985,Kluyveromyces lactis (Yeast) (Candida sphaerica),"{'disorder': {'derived': [{'regions': [[1, 145..."
3,Q86A17,MSKNILVLGGSGALGAEVVKFFKSKSWNTISIDFRENPNADHSFTI...,44689,Dictyostelium discoideum (Slime mold),"{'disorder': {'derived': [{'regions': [[2, 2, ..."
4,P10855,MKVSTTALAVLLCTMTLCNQVFSAPYGADTPTACCFSYSRKIPRQF...,10090,Mus musculus (Mouse),"{'disorder': {'derived': [{'regions': [[30, 91..."


In [4]:
# Creating new column to find length of each Protein sequence
df['Sequence Length'] = df['sequence'].apply(len)

In [5]:
# Converting mobidb_consensus column to type string
df['mobidb_consensus'] = df['mobidb_consensus'].astype(str)

In [6]:
# Creating new column which contains relevant information from mobidb_consensus column
df['shortened mobidb consensus'] = df['mobidb_consensus'].str.split('\{\'method\'\:\s\'full\'\,\s\'regions\'\:\s').str[-1].str.strip()

In [7]:
# Extracting relevant columns from dataframe
df = df[['acc', 'mobidb_consensus', 'shortened mobidb consensus', 'ncbi_taxon_id', 'organism', 'Sequence Length', 'sequence']]

In [8]:
df.rename(columns={'shortened mobidb consensus': 'shortened_mobidb_consensus'}, inplace=True)

### Calculating the proportion of the sequence which is not being referenced

In [9]:
Row_list =[] 
  
# Iterate over each row 
for rows in df.itertuples(): 
     
    my_list =[rows.shortened_mobidb_consensus] # Create list for the current row
    rowsToString = ''.join(my_list)    # Convert the elements in the rows to sting
    onlyNumbers = re.findall(r'\b\d+\b', rowsToString)  # Remove anything but the digits from each row
    for i in range(0, len(onlyNumbers)): # for each element in the list, convert them to integer
        onlyNumbers[i] = int(onlyNumbers[i])
    # Summing the difference of every 2 elements in the row 
    sumOfDifference = [onlyNumbers[i + 1] - onlyNumbers[i] + 1 for i in range(0,len(onlyNumbers)-1,2)] 
    sumOfList = sum(sumOfDifference) # Summing resulting differences to find the total number of elemenets in the sequences
    Row_list.append(sumOfList) # Adding the resulting numbers to a list    

In [10]:
# Creating column which contains the number of amino acids which are referenced
df['#_AA_referenced'] = Row_list

In [11]:
# Calculating the fraction of references which are missing
df['Fraction Missing'] = 1 - (df['#_AA_referenced']/df['Sequence Length'])

In [12]:
# Removing unnecessary elements in string
df['shortened_mobidb_consensus'] = df['shortened_mobidb_consensus'].str.replace(r'\}\]\}\}$', '')

## Removing unreferenced amino acids and updating the class reference indexes

In [13]:
shortenedSeq_list =[] # defining new lists
updatedIndexes = [] 

for rows in df.itertuples(): # iterate through rows
    sequence = rows.sequence # assign protein sequence to variable
    classReferences = rows.shortened_mobidb_consensus # assign numbers and corresponding class to variable
    result = ast.literal_eval(classReferences)
    newSequence = ''
    
    for item in result: # iterate through class references
        newSequence += sequence[item[0]-1:item[1]] # remove amino acids which are not referenced
    shortenedSeq_list.append(newSequence)

    # Update the class reference based on removed amino acids
    currentIndex = 1
    for i in range(len(result)): # for each of the class references 
        reference = result[i]
        if reference[0] != currentIndex: # if the first reference number does not correspond to current index
            difference = reference[0] - currentIndex # calculate the difference between reference number and current index
            reference[0] -= difference # update the indexes in the reference
            reference[1] -= difference
        currentIndex = reference[1] + 1 # update current index to move onto next set of references
        updatedIndexes.append(reference) # add updated indexes to list

In [14]:
# Create new column which contains shortened sequences
df['shortened_sequence'] = shortenedSeq_list

In [15]:
df.head()

Unnamed: 0,acc,mobidb_consensus,shortened_mobidb_consensus,ncbi_taxon_id,organism,Sequence Length,sequence,#_AA_referenced,Fraction Missing,shortened_sequence
0,P31994,"{'disorder': {'derived': [{'regions': [[46, 46...","[[45, 45, 'D'], [46, 49, 'C'], [50, 66, 'S'], ...",9606,Homo sapiens (Human),310,MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAA...,173,0.441935,AAPPKAVLKLEPQWINVLQEDSVTLTCRGTHSPESDSIQWFHNGNL...
1,F2Z602,"{'disorder': {'derived': [{'regions': [[2, 143...","[[1, 1, 'D'], [2, 143, 'S'], [144, 145, 'C']]",284590,Kluyveromyces lactis (strain ATCC 8585 / CBS 2...,145,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,145,0.0,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...
2,Q875M3,"{'disorder': {'derived': [{'regions': [[1, 145...","[[1, 145, 'S']]",28985,Kluyveromyces lactis (Yeast) (Candida sphaerica),145,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,145,0.0,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...
3,Q86A17,"{'disorder': {'derived': [{'regions': [[2, 2, ...","[[1, 1, 'D'], [2, 2, 'C'], [3, 37, 'S'], [38, ...",44689,Dictyostelium discoideum (Slime mold),231,MSKNILVLGGSGALGAEVVKFFKSKSWNTISIDFRENPNADHSFTI...,231,0.0,MSKNILVLGGSGALGAEVVKFFKSKSWNTISIDFRENPNADHSFTI...
4,P10855,"{'disorder': {'derived': [{'regions': [[30, 91...","[[24, 29, 'D'], [30, 91, 'S'], [92, 92, 'D']]",10090,Mus musculus (Mouse),92,MKVSTTALAVLLCTMTLCNQVFSAPYGADTPTACCFSYSRKIPRQF...,69,0.25,APYGADTPTACCFSYSRKIPRQFIVDYFETSSLCSQPGVIFLTKRN...


#### Reconstructing new indexes and classes 

In [16]:
df['lettersOnly'] = df['shortened_mobidb_consensus'].str.replace(r'[[\d\,\s\]]', '')

  compiled = re.compile(pat, flags=flags)


In [17]:
letterList = []

# Structuring class references and adding to a list
for rows in df.itertuples():
    myletterList = rows.lettersOnly
    toString = ''.join(myletterList)
    onlyLetters = re.findall(r'[a-zA-z]', toString)
    letterList.append(onlyLetters)

In [18]:
# Combining classes with corresponding positions
newIndex, c = [], 0
for i in [len(x) for x in letterList]:
    newIndex.append(updatedIndexes[c: c + i])
    c += i

In [19]:
df['Updated Indexes'] = newIndex

In [20]:
del df['lettersOnly']

In [21]:
df.head()

Unnamed: 0,acc,mobidb_consensus,shortened_mobidb_consensus,ncbi_taxon_id,organism,Sequence Length,sequence,#_AA_referenced,Fraction Missing,shortened_sequence,Updated Indexes
0,P31994,"{'disorder': {'derived': [{'regions': [[46, 46...","[[45, 45, 'D'], [46, 49, 'C'], [50, 66, 'S'], ...",9606,Homo sapiens (Human),310,MGILSFLPVLATESDWADCKSPQPWGHMLLWTAVLFLAPVAGTPAA...,173,0.441935,AAPPKAVLKLEPQWINVLQEDSVTLTCRGTHSPESDSIQWFHNGNL...,"[[1, 1, D], [2, 5, C], [6, 22, S], [23, 57, C]..."
1,F2Z602,"{'disorder': {'derived': [{'regions': [[2, 143...","[[1, 1, 'D'], [2, 143, 'S'], [144, 145, 'C']]",284590,Kluyveromyces lactis (strain ATCC 8585 / CBS 2...,145,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,145,0.0,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,"[[1, 1, D], [2, 143, S], [144, 145, C]]"
2,Q875M3,"{'disorder': {'derived': [{'regions': [[1, 145...","[[1, 145, 'S']]",28985,Kluyveromyces lactis (Yeast) (Candida sphaerica),145,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,145,0.0,MGKGKPRGLNSARKLRVHRRNNRWAETTYKKRLLGTAFKSSPFGGS...,"[[1, 145, S]]"
3,Q86A17,"{'disorder': {'derived': [{'regions': [[2, 2, ...","[[1, 1, 'D'], [2, 2, 'C'], [3, 37, 'S'], [38, ...",44689,Dictyostelium discoideum (Slime mold),231,MSKNILVLGGSGALGAEVVKFFKSKSWNTISIDFRENPNADHSFTI...,231,0.0,MSKNILVLGGSGALGAEVVKFFKSKSWNTISIDFRENPNADHSFTI...,"[[1, 1, D], [2, 2, C], [3, 37, S], [38, 39, C]..."
4,P10855,"{'disorder': {'derived': [{'regions': [[30, 91...","[[24, 29, 'D'], [30, 91, 'S'], [92, 92, 'D']]",10090,Mus musculus (Mouse),92,MKVSTTALAVLLCTMTLCNQVFSAPYGADTPTACCFSYSRKIPRQF...,69,0.25,APYGADTPTACCFSYSRKIPRQFIVDYFETSSLCSQPGVIFLTKRN...,"[[1, 6, D], [7, 68, S], [69, 69, D]]"


### Calculating the proportion of the sequence which is ordered, disordered or ambiguous

In [22]:
# Extracting the revelant information from string for disordered regions in sequence
df['D_Class'] = df['shortened_mobidb_consensus'].str.extractall(r'(\[[0-9]+\,\s[0-9]+\,\s\'[D]\'\])').unstack().apply(lambda x:','.join(x.dropna()), axis=1)
# Extracting the revelant information from string for ordered regions in sequence
df['S_Class'] = df['shortened_mobidb_consensus'].str.extractall(r'(\[[0-9]+\,\s[0-9]+\,\s\'[S]\'\])').unstack().apply(lambda x:','.join(x.dropna()), axis=1)
# Extracting the revelant information from string for ambiguous regions in sequence
df['C_Class'] = df['shortened_mobidb_consensus'].str.extractall(r'(\[[0-9]+\,\s[0-9]+\,\s\'[C]\'\])').unstack().apply(lambda x:','.join(x.dropna()), axis=1)

# Converting column to type string
df['D_Class'] = df['D_Class'].astype(str)
df['S_Class'] = df['S_Class'].astype(str)
df['C_Class'] = df['C_Class'].astype(str)

In [23]:
D_list =[] 
  
# Iterate over each row 
for rows in df.itertuples(): 
     
    my_Dlist =[rows.D_Class] # Create list for the current row
    DrowsToString = ''.join(my_Dlist)    # Convert the elements in the rows to sting
    DonlyNumbers = re.findall(r'\b\d+\b', DrowsToString)  # Remove anything but the digits from each row
    for i in range(0, len(DonlyNumbers)): # for each element in the list, convert them to integer
        DonlyNumbers[i] = int(DonlyNumbers[i])
    # Summing the difference of every 2 elements in the row 
    sumOfDifferenceD = [DonlyNumbers[i + 1] - DonlyNumbers[i] + 1 for i in range(0,len(DonlyNumbers)-1,2)] 
    sumOfListD = sum(sumOfDifferenceD) # Summing resulting differences to find the total number of elemenets in the sequence are there
    D_list.append(sum(sumOfDifferenceD)) # Adding the resulting numbers to a list

In [24]:
S_list =[] 
  
# Iterate over each row 
for rows in df.itertuples(): 
     
    my_Slist =[rows.S_Class] # Create list for the current row
    SrowsToString = ''.join(my_Slist)    # Convert the elements in the rows to sting
    SonlyNumbers = re.findall(r'\b\d+\b', SrowsToString)  # Remove anything but the digits from each row
    for i in range(0, len(SonlyNumbers)): # for each element in the list, convert them to integer
        SonlyNumbers[i] = int(SonlyNumbers[i])
    # Summing the difference of every 2 elements in the row 
    sumOfDifferenceS = [SonlyNumbers[i + 1] - SonlyNumbers[i] + 1 for i in range(0,len(SonlyNumbers)-1,2)] 
    sumOfListS = sum(sumOfDifferenceS) # Summing resulting differences to find the total number of elemenets in the sequence are there
    S_list.append(sum(sumOfDifferenceS)) # Adding the resulting numbers to a list

In [25]:
C_list =[] 
  
# Iterate over each row 
for rows in df.itertuples(): 
     
    my_Clist =[rows.C_Class] # Create list for the current row
    CrowsToString = ''.join(my_Clist)    # Convert the elements in the rows to sting
    ConlyNumbers = re.findall(r'\b\d+\b', CrowsToString)  # Remove anything but the digits from each row
    for i in range(0, len(ConlyNumbers)): # for each element in the list, convert them to integer
        ConlyNumbers[i] = int(ConlyNumbers[i])
    # Summing the difference of every 2 elements in the row 
    sumOfDifferenceC = [ConlyNumbers[i + 1] - ConlyNumbers[i] + 1 for i in range(0,len(ConlyNumbers)-1,2)] 
    sumOfListC = sum(sumOfDifferenceC) # Summing resulting differences to find the total number of elemenets in the sequence are there
    C_list.append(sum(sumOfDifferenceC)) # Adding the resulting numbers to a list

In [26]:
df['Total_Disordered_Regions'] = D_list
df['Total_Ordered_Regions'] = S_list
df['Total_Ambiguous_Regions'] = C_list

### Creating new dataset with the relevant information needed for next steps

* Conducting analysis of sequences with unreferenced amino acids removed

In [27]:
df2 = df[['organism', 'mobidb_consensus', 'ncbi_taxon_id','shortened_sequence', 'D_Class', 'S_Class', 'C_Class', 'Total_Disordered_Regions', 'Total_Ordered_Regions', 'Total_Ambiguous_Regions']]

In [28]:
df2['Sequence_Length'] = df2['shortened_sequence'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
df2['%_Sequence_Disordered'] = (df2['Total_Disordered_Regions']/df2['Sequence_Length'])
df2['%_Sequence_Ordered'] = (df2['Total_Ordered_Regions']/df2['Sequence_Length'])
df2['%_Sequence_Ambiguous'] = (df2['Total_Ambiguous_Regions']/df2['Sequence_Length'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
avgDisordered = df2['%_Sequence_Disordered'].mean()
avgOrdered = df2['%_Sequence_Ordered'].mean()
avgAmbiguous = df2['%_Sequence_Ambiguous'].mean()

### Calculating maximum and minmum sequence lengths

In [31]:
max(df['Sequence Length'])

34350

In [32]:
min(df['Sequence Length'])

10

In [33]:
df.to_csv("dataset.csv")