In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')
# IPython magic: show matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Load the two PDB CSV exports: the 'no_dups' table and the sequence table.
no_dups_csv = '~/Desktop/Springboard/Protein_Classification/protein-data-set/pdb_data_no_dups.csv'
sequence_csv = '~/Desktop/Springboard/Protein_Classification/protein-data-set/pdb_data_seq.csv'
df_dup, df_seq = (pd.read_csv(csv_path) for csv_path in (no_dups_csv, sequence_csv))

In [None]:
# Keep only the first row for every distinct value of 'sequence'.
df_seq = df_seq.drop_duplicates(subset=['sequence'], keep='first')

In [None]:
# Cardinality overview of the two tables.
print('Unique structureId: ', df_dup.structureId.nunique())
print('Unique structureId-seq: ', df_seq.structureId.nunique())
print('Unique sequence: ', df_seq.sequence.nunique())
print('Unique classes: ', df_dup.classification.nunique())
print('Unique residueCount: ', df_dup.residueCount.nunique())

# Number of rows per structureId in the sequence table (most frequent first).
dfst = df_seq.structureId.value_counts()
print('10 most common structure Id frequency: \n', dfst.head(10))

# Count the structureIds that occur in more than one row.
count = int((dfst > 1).sum())
print('Number of structureId with multiple entries: ', count)
print('--'*20)
print('Are all structureId with multiple entries corresponds unique sequence?')

# Inspect the (at most) 100 most frequent structureIds and count how many of
# them carry more than one distinct sequence. Slicing the index, rather than
# looping over range(100), avoids an IndexError when fewer than 100 ids exist,
# and a single isin + groupby replaces 100 full scans of the frame.
top_ids = dfst.index[:100]
seqs_per_id = (
    df_seq[df_seq['structureId'].isin(top_ids)]
    .groupby('structureId')['sequence']
    .nunique()
)
str_counts = int((seqs_per_id > 1).sum())
if str_counts > 1:
    print('Not all of those multiple entries have unique sequences but several of those have')
else:
    # Answer the question printed above in the negative case as well.
    print('Fewer than two of the inspected structureIds have more than one distinct sequence')

In [None]:
# We see that there are multiple entries with the same structureId:
# some of them share the same sequence, while others have different sequences.
# We keep only one sequence per structureId; therefore, we merge the
# two data files using an 'inner' merge on 'structureId' and remove all
# entries with duplicate sequences.

In [None]:
# Inner-join the two tables on their common key columns, so that only rows
# with a match in both tables are kept.
join_keys = ['structureId', 'macromoleculeType', 'residueCount']
df_merge = df_dup.merge(df_seq, how='inner', on=join_keys)

In [None]:
# Summary of the merged table (columns, dtypes, non-null counts).
df_merge.info()

In [None]:
# Row counts of the two input tables and of their inner join, in that order.
tuple(len(frame.structureId) for frame in (df_dup, df_seq, df_merge))

In [None]:
# Number of distinct sequences in the merged table.
df_merge.sequence.nunique()

In [None]:
# Five macromolecule types with the most non-null sequences, largest first.
dfcls = (
    df_merge.groupby('macromoleculeType')['sequence']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Pie chart of the most common macromolecule types.
fig = plt.figure(figsize=(6, 4))
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')
# Radial offsets that pull the smallest slices out of the pie. The tuple is
# truncated to the number of slices so ax.pie() does not raise when fewer
# than five macromolecule types are present.
explode = (0, 0, 0, 0.05, 0.4)[:len(dfcls)]
ax.pie(dfcls.values,
       explode=explode,
       autopct='%1.0f%%',
       labels=dfcls.index,
       radius=1.2,
       pctdistance=0.7,
       labeldistance=1.0,
       textprops={'fontsize': 14})
ax.set_title('Types of Macromolecules', y=1, fontsize=24)
# tight_layout() cannot manage an Axes positioned manually with add_axes (it
# only emits a warning), so it is not called here. bbox_inches='tight' makes
# the saved file include the title and labels drawn outside the full-figure Axes.
fig.savefig('PieChart.pdf', bbox_inches='tight')

In [None]:
# Keep only rows whose macromoleculeType is exactly 'Protein'; every other
# type (including mixed types that merely contain protein) is excluded.
is_protein = df_merge['macromoleculeType'].eq('Protein')
df_prot = df_merge.loc[is_protein]
df_prot['sequence'].nunique()

In [None]:
# Discard rows whose class label, and then whose sequence, is not a plain
# string (missing values are presumably non-string NaN entries).
has_class = [type(label) is str for label in df_prot['classification'].values]
df_cls = df_prot[has_class]
has_sequence = [type(seq) is str for seq in df_cls['sequence'].values]
df_cs = df_cls[has_sequence]

df_cs.shape, df_cls.shape, df_prot.shape

In [None]:
# Heatmap of null cells: shows which columns still contain missing values.
missing_fig, missing_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_cs.isnull(), cbar=False, yticklabels=False, ax=missing_ax)
plt.show()

In [None]:
# Renumber the filtered rows 0..n-1 (the old index is discarded), then show
# how many distinct class labels remain.
dff = df_cs.reset_index(drop=True)
dff.classification.nunique()


###  Class frequency and sequence length frequency

#### class separation:

In [None]:
# ',' and '(' are treated as alternative spellings of the '/' separator and
# ')' is removed entirely.
_CLASS_SEPARATORS = str.maketrans({',': '/', '(': '/', ')': None})


def _normalize_classification(raw):
    """Return a canonical, order-independent spelling of a classification label.

    The label is lower-cased, ',' and '(' are converted to '/', ')' is removed,
    and the label is split on '/'. Every component is stripped of surrounding
    whitespace, empty components are dropped, and the components are sorted
    and re-joined with '/'.

    Stripping each component (instead of replacing individual ', ' / ' /'
    substrings) handles any amount of whitespace around a separator, including
    the space left in front of an opening parenthesis, so labels such as
    'lyase (aldolase)' and 'aldolase/lyase' map to the same class.
    """
    unified = str(raw).lower().translate(_CLASS_SEPARATORS)
    parts = (part.strip() for part in unified.split('/'))
    return '/'.join(sorted(part for part in parts if part))


dff['classification'] = dff['classification'].map(_normalize_classification)

In [None]:
# Distinct class labels and distinct sequences after label normalisation.
dff[['classification', 'sequence']].nunique()

In [None]:
# Number of rows per class label.
classes = dff.classification.value_counts()

In [None]:
# Classes with fewer than `min_class_size` rows are removed from the data;
# every remaining class receives an integer label.
min_class_size = 2
rare_classes = classes.index[classes < min_class_size]
kept_classes = classes.index[classes >= min_class_size]

# Drop all rows of the rare classes in a single pass, instead of filtering and
# copying the whole frame once per rare class.
dff = dff[~dff['classification'].isin(rare_classes)].copy()

# Labels 0, 1, 2, ... are assigned in the order the classes appear in `classes`.
class_dict = {cat: label for label, cat in enumerate(kept_classes)}

# Counters kept from the original loop: labelled classes / classes inspected.
count = len(class_dict)
counts = len(classes)
#class_dict

In [None]:
# Bar chart of the 20 most frequent classes, saved as a PDF.
df_class = dff['classification'].value_counts().head(20)

class_fig, class_ax = plt.subplots(figsize=(7, 7))
df_class.plot.bar(ax=class_ax)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
class_ax.set_title('Class Frequency of Proteins', fontsize=24)
class_ax.set_ylabel('Frequency', fontsize=15)
plt.tight_layout()
plt.savefig('ClassFreqProteins.pdf')

In [None]:
# Add the target column 'label': the integer that class_dict assigns to each
# row's classification. The tuple shows that both columns have the same length.
dff['label'] = dff['classification'].map(class_dict)
len(dff.sequence), len(dff.label)

In [None]:
import re

df = dff
# Length of every sequence after removing any whitespace it contains.
length = [len(re.sub(r"\s+", "", seq)) for seq in df.sequence.values]
# reset_index() adds a positional 'index' column next to 'lengths'.
lengths = pd.DataFrame(length, columns=['lengths']).reset_index()

In [None]:
# First reset discards the old index; the second one turns the fresh 0..n-1
# index into an 'index' column. The trailing ';' suppresses the cell output.
df = df.reset_index(drop=True).reset_index();

In [None]:
# Attach the 'lengths' column by joining on the positional 'index' column.
df = pd.merge(df, lengths, how='inner', on='index')

In [None]:
# The helper 'index' column was only needed for the join above.
df = df.drop(columns=['index'])

In [None]:
# Columns exported for the sequence-based (deep learning) data set.
dl_columns = ['structureId', 'classification', 'sequence', 'lengths', 'label']
df_dl = df[dl_columns]

In [None]:
# Summary of the export table (columns, dtypes, non-null counts).
df_dl.info()


In [None]:
# Data for deep learning models.
# This contains only the structure id, classification, sequence, sequence length and numeric label.
# NOTE(review): to_csv is called without index=False, so the row index is presumably written as an extra first column — confirm downstream readers expect it.

df_dl.to_csv(r'protein_seq-cls.csv') 

In [None]:
# Remove the columns that are not carried into the machine-learning table.
unused_columns = ['publicationYear', 'pdbxDetails',
                  'crystallizationTempK', 'macromoleculeType',
                  'crystallizationMethod']
df = df.drop(columns=unused_columns)

In [None]:
# Forward-fill missing values within each 'chainId' group.
# NOTE(review): groupby(...).ffill() appears to return the frame without the
# 'chainId' key column, so 'chainId' is presumably absent from `df` afterwards — confirm this is intended.
# NOTE(review): rows sharing a chainId may belong to different structures, so values
# may be filled from an unrelated entry — confirm the imputation is meant to work this way.
df = df.groupby('chainId').ffill()


In [None]:
# Drop every row that is still missing one of the required columns.
required_columns = ['densityMatthews',
                    'densityPercentSol',
                    'resolution', 'sequence',
                    'phValue']
df = df.dropna(subset=required_columns)

In [None]:
# Heatmap of null cells that remain after the clean-up steps above.
na_fig, na_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.isnull(), cbar=False, yticklabels=False, ax=na_ax)
na_fig.tight_layout()
#plt.savefig('HeatmapForMissingdata.pdf')

In [None]:
# Summary of the final table (columns, dtypes, non-null counts).
df.info()

In [None]:
# Save the cleaned table for the machine learning algorithms.
# NOTE(review): to_csv is called without index=False, so the row index is presumably written as an extra first column — confirm downstream readers expect it.
df.to_csv(r'protein_cls.csv') # Data saved for the Machine learning algorithms