# Convert to CSV

This notebook converts data from [Unsupervised protein embeddings outperform hand-crafted sequence and structure features at predicting molecular function](https://academic.oup.com/bioinformatics/article/37/2/162/5892762#supplementary-data) to the format and style requested by [biodatasets](https://github.com/DeepChainBio/bio-datasets) for upload to their open bio data platform.

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Paths used by this notebook, relative to the notebook's working directory.

# Files located under data_pdb/.
fasta_file  = "data_pdb/chains_mf_seqid95_11749.fasta"
Yterms_file = "data_pdb/Yterms.pkl"
icVec_file  = "data_pdb/icVec.npy"

# Embeddings array file (presumably ProtBert CLS embeddings, going by the name).
embeddings_file = "sequence_protbert_cls_embeddings.npy"

# Destination of the CSV written by this notebook.
output_csv = "for_biodatasets/function_prediction.csv"

In [3]:
# Parse the FASTA file into two parallel lists: protein_ids[i] is the header
# text (without the leading ">") and sequences[i] is the residue string that
# follows that header.
protein_ids = []
sequences = []
# `with` guarantees the file handle is closed even if parsing raises.
with open(fasta_file, 'r') as fasta_handle:
    for raw_line in fasta_handle:
        if raw_line.startswith(">"):
            # New record: store the ID and open an empty sequence for it.
            protein_ids.append(raw_line[1:].strip())
            sequences.append("")
            continue
        residues = raw_line.strip()
        if not residues:
            continue  # blank line between records
        if not sequences:
            raise ValueError("FASTA file has sequence data before the first '>' header")
        # Sequences wrapped over several lines are joined into one string, so
        # the two lists always stay aligned one-to-one.
        sequences[-1] += residues

In [4]:
# Only the last expression of a cell is displayed, so a bare len(protein_ids)
# on its own line would be silently discarded. Compare the two lengths
# explicitly instead: every ID must have exactly one sequence.
assert len(protein_ids) == len(sequences), (
    "%d protein ids vs %d sequences" % (len(protein_ids), len(sequences))
)
len(sequences)

11749

In [5]:
# Disabled: Yterms is never loaded above this cell; kept for reference only.
#for key, val in Yterms.items():
#    Yterms[key] = val.toarray() # Convert each sparse value to a dense array

In [6]:
# Build a two-column table from the parallel ID and sequence lists.
df = pd.DataFrame(
    {
        "protein_id": protein_ids,
        "sequence": sequences,
    }
)

In [7]:
# Show the assembled table as the cell output to eyeball the parsed IDs and sequences.
df

Unnamed: 0,protein_id,sequence
0,12asA,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...
1,139lA,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
2,169lA,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
3,174lA,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLAAAADLAAAKAAL...
4,192lA,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLAAAKAAL...
...,...,...
11744,7fabL,ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...
11745,7odcA,SSFTKDEFDCHILDEGFTAKDILDQKINDKDAFYVADLGDILKKHL...
11746,7pckD,LYPEEILDTHWELWKKTHRKQYNNKVDEISRRLIWEKNLKYISIHN...
11747,8abpA,NLKLGFLVKQPEEPWFQTEWKFADKAGKDLGFEVIKIAVPDGEKTL...


In [304]:
# Create the destination folder if needed so this cell also works on a fresh
# checkout where the output directory does not exist yet.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
# NOTE: with the default index=True the row index is written as an unnamed
# first column of the CSV.
df.to_csv(output_csv)

In [11]:
# Load the pickled labels object; `with` closes the file handle after reading.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
# NOTE(review): this reads from "for_biodataset_subset/", not the
# "for_biodatasets/" folder used for output_csv; confirm the path is intended.
with open("for_biodataset_subset/labels.pkl", 'rb') as labels_handle:
    labels = pickle.load(labels_handle)
# Display every value stored in the labels mapping.
list(labels.values())

, 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0