In [43]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read the PDB-link archive
# from this mount and move the generated CSV chunks into a folder under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
!pip install rarfile



In [45]:
!pip install requests



In [46]:
import rarfile
import os

# Source archive on the mounted Drive and the directory it is unpacked into.
rar_file_path = '/content/drive/MyDrive/pdb_links.rar'
extract_dir = './'

# Create the destination if needed; this is a no-op when it already exists.
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member of the archive, closing the archive afterwards.
with rarfile.RarFile(rar_file_path) as archive:
    archive.extractall(path=extract_dir)

print(f"Extracted {rar_file_path} to {extract_dir}.")


Extracted /content/drive/MyDrive/pdb_links.rar to ./.


In [47]:
import os

# Folder holding the individual ID lists and the single file they are merged into.
input_folder = 'pdb_links'
output_file = 'all_pdbs.txt'

# os.listdir() returns entries in arbitrary order. Sort them so the merged file,
# and therefore every positional index into the ID list built from it, is the
# same on every run. The chunked scraper below resumes by index, so a different
# order between sessions would silently skip or repeat entries.
txt_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

with open(output_file, 'w') as outfile:
    for txt_file in txt_files:
        file_path = os.path.join(input_folder, txt_file)

        # Each source list is copied verbatim, followed by a newline separator.
        with open(file_path, 'r') as infile:
            outfile.write(infile.read() + '\n')

print(f"All text files have been merged into {output_file}.")


All text files have been merged into all_pdbs.txt.


In [48]:
!pip install requests beautifulsoup4 pandas biopython




In [49]:
with open('all_pdbs.txt', 'r') as file:
    # The merge step writes a newline between source lists, while the IDs inside
    # a list are comma-separated. Splitting on commas alone fuses the last ID of
    # one list with the first ID of the next ("1ABC\n2DEF") and can yield empty
    # tokens. Treat commas and any whitespace as separators instead; str.split()
    # with no argument also drops empty tokens.
    pdb_ids = file.read().replace(',', ' ').split()

print(f"Loaded {len(pdb_ids)} PDB IDs.")


Loaded 222602 PDB IDs.


In [50]:
!pip install beautifulsoup4 lxml




In [51]:
import json

In [52]:
import requests
import pandas as pd

def fetch_publications(pdb_id, max_titles=2, timeout=30):
    """Return citation titles of the first UniProtKB entry that cross-references a PDB ID.

    Args:
        pdb_id: PDB identifier used in the ``xref:pdb-<id>`` UniProt query.
        max_titles: Maximum number of titles to return (the first ones found).
        timeout: Seconds to wait for the UniProt server before giving up.

    Returns:
        A list with at most ``max_titles`` title strings. The list is empty when
        the request fails, the response is not valid JSON, no entry matches, or
        the first entry has no titled citations.
    """
    uniprot_search_url = f"https://rest.uniprot.org/uniprotkb/search?query=(xref:pdb-{pdb_id})"
    try:
        # Without a timeout a stalled connection would block the whole scrape.
        response = requests.get(uniprot_search_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for PDB ID {pdb_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for PDB ID {pdb_id}")
        return []

    try:
        payload = response.json()
    except ValueError:
        print(f"Failed to fetch data for PDB ID {pdb_id}: response is not valid JSON")
        return []

    # "results" can be present but empty when no entry matches; indexing [0]
    # unconditionally would raise IndexError in that case.
    results = payload.get('results') or []
    if not results:
        return []

    titles = []
    for ref in results[0].get('references', []):
        citation = ref.get('citation', {})
        if 'title' in citation:
            titles.append(citation['title'])

    return titles[:max_titles]

In [53]:
!pip install biopython



In [54]:
from Bio import Entrez

def get_pubmed_abstracts(paper_titles):
    """Look up each title on PubMed and return the plain-text abstract record of the top hit.

    Args:
        paper_titles: Iterable of title strings; each is used verbatim as the
            PubMed search term.

    Returns:
        A list of stripped text records, one per title that produced at least
        one PubMed hit. Titles without a hit are skipped, so the list can be
        shorter than ``paper_titles``.
    """
    # NOTE(review): contact address required by NCBI is hard-coded here; consider
    # moving it to configuration before sharing the notebook.
    Entrez.email = 'm.zandieh7878@gmail.com'
    abstracts = []
    for title in paper_titles:
        # Only the best match is requested (retmax=1).
        search_handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
        try:
            record = Entrez.read(search_handle)
        finally:
            # Release the HTTP connection even when parsing fails.
            search_handle.close()

        id_list = record["IdList"]
        if not id_list:
            continue

        fetch_handle = Entrez.efetch(db="pubmed", id=id_list[0], rettype="abstract", retmode="text")
        try:
            abstracts.append(fetch_handle.read().strip())
        finally:
            fetch_handle.close()

    return abstracts


In [55]:
def process_text(text):
    """Strip header, author and identifier lines from a lower-cased text record.

    The function drops the first two lines, removes the paragraph that directly
    precedes an "author information:" marker together with the author-information
    block itself, keeps the paragraph that follows that block, and stops at the
    next blank line. Lines starting with "doi", "pmid" or "copyright" are never
    kept. Matching is case-sensitive against these lower-case markers, so the
    input is expected to be lower-cased already.

    Args:
        text: The record as a single newline-separated string.

    Returns:
        The retained lines joined with newlines.
    """
    lines = text.split('\n')

    # The first two lines are discarded unconditionally
    # (presumably the citation header and the blank line after it).
    processed_lines = lines[2:]

    inside_author_info = False
    inside_text = False
    final_lines = []
    for line in processed_lines:
        is_blank = line.strip() == ""

        if "author information:" in line:
            # Remove the separator line just before the marker, then the whole
            # paragraph above it (the author list). Both steps are guarded so a
            # marker that arrives before anything was kept no longer raises
            # IndexError on an empty list.
            if final_lines:
                final_lines.pop()
            while final_lines and final_lines[-1].strip() != "":
                final_lines.pop()
            inside_author_info = True

        if inside_author_info and is_blank:
            # Blank line ends the author-information block; body text follows.
            inside_author_info = False
            inside_text = True
            continue

        if inside_text and is_blank:
            # First blank line after the body paragraph: nothing more to keep.
            break

        if not inside_author_info and not line.startswith(("doi", "pmid", "copyright")):
            final_lines.append(line)

    return '\n'.join(final_lines)


In [56]:
def get_abst(pdb_id):
    """Return the cleaned, lower-cased abstract texts for the publications of a PDB ID.

    Titles are looked up with ``fetch_publications``, resolved to text records
    with ``get_pubmed_abstracts``, then lower-cased, passed through
    ``process_text`` and stripped of surrounding whitespace.
    """
    titles = fetch_publications(pdb_id)
    raw_records = get_pubmed_abstracts(titles)
    return [process_text(raw.lower()).strip() for raw in raw_records]

In [57]:
def fetch_uniprot_data(pdb_id, timeout=30):
    """Fetch accession, recommended name and function text of the first UniProtKB entry for a PDB ID.

    Args:
        pdb_id: PDB identifier used in the ``xref:pdb-<id>`` UniProt query.
        timeout: Seconds to wait for the UniProt server before giving up.

    Returns:
        ``(primary_accession, protein_name, function_text)`` when the first
        matching entry has both a recommended full name and a non-empty FUNCTION
        comment. Otherwise ``(None, None, None)``; this includes request
        failures, non-200 responses and unparsable responses.
    """
    nothing = (None, None, None)

    uniprot_search_url = f"https://rest.uniprot.org/uniprotkb/search?query=(xref:pdb-{pdb_id})&fields=protein_name,cc_function"
    try:
        # Without a timeout a stalled connection would block the whole scrape.
        response = requests.get(uniprot_search_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch UniProt data for PDB ID {pdb_id}: {exc}")
        return nothing

    if response.status_code != 200:
        return nothing

    try:
        payload = response.json()
    except ValueError:
        return nothing

    results = payload.get('results')
    if not results:
        return nothing
    entry = results[0]

    # Only the recommended full name is accepted as the protein name.
    protein_name = (
        entry.get('proteinDescription', {})
        .get('recommendedName', {})
        .get('fullName', {})
        .get('value')
    )
    if not protein_name:
        return nothing

    # Use the first text of the first FUNCTION comment, if there is one.
    protein_function = ""
    for comment in entry.get('comments', []):
        if comment.get('commentType') == "FUNCTION":
            texts = comment.get('texts') or [{}]
            protein_function = texts[0].get('value', "")
            break
    if not protein_function:
        return nothing

    return entry.get('primaryAccession'), protein_name, protein_function

In [58]:
!apt-get install dssp

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
dssp is already the newest version (4.0.4-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [59]:
!pip install biopython



In [60]:
import subprocess
from Bio.PDB import PDBParser, DSSP
def parse_dssp(file_path):
    """Read (amino acid, secondary structure) pairs from a DSSP output file.

    Everything up to and including the line that starts with
    ``"  #  RESIDUE AA STRUCTURE"`` is skipped. From each following record the
    character at index 13 is taken as the amino acid and the character at
    index 16 as the secondary-structure code; a blank code is reported as "C".

    Records too short to contain both columns (for example a trailing blank
    line) are skipped instead of raising IndexError. Records whose amino-acid
    column is "!" are also skipped: DSSP uses that marker for chain breaks, not
    for residues.

    Args:
        file_path: Path of the DSSP text file.

    Returns:
        A list of ``(aa, ss)`` one-character string tuples in file order.
    """
    secondary_structure = []
    in_residue_table = False
    with open(file_path, 'r') as f:
        # Stream the file instead of loading every line into memory first.
        for line in f:
            if line.startswith("  #  RESIDUE AA STRUCTURE"):
                in_residue_table = True
                continue
            if not in_residue_table:
                continue

            record = line.rstrip("\r\n")
            if len(record) < 17:
                continue

            aa = record[13]
            if aa == "!":
                continue

            ss = record[16]
            if ss == " ":
                ss = "C"
            secondary_structure.append((aa, ss))

    return secondary_structure

def fetch_secondary_structure1(pdb_id):
    """Download a PDB entry from RCSB and return its per-residue DSSP assignment.

    The file is saved as ``<pdb_id>.pdb`` in the current directory and is left
    there after a successful run.

    Args:
        pdb_id: PDB identifier; used for the download URL and the local file name.

    Returns:
        A list of ``(amino_acid, secondary_structure)`` tuples, one per key of
        the Biopython ``DSSP`` object built from the first model.

    Raises:
        FileNotFoundError: If the download fails.
    """
    pdb_file = f"{pdb_id}.pdb"

    # Call wget with an argument list rather than through a shell line, so the
    # identifier is never interpreted by a shell. "-O" pins the output name:
    # without it a repeated download is saved as "<id>.pdb.1" and the stale
    # original would be parsed instead.
    download = subprocess.run(
        ["wget", "-q", "-O", pdb_file, f"https://files.rcsb.org/download/{pdb_id}.pdb"],
        capture_output=True,
        text=True,
    )
    if download.returncode != 0:
        # wget -O can leave an empty file behind when the download fails.
        if os.path.exists(pdb_file):
            os.remove(pdb_file)
        raise FileNotFoundError(
            f"Could not download PDB entry {pdb_id} (wget exit code {download.returncode})"
        )

    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)

    # Only the first model of the structure is analysed.
    model = structure[0]
    dssp = DSSP(model, pdb_file)

    # Elements 1 and 2 of each DSSP record are the amino acid and the structure code.
    return [(dssp[key][1], dssp[key][2]) for key in dssp.keys()]


In [61]:
def fetch_secondary_structure(pdb_id):
    """Download a PDB entry, compute its DSSP assignment and clean up the file.

    Args:
        pdb_id: PDB identifier; used for the download URL and the temporary
            local file name ``<pdb_id>.pdb``.

    Returns:
        A list of ``(amino_acid, secondary_structure)`` tuples, one per key of
        the Biopython ``DSSP`` object built from the first model, or ``None``
        when the download or the processing fails. Every failure path returns
        the same single ``None`` so callers can store the result in one variable.
    """
    pdb_file = f"{pdb_id}.pdb"

    try:
        # "-O" pins the output name: without it wget saves a repeated download
        # as "<id>.pdb.1" and a stale file would be parsed instead.
        download = subprocess.run(
            ["wget", "-q", "-O", pdb_file, f"https://files.rcsb.org/download/{pdb_id}.pdb"],
            capture_output=True,
            text=True,
        )
        if download.returncode != 0:
            return None

        parser = PDBParser()
        structure = parser.get_structure(pdb_id, pdb_file)

        # Only the first model of the structure is analysed.
        model = structure[0]

        # DSSP() runs the DSSP executable on the PDB file itself, so no separate
        # mkdssp invocation (whose output file was never read) is needed.
        dssp = DSSP(model, pdb_file)

        # Elements 1 and 2 of each DSSP record are the amino acid and the structure code.
        return [(dssp[key][1], dssp[key][2]) for key in dssp.keys()]
    except Exception as e:
        print(f"Error processing PDB file for {pdb_id}: {e}")
        return None
    finally:
        # Also removes the empty file wget -O can leave behind on a failed download.
        if os.path.exists(pdb_file):
            os.remove(pdb_file)


In [62]:
import shutil

def run(start, end):
    """Scrape the PDB IDs in ``pdb_ids[start:end]`` and store them as one CSV chunk on Drive.

    For every ID the UniProt data is fetched first; IDs without a function text
    are skipped. For the others the secondary structure and the cleaned
    abstracts are collected. The rows are written to
    ``protein_data_<start>-<end>.csv`` in the working directory, and the file is
    then moved into the Drive dataset folder (created when missing).
    """
    column_names = ['PDB ID', 'Protein Name', 'UniProt ID', 'Functionality', 'Publications', 'Secondary Structures Q8']

    rows = []
    for position, pdb_id in enumerate(pdb_ids[start:end], start=1):
        # Progress within the chunk.
        print(position, '/', end - start)

        uniprot_id, protein_names, functions = fetch_uniprot_data(pdb_id)
        if functions:
            secondary_structures_q8 = fetch_secondary_structure(pdb_id)
            abstracts = get_abst(pdb_id)
            rows.append([
                pdb_id,
                protein_names,
                uniprot_id,
                functions,
                ' '.join(abstracts),
                secondary_structures_q8,
            ])

    filename = f'protein_data_{start}-{end}.csv'
    pd.DataFrame(rows, columns=column_names).to_csv(filename, index=False)

    print(f"Data has been saved to {filename}.")

    # Hand the finished chunk over to Drive.
    drive_dir = '/content/drive/My Drive/nlp_final_dataset'
    if not os.path.exists(drive_dir):
        os.makedirs(drive_dir)
    destination = os.path.join(drive_dir, filename)
    shutil.move(filename, destination)

In [63]:
import re

# Folder on Drive that receives one CSV per processed chunk.
drive_dir = '/content/drive/My Drive/nlp_final_dataset'

# Number of PDB IDs per chunk, and the index range [s, e) to process.
chunk_size = 100
s = 0
e = 45000

# On a first run the output folder does not exist yet, and listing it would
# raise FileNotFoundError before any chunk is processed.
os.makedirs(drive_dir, exist_ok=True)

# Resume after the highest index already covered by a saved chunk. Only files
# that follow the chunk naming scheme are considered, so unrelated files that
# happen to contain "<digits>-<digits>" cannot move the starting point.
for filename in os.listdir(drive_dir):
    match = re.search(r'protein_data_(\d+)-(\d+)', filename)
    if match:
        s = max(s, int(match.group(1)), int(match.group(2)))

# Never start a chunk beyond the end of the ID list.
for start in range(s, min(e, len(pdb_ids)), chunk_size):
    print('step number ',int(start/chunk_size)+1,' of ',int(e/chunk_size), ' started.')
    end = min(start + chunk_size, len(pdb_ids))
    run(start, end)

step number  1  of  45000  started.
1 / 1
Data has been saved to protein_data_0-1.csv.
step number  2  of  45000  started.
1 / 1
Data has been saved to protein_data_1-2.csv.
step number  3  of  45000  started.
1 / 1




KeyboardInterrupt: 