In [1]:
# Lesson 11: File I/O

import os
import glob # get a list of all files that match a pattern

In [2]:
# Always open files inside a `with` block (context management).
#
# On entry to the block, the name after `as` is bound to the open file,
# an instance of the _io.TextIOWrapper class.
# On exit from the block, the file is closed automatically,
# even if an error was raised inside the block. This is important:
# without `with`, an exception raised before your explicit close() call
# would leave the file open, and you could be in trouble.

with open('data/1OLG.pdb', mode='r') as pdb_file:
    file_type = type(pdb_file)
    print(file_type)

<class '_io.TextIOWrapper'>


In [3]:
# Load every line of the file into a list of strings
# (each entry keeps its trailing newline).
with open('data/1OLG.pdb', mode='r') as pdb_file:
    f_list = pdb_file.readlines()

# Show only the first ten entries of that list
f_list[0:10]

['HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n',
 'TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \n',
 'TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \n',
 'COMPND   3 CHAIN: A, B, C, D;                                                   \n',
 'COMPND   4 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \n',
 'SOURCE   3 ORGANISM_COMMON: HUMAN;                                              \n']

In [4]:
# Print the first ten lines of the file (fewer if the file is shorter).
with open('data/1OLG.pdb', 'r') as f:
    for _ in range(10):
        line = f.readline()
        # readline() returns an empty string only at end of file; stop there
        # instead of printing blank lines for a file with fewer than ten lines.
        if not line:
            break
        # Strip the trailing newline (and padding) since print() adds its own.
        print(line.rstrip())

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;


In [10]:
# Refuse to overwrite an existing file: check for it up front and fail loudly.
if os.path.isfile('mastery.txt'):
    raise RuntimeError('File mastery.txt already exists.')

# Mode 'x' (exclusive creation) backs up the check above: if the file appears
# between the check and the open, open() raises FileExistsError rather than
# truncating it. The explicit UTF-8 encoding ensures the non-ASCII 'φ' can be
# written regardless of the platform's default text encoding.
with open('mastery.txt', 'x', encoding='utf-8') as f:
    f.write('This is my file.\n')
    f.write('There are many like it, but this one is mine.\n')
    f.write('The golden ratio is φ = ')
    # write() only accepts strings, so numbers must be formatted first.
    f.write('{phi:.8f}'.format(phi=1.61803398875))

    
# Shell escape: print the file we just wrote to confirm its contents.
!cat mastery.txt

This is my file.
There are many like it, but this one is mine.
The golden ratio is φ = 1.61803399

In [11]:
# Exercise: pull out the atomic coordinates of the first chain in the tetramer.

# A single `with` statement can manage several files at once;
# separate the open() calls with commas.
with open('data/1OLG.pdb', 'r') as pdb_file, open('atoms_chain_A.txt', 'w') as chain_file:
    # Copy over only the ATOM records whose chain identifier (index 21) is 'A'.
    # Slicing [21:22] yields '' for short lines, so they are skipped safely.
    chain_file.writelines(
        record
        for record in pdb_file
        if record.startswith('ATOM') and record[21:22] == 'A'
    )

In [14]:
# Shell escape: show the first three lines of the chain A output file.
!head -3 atoms_chain_A.txt

ATOM      1  N   LYS A 319      18.634  25.437  10.685  1.00  4.81           N  
ATOM      2  CA  LYS A 319      17.984  25.295   9.354  1.00  4.32           C  
ATOM      3  C   LYS A 319      18.160  23.876   8.818  1.00  3.74           C  


In [15]:
# Shell escape: show the last three lines of the chain A output file.
!tail -3 atoms_chain_A.txt

ATOM    696  H   GLY A 360      -4.589  26.020  10.828  1.00  5.72           H  
ATOM    697  HA2 GLY A 360      -3.950  28.896  10.995  1.00  5.99           H  
ATOM    698  HA3 GLY A 360      -4.341  28.288   9.386  1.00  6.05           H  


In [16]:
# Finding files with glob

# Let’s say we want to pull the sequence of chain A out of each PDB file.

# glob.glob() returns matches in arbitrary (filesystem-dependent) order,
# so sort them to make the listing reproducible across machines.
file_list = sorted(glob.glob('data/*.pdb'))

file_list

['data/1FAG.pdb', 'data/1J6Z.pdb', 'data/1OLG.pdb', 'data/2ERK.pdb']

In [17]:
# Dictionary mapping PDB ID -> dash-joined chain A sequence
seqs = {}

# Loop through all matching files
for file_name in file_list:
    # The PDB ID is the file name without its directory or extension.
    # os.path handles nested directories, platform-specific separators and
    # names without a '.', which slicing on the first '/' and last '.' did not.
    pdb_id = os.path.splitext(os.path.basename(file_name))[0]

    # Collect the three-letter residue codes from chain A's SEQRES records
    residues = []
    with open(file_name, 'r') as f:
        for line in f:
            # Index 11 holds the chain identifier; residue codes start at index 19.
            # The length check guards against short lines before indexing.
            if len(line) > 11 and line[:6] == 'SEQRES' and line[11] == 'A':
                residues.extend(line[19:].split())

    # Build sequence with dash-joined three letter codes
    seq = '-'.join(residues)

    # Store in the dictionary
    seqs[pdb_id] = seq

In [18]:
# Let’s take a look at what we got. We’ll look at actin (PDB ID 1J6Z).
# As the last expression in the cell, the value is displayed as the cell output.
seqs['1J6Z']

'ASP-GLU-ASP-GLU-THR-THR-ALA-LEU-VAL-CYS-ASP-ASN-GLY-SER-GLY-LEU-VAL-LYS-ALA-GLY-PHE-ALA-GLY-ASP-ASP-ALA-PRO-ARG-ALA-VAL-PHE-PRO-SER-ILE-VAL-GLY-ARG-PRO-ARG-HIS-GLN-GLY-VAL-MET-VAL-GLY-MET-GLY-GLN-LYS-ASP-SER-TYR-VAL-GLY-ASP-GLU-ALA-GLN-SER-LYS-ARG-GLY-ILE-LEU-THR-LEU-LYS-TYR-PRO-ILE-GLU-HIC-GLY-ILE-ILE-THR-ASN-TRP-ASP-ASP-MET-GLU-LYS-ILE-TRP-HIS-HIS-THR-PHE-TYR-ASN-GLU-LEU-ARG-VAL-ALA-PRO-GLU-GLU-HIS-PRO-THR-LEU-LEU-THR-GLU-ALA-PRO-LEU-ASN-PRO-LYS-ALA-ASN-ARG-GLU-LYS-MET-THR-GLN-ILE-MET-PHE-GLU-THR-PHE-ASN-VAL-PRO-ALA-MET-TYR-VAL-ALA-ILE-GLN-ALA-VAL-LEU-SER-LEU-TYR-ALA-SER-GLY-ARG-THR-THR-GLY-ILE-VAL-LEU-ASP-SER-GLY-ASP-GLY-VAL-THR-HIS-ASN-VAL-PRO-ILE-TYR-GLU-GLY-TYR-ALA-LEU-PRO-HIS-ALA-ILE-MET-ARG-LEU-ASP-LEU-ALA-GLY-ARG-ASP-LEU-THR-ASP-TYR-LEU-MET-LYS-ILE-LEU-THR-GLU-ARG-GLY-TYR-SER-PHE-VAL-THR-THR-ALA-GLU-ARG-GLU-ILE-VAL-ARG-ASP-ILE-LYS-GLU-LYS-LEU-CYS-TYR-VAL-ALA-LEU-ASP-PHE-GLU-ASN-GLU-MET-ALA-THR-ALA-ALA-SER-SER-SER-SER-LEU-GLU-LYS-SER-TYR-GLU-LEU-PRO-ASP-GLY-GLN-VAL-ILE-THR-ILE

In [19]:
# Computing environment

# Record the interpreter and package versions used to produce this notebook.
# Requires the third-party `watermark` IPython extension to be installed.
%load_ext watermark
# -v: report the Python and IPython versions; -p: report the listed packages' versions
%watermark -v -p jupyterlab

CPython 3.8.2
IPython 7.16.1

jupyterlab 2.1.5
