In [1]:
import sys
import os
import numpy as np
from copy import deepcopy

sys.path.insert(0, os.path.abspath('./src/'))

from pdb_numpy import Coor, Model, format, abinitio

In [2]:
file_in = '../BeEM/example_input/3j6b.cif'
test = Coor(file_in)
test.write("tmp.pdb", overwrite=True)

EMDB EMD-2566 'Electron cryo-microscopy of yeast mitochondrial large ribosomal subunit' 'associated EM volume' 

['EMDB', 'EMD-2566', "'Electron cryo-microscopy of yeast mitochondrial large ribosomal subunit'", "'associated EM volume'"]
['EMDB', 'EMD-2566', "'Electron", 'cryo-microscopy', 'of', 'yeast', 'mitochondrial', 'large', 'ribosomal', "subunit'", "'associated", 'EM', "volume'"]
PDB  3J6B     .                                                                         'complete structure'   

['PDB', '3J6B', '.', "'complete structure'"]
['PDB', '3J6B', '.', "'complete", "structure'"]
PDB  1VW3     'yeast mitochondrial large ribosomal subunit split 1'                     split                  

['PDB', '1VW3', "'yeast mitochondrial large ribosomal subunit split 1'", 'split']
['PDB', '1VW3', "'yeast", 'mitochondrial', 'large', 'ribosomal', 'subunit', 'split', "1'", 'split']
PDB  1VW4     'yeast mitochondrial large ribosomal subunit split 2'                     split                  

In [3]:
line = "EMDB EMD-2566 'Electron cryo-microscopy of yeast mitochondrial large ribosomal subunit' 'associated EM volume' "

In [7]:
delimiters = ["'", " "]
line.split(delimiters)

TypeError: must be str or None, not list

In [77]:
def string_split(str: line):
    
    quote_in = False
    first_quote = ""
    tokens = []
    token = ""
    
    for letter in line:
        if quote_in:
            if letter != first_quote:
                token += letter
            else:
                tokens.append(token)
                token=""
                quote_in = False
        elif letter == "'" or letter == '"':
                first_quote = letter
                quote_in = True
        elif letter != " ":
            token += letter
        else:
            if len(token):
                tokens.append(token)
                token=""
                
    return tokens

In [70]:
line = "A   'RNA linking'       y \"ADENOSINE-5'-MONOPHOSPHATE\" ? 'C10 H14 N5 O7 P' 347.221 "
string_split(line)

quote_done RNA linking
quote_done ADENOSINE-5'-MONOPHOSPHATE
quote_done C10 H14 N5 O7 P


['A',
 'RNA linking',
 'y',
 "ADENOSINE-5'-MONOPHOSPHATE",
 '?',
 'C10 H14 N5 O7 P',
 '347.221']

In [138]:
%timeit string_split(line)

8.68 µs ± 148 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [139]:
%timeit string_split_cython(line)

7.76 µs ± 90.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [220]:
%timeit line.split()

402 ns ± 1.78 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [9]:
%load_ext Cython

In [42]:
%%cython -a

def string_split_cython(str line):
    
    cdef:
        bint quote_in = False
        list tokens = []
        char letter
        str token = ""
    
    for letter in line:
        if letter == b"'" or letter == b'"':
            print('YO:', letter)
        elif letter != b" ":
            token += letter

    return tokens

In [371]:
line = "A   'RNA linking'       y \"ADENOSINE-5'-MONOPHOSPHATE\" ? 'C10 H14 N5 O7 P' 347.221   \n"
string_split_cython(line)

TypeError: expected bytes, str found

In [156]:
%%cython -a

def string_split_cython(str line):
    
    cdef:
        int i
        int quote_in = False
        char first_quote 
        list tokens = []
        unicode token = u""
        char letter

    first_quote = ''

    for letter in line:
        if quote_in:
            if letter != first_quote:
                i = 0
                #token += str(letter)
            else:
                #tokens.append(token)
                token = ""
                quote_in = False
        elif letter == b"'" or letter == b'"':
                first_quote = letter
                quote_in = True
        elif letter != b" ":
            i=0
            #token += str(letter)
        else:
            if len(token):
                #tokens.append(token)
                #print(token)
                token=""
                
    return tokens


Error compiling Cython file:
------------------------------------------------------------
...
        char first_quote
        list tokens = []
        unicode token = u""
        char letter

    first_quote = ''
                  ^
------------------------------------------------------------

/home/murail/.cache/ipython/cython/_cython_magic_e85bb5845f6bd396c741c087dbf419773ad294bb.pyx:12:18: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).


In [226]:
%%cython -a
import cython
from libc.string cimport strcpy, strlen

@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.
def string_split_cython(char * line):
    
    cdef:
        size_t i = 0
        int j
        int quote_in = False
        char first_quote = 0, letter
        str token = ""
        list tokens = []

    for letter in line:
        if quote_in:
            if letter != first_quote:
                j = 0
                token += str(letter)
            else:
                tokens.append(token)
                token = ""
                quote_in = False
        elif letter == b"'" or letter == b'"':
                first_quote = letter
                quote_in = True
        elif letter != b" ":
            j = 0
            token += str(letter)
        else:
            if len(token):
                tokens.append(token)
                token=""

    return tokens


In [380]:
%%cython -a
import cython
from libc.string cimport strcpy, strlen

@cython.wraparound(False) # Deactivate negative indexing.
cdef bint is_space (char c):
    if c == b" ":
        return True
    else:
        return False

@cython.wraparound(False) # Deactivate negative indexing.
cdef bint is_quote (char c):
    if c == b"'" or c ==b'"':
        return True
    else:
        return False

@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing.
def string_split(str line_raw):

    tmp = line_raw.encode('utf-8')
        
    cdef:
        int i=0, j=0, k=0, count=0, str_len
        char first_quote
        list split_list = []
        char * line 

    line = tmp

    str_len = strlen(line)

    while (i<str_len) :
        
        while (i<str_len and is_space(line[i])):
            i += 1
                
        if is_quote(line[i]):
            first_quote = line[i]
            j = i
            i += 1
            while (i<str_len and line[i] != first_quote):
                i += 1
            i += 1
        else:
            j = i
            while (i<str_len and not is_space(line[i])):
                i += 1
        if i != j:
            split_list.append(line[j:i].decode('UTF-8'))
        i+=1
    return split_list
    

In [381]:
line

'A   \'RNA linking\'       y "ADENOSINE-5\'-MONOPHOSPHATE" ? \'C10 H14 N5 O7 P\' 347.221   \n'

In [385]:
line_unicode = line.encode('utf-8')
tmp = string_split(line[:-1])
tmp

['A',
 "'RNA linking'",
 'y',
 '"ADENOSINE-5\'-MONOPHOSPHATE"',
 '?',
 "'C10 H14 N5 O7 P'",
 '347.221']

In [386]:
%timeit string_split(line)

399 ns ± 7.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [370]:
%timeit line_unicode.split()

356 ns ± 3.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [328]:
%timeit string_split_cython_2(line_unicode)

269 ns ± 2.82 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [129]:
tmp

['65',
 '82786532108105110107105110103',
 '121',
 '6568697879837378694553394577797879807279838072658469',
 '63',
 '674948327249523278533279553280',
 '51525546505049']

In [121]:
tmp[0].encode('UTF-8')

b'65'

In [130]:
string_split(line)

['A',
 'RNA linking',
 'y',
 "ADENOSINE-5'-MONOPHOSPHATE",
 '?',
 'C10 H14 N5 O7 P',
 '347.221']