In [1]:
%load_ext cython

In [2]:
import pandas as pd
from IPython.display import display

In [3]:
def print_full(x):
    """Display ``x`` without any row, column, or cell-width truncation.

    The display options are overridden only for the duration of the call.
    ``pd.option_context`` restores the caller's previous settings on exit,
    even if rendering raises, instead of resetting them to library defaults.

    Parameters
    ----------
    x : object
        Whatever should be rendered with ``display`` (typically a DataFrame
        or Series).
    """
    with pd.option_context(
        'display.max_rows', None,        # None = no row limit
        'display.max_columns', None,     # None = no column limit
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        # None means "do not truncate cell contents"; the old -1 sentinel
        # was deprecated in pandas 1.0 and is rejected by newer releases.
        'display.max_colwidth', None,
    ):
        display(x)


In [4]:
# Read the tab-separated source file. It has no header row, so the two
# columns are named explicitly right after loading.
df = pd.read_csv('data/bible.txt', sep='\t', header=None)
df.columns = ['book', 'text']
# df = df.head(100)  # uncomment to experiment on a small sample

In [5]:
# Normalise each verse to a bare letter stream: lower-case it and drop every
# character that is not a-z. regex=True is passed explicitly because newer
# pandas releases treat the pattern as a literal string by default, which
# would leave the text unstripped.
df['ptext'] = df['text'].str.lower().str.replace('[^a-z]', '', regex=True)
# Running total of stripped-verse lengths, i.e. the position in the
# concatenated letter stream at which each verse ends (exclusive).
df['ptextidx'] = df['ptext'].str.len().cumsum()
df.head()

Unnamed: 0,book,text,ptext,ptextidx
0,Genesis 1:1,In the beginning God created the heaven and th...,inthebeginninggodcreatedtheheavenandtheearth,44
1,Genesis 1:2,"And the earth was without form, and void; and ...",andtheearthwaswithoutformandvoidanddarknesswas...,154
2,Genesis 1:3,"And God said, Let there be light: and there wa...",andgodsaidlettherebelightandtherewaslight,195
3,Genesis 1:4,"And God saw the light, that it was good: and G...",andgodsawthelightthatitwasgoodandgoddividedthe...,261
4,Genesis 1:5,"And God called the light Day, and the darkness...",andgodcalledthelightdayandthedarknesshecalledn...,352


In [6]:
# Concatenate every stripped verse into one long letter stream. str.join is
# the idiomatic way to glue many strings together, and it yields '' (not the
# integer 0 that Series.sum() returns) when the column is empty.
all_text = ''.join(df['ptext'])

In [7]:
%%cython
def find(str phrase, str all_text, int max_n=30, int min_n=3):
    """Search ``all_text`` for ``phrase`` spelled out at a constant letter skip.

    Skips are tried in increasing order from ``min_n`` up to, but not
    including, ``max_n``. Skips below 1 are ignored because they do not
    describe a forward equidistant sequence.

    Parameters
    ----------
    phrase : str
        Letters to look for.
    all_text : str
        Text to search.
    max_n : int
        Exclusive upper bound on the skip.
    min_n : int
        Smallest skip to try.

    Returns
    -------
    tuple or None
        ``(start, skip)`` for the first hit, where
        ``all_text[start + skip*k] == phrase[k]`` for every ``k``;
        ``None`` when the phrase is empty or nothing matches.
    """
    cdef int n
    if not phrase:
        # Nothing to spell out; avoids indexing phrase[0] in the helper.
        return None
    for n in range(max(min_n, 1), max_n):
        found = do_find(phrase, all_text, n)
        if found is not None:
            return found
    return None

cdef do_find(str phrase, str all_text, int n):
    """Return ``(start, n)`` for the first occurrence of ``phrase`` in
    ``all_text`` with its letters exactly ``n`` positions apart, else ``None``.
    """
    cdef Py_ssize_t start, idx, start_range
    cdef Py_ssize_t phrase_len = len(phrase)
    if phrase_len == 0:
        return None
    # The last letter sits at start + n*(phrase_len - 1) and only has to be
    # inside the text. Subtracting n*phrase_len here (one skip too many)
    # would miss matches that finish within the final n characters.
    start_range = len(all_text) - n * (phrase_len - 1)
    for start in range(start_range):
        if all_text[start] != phrase[0]:
            continue
        for idx in range(1, phrase_len):
            if all_text[start + n * idx] != phrase[idx]:
                break
        else:
            return start, n
    return None

In [8]:
%%timeit -n3 -r3
# Benchmark on the first 10,000 letters only; min_n keeps its default.
find('secret', all_text[:10000], max_n=30)

2.36 ms ± 204 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [9]:
phrase = 'secret'
# find() returns None when no skip in the requested range spells the phrase,
# so check before unpacking to fail with a readable message instead of an
# opaque "cannot unpack NoneType" error.
match = find(phrase, all_text, max_n=100, min_n=10)
if match is None:
    raise ValueError('{!r} was not found at any skip in the searched range'.format(phrase))
offset, skip = match
offset, skip

(2551136, 14)

In [10]:
# Print one row of `skip` letters per phrase letter. Each row is shifted left
# by about half a skip so the matched letters line up near the middle column.
adjust = -skip // 2
for i in range(len(phrase)):
    row_start = offset + i * skip + adjust
    print(all_text[row_start:row_start + skip])

servantsthewed
dingisreadybut
theywhichwereb
iddenwerenotwo
rthygoyetheref
oreintothehigh


In [11]:
# Select the verses whose letters overlap the stretch of text holding the hit.
# ptextidx is each verse's cumulative end position, so subtracting the verse's
# own stripped length gives the position where it starts.
verse_end = df['ptextidx']
verse_start = verse_end - df['ptext'].str.len()
window_end = offset + skip * (1 + len(phrase))
section = df[(verse_end >= offset) & (verse_start <= window_end)]
print_full(section[['book', 'text']])

Unnamed: 0,book,text
23880,Matthew 22:8,"Then saith he to his servants, The wedding is ready, but they which were bidden were not worthy."
23881,Matthew 22:9,"Go ye therefore into the highways, and as many as ye shall find, bid to the marriage."
