In [1]:
import pandas as pd
import numpy as np
from glob import glob #read directory system to import files
import re
import nltk

%matplotlib inline

In [2]:
# Index levels of the token table, ordered from the coarsest unit to the finest.
OHCO = [
    'book_id',
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Directory that is searched for the source text files.
epub_dir = 'epubs'

In [3]:
# Regex fragments for heading patterns. Raw strings are used throughout so that
# regex escapes (\s, \d, \.) are never parsed as Python string escapes.
roman = r'[IVXLCM]+'
caps = r"[A-Z';, -]+"
letters = r'[A-Za-z*]+'

# "BOOK <roman numeral>." heading; every entry below uses the same pattern.
book_heading = re.compile(r'^\s*BOOK\s+{}\.$'.format(roman))

# Per-book parsing settings, keyed by the integer book id that acquire_epubs
# extracts from the file name. acquire_epubs keeps rows
# `iloc[start_line - 1 : end_line + 1]` of the file and numbers chapters by
# matching each remaining line against 'chapter'.
chap_pats = {
    121: {
        'start_line': 23,
        'end_line': 8019,
        'book': book_heading,
        'chapter': re.compile(r"^CHAPTER\s+\d+$"),
    },
    42671: {
        'start_line': 25,
        'end_line': 13356,
        'book': book_heading,
        'chapter': re.compile(r'^\s*CHAPTER\s+{}\.$'.format(roman)),
    },
    141: {
        'start_line': 23,
        'end_line': 15702,
        'book': book_heading,
        'chapter': re.compile(r'^\s*CHAPTER\s+{}$'.format(roman)),
    },
}

In [4]:
def acquire_epubs(epub_list, chap_pats, OHCO=OHCO):
    """Parse plain-text book files into a library table and a paragraph table.

    Parameters
    ----------
    epub_list : list of str
        Paths of the text files to read. The integer book id is taken from the
        part of the path after the last '-', up to the first '.', with 'pg'
        removed (e.g. '...-pg141.txt' -> 141).
    chap_pats : dict
        Maps book id -> dict providing 'start_line', 'end_line' and a
        'chapter' regex used to find chapter heading lines.
    OHCO : list of str
        Index level names; OHCO[:3] become the index of the paragraph table.

    Returns
    -------
    library : pandas.DataFrame
        Indexed by book_id, with columns book_title and book_file.
    docs : pandas.DataFrame
        Indexed by OHCO[:3], with a single column para_str.

    Raises
    ------
    KeyError
        If a book id has no entry in `chap_pats`.
    ValueError
        From pandas.concat when `epub_list` is empty.

    Notes
    -----
    Prints "BOOK ID <id>" for every file processed.
    """
    my_lib = []
    my_doc = []

    for epub_file in epub_list:

        # Get PG ID from filename
        book_id = int(epub_file.split('-')[-1].split('.')[0].replace('pg', ''))
        print("BOOK ID", book_id)
        book_pats = chap_pats[book_id]

        # Import file as lines; the context manager closes the handle promptly
        with open(epub_file, 'r', encoding='utf-8-sig') as epub:
            lines = epub.readlines()
        df = pd.DataFrame(lines, columns=['line_str'])
        df.index.name = 'line_num'
        df['line_str'] = df['line_str'].str.strip()

        # Pad em dashes with spaces so whitespace tokenization splits on them
        df['line_str'] = df['line_str'].str.replace('—', ' — ', regex=False)

        # Book title comes from the first line of the file, minus the boilerplate prefix
        book_title = re.sub(r"The Project Gutenberg eBook( of|,) ", "", df.loc[0].line_str, flags=re.IGNORECASE)
        book_title = re.sub(r"Project Gutenberg's ", "", book_title, flags=re.IGNORECASE)

        # Remove cruft; copy so later column assignments do not target a slice view
        a = book_pats['start_line'] - 1
        b = book_pats['end_line'] + 1
        df = df.iloc[a:b].copy()

        # Chunk by chapter: the running count of heading lines is the chapter number,
        # and it stays 0 for every line that precedes the first heading
        is_heading = df['line_str'].str.match(book_pats['chapter'])
        df['chap_num'] = is_heading.cumsum()

        # Clean up
        df = df[df['chap_num'] > 0]   # Remove everything before the first chapter heading
        df = df[~is_heading[df.index]]  # Remove the chapter heading lines themselves

        # Group -- Note that we exclude the book level in the OHCO at this point
        df = df.groupby(OHCO[1:2])['line_str'].apply(lambda chap: '\n'.join(chap)).to_frame()  # Make big string

        # Split into paragraphs on runs of blank lines. dropna() removes the padding
        # that expand=True adds for chapters with fewer paragraphs, in case stack()
        # does not drop it itself.
        df = (
            df['line_str'].str.split(r'\n\n+', expand=True)
            .stack()
            .dropna()
            .to_frame()
            .rename(columns={0: 'para_str'})
        )
        df.index.names = OHCO[1:3]

        # Join wrapped lines. The newline is replaced literally: a regex-style r'\n'
        # would match nothing on pandas versions where str.replace defaults to regex=False.
        df['para_str'] = df['para_str'].str.replace('\n', ' ', regex=False).str.strip()
        df = df[~df['para_str'].str.match(r'^\s*$')]  # Remove empty paragraphs

        # Set index
        df = df.assign(book_id=book_id).reset_index().set_index(OHCO[:3])

        # Register
        my_lib.append((book_id, book_title, epub_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['book_id', 'book_title', 'book_file']).set_index('book_id')
    return library, docs

In [5]:
# Gather the text files in alphabetical order, then parse them all in one pass.
epubs = sorted(glob(epub_dir + '/*.txt'))
LIB, DOC = acquire_epubs(epubs, chap_pats)

BOOK ID 141
BOOK ID 121
BOOK ID 42671


In [20]:
# Inspect the paragraph table (the display is truncated to its first and last rows).
# NOTE(review): in the recorded output the first paragraph of book 141 appears under
# chap_num 49, and book 42671 ends with "Transcriber's note:" paragraphs. This suggests
# the chapter pattern also matches lines before the real first chapter and that
# end_line includes back matter -- confirm against the source files.
DOC

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book_id,chap_num,para_num,Unnamed: 3_level_1
141,49,1,"About thirty years ago Miss Maria Ward, of Hun..."
141,49,2,"Their homes were so distant, and the circles i..."
141,49,3,The letter was not unproductive. It re-establi...
141,49,4,"Such were its immediate effects, and within a ..."
141,49,5,Sir Thomas could not give so instantaneous and...
...,...,...,...
42671,61,15,"With the Gardiners, they were always on the mo..."
42671,61,16,* * * * *
42671,61,17,Transcriber's note:
42671,61,18,Spelling and hyphen changes have been made so ...


In [6]:
# Inspect the library table: one row per book id with its title and source file.
# NOTE(review): the recorded titles still carry ", by Jane Austen" (and ", Edited" for
# 42671), so the title clean-up in acquire_epubs only removes the Gutenberg prefix.
LIB

Unnamed: 0_level_0,book_title,book_file
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
141,"Mansfield Park, by Jane Austen",epubs/AUSTEN_JANE_MANSFIELD_PARK-pg141.txt
121,"Northanger Abbey, by Jane Austen",epubs/AUSTEN_JANE_NORTHANGER_ABBEY-pg121.txt
42671,"Pride and Prejudice, by Jane Austen, Edited",epubs/AUSTEN_JANE_PRIDE_AND_PREJUDICE-pg42671.txt


Tokenize

In [7]:
def tokenize(doc_df, remove_pos_tuple=False, OHCO=OHCO):
    """Split paragraphs into sentences and POS-tagged, whitespace-delimited tokens.

    Parameters
    ----------
    doc_df : pandas.DataFrame
        Must have a `para_str` column. Its index levels plus the two levels
        added here (sentence and token position) must number len(OHCO).
    remove_pos_tuple : bool, default False
        When True, the raw `(token, tag)` column is dropped from the result.
    OHCO : list of str
        Names given to the index levels of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per token, indexed by OHCO, with columns `pos_tuple`
        (unless removed), `pos` and `token_str`. Sentence and token numbers
        start at 0.
    """
    # One tokenizer instance is enough; there is no need to build one per sentence.
    word_tokenizer = nltk.WhitespaceTokenizer()

    # Collect rows directly instead of expanding every paragraph/sentence into a
    # wide, NaN-padded frame and stacking it back down.
    index_rows = []
    pos_tuples = []
    for doc_key, para_str in doc_df.para_str.items():
        if not isinstance(doc_key, tuple):
            doc_key = (doc_key,)  # single-level index: wrap the label

        # Paragraphs to Sentences
        for sent_num, sent_str in enumerate(nltk.sent_tokenize(para_str)):

            # Sentences to Tokens
            tagged = nltk.pos_tag(word_tokenizer.tokenize(sent_str))
            for token_num, pos_tuple in enumerate(tagged):
                index_rows.append(doc_key + (sent_num, token_num))
                pos_tuples.append(pos_tuple)

    # Add index and grab info from tuple
    df = pd.DataFrame(
        {
            'pos_tuple': pos_tuples,
            'pos': [tag for _, tag in pos_tuples],
            'token_str': [token for token, _ in pos_tuples],
        },
        index=pd.MultiIndex.from_tuples(index_rows, names=OHCO),
        columns=['pos_tuple', 'pos', 'token_str'],
    )

    if remove_pos_tuple:
        # Keyword form: a positional axis argument is rejected by pandas >= 2.0.
        df = df.drop(columns='pos_tuple')

    return df

In [8]:
# Build the token table from every paragraph in DOC (the raw pos_tuple column is kept).
TOKEN = tokenize(DOC)

In [9]:
# Spot-check a random selection of tokens; the fixed seed makes the sample
# identical on every Restart & Run All.
TOKEN.sample(25, random_state=42)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
42671,54,29,0,10,"(to, TO)",TO,to
42671,44,10,0,43,"(or, CC)",CC,or
42671,54,27,0,20,"(be, VB)",VB,be
42671,18,70,0,22,"(her, PRP$)",PRP$,her
121,54,2,7,37,"(different, JJ)",JJ,different
141,95,30,10,4,"(astonished, JJ)",JJ,astonished
141,82,25,1,1,"(myself,, NN)",NN,"myself,"
42671,42,3,3,2,"(consequently, RB)",RB,consequently
141,66,4,1,45,"(all, DT)",DT,all
141,91,4,4,27,"(the, DT)",DT,the


In [10]:
# Normalize tokens into terms: lower-case, then strip every character that is not a
# letter or digit. regex=True is explicit because pandas >= 2.0 would otherwise treat
# the pattern as a literal string and leave the tokens unchanged.
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

# One row per distinct term, in alphabetical order, with its frequency `n`.
# The series and its index are named explicitly so the resulting column names do not
# depend on how the installed pandas version labels value_counts() output.
VOCAB = (
    TOKEN['term_str']
    .value_counts()
    .rename('n')
    .rename_axis('term_str')
    .sort_index()
    .reset_index()
)
VOCAB.index.name = 'term_id'

# Flag terms that start with a digit (str.match anchors only at the beginning of the
# string, so a term such as '14th' is flagged too).
VOCAB['num'] = VOCAB.term_str.str.match(r"\d+").astype('int')

In [11]:
# Inspect the vocabulary table (the display is truncated to its first and last rows).
# NOTE(review): the recorded output has an empty term_str at term_id 0 with n = 922,
# presumably tokens made only of punctuation -- confirm whether it should be dropped.
VOCAB

Unnamed: 0_level_0,term_str,n,num
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,922,0
1,1,1,1
2,10,3,1
3,14th,2,1
4,15th,1,1
...,...,...,...
11878,youths,1,0
11879,youwill,1,0
11880,zeal,5,0
11881,zealous,1,0


In [12]:
# Stop-word lookup table: indexed by term_str, with a constant flag column `dummy` = 1.
stop_terms = pd.Index(nltk.corpus.stopwords.words('english'), name='term_str')
sw = pd.DataFrame({'dummy': 1}, index=stop_terms)

In [13]:
# Spot-check the stop-word table; seeded so the displayed rows are reproducible.
sw.sample(10, random_state=42)

Unnamed: 0_level_0,dummy
term_str,Unnamed: 1_level_1
then,1
mightn't,1
be,1
haven't,1
if,1
isn't,1
that,1
she,1
this,1
the,1


In [14]:
# Flag stop words: look each term up in the stop-word table (1 when present),
# then turn the misses into 0 and store the flag as an integer.
VOCAB['stop'] = (
    VOCAB.term_str
    .map(sw.dummy)
    .fillna(0)
    .astype('int')
)

In [15]:
# Spot-check terms flagged as stop words; seeded so the displayed rows are reproducible.
VOCAB[VOCAB.stop == 1].sample(10, random_state=42)

Unnamed: 0_level_0,term_str,n,num,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9759,some,639,0,1
4791,had,3521,0,1
4928,having,341,0,1
11616,while,229,0,1
11211,until,1,0,1
4450,further,29,0,1
10490,than,880,0,1
11516,we,749,0,1
5035,herself,662,0,1
11589,were,1543,0,1


Porter Stems

In [16]:
from nltk.stem.porter import PorterStemmer

# Add the Porter stem of every vocabulary term.
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)

# Spot-check the stems; seeded so the displayed rows are reproducible.
VOCAB.sample(10, random_state=42)

Unnamed: 0_level_0,term_str,n,num,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11447,wan,1,0,0,wan
4121,finely,2,0,0,fine
6198,librarytheir,1,0,0,librarytheir
3631,envyings,1,0,0,envi
11181,unsettled,8,0,0,unsettl
10642,tis,10,0,0,ti
179,acutest,1,0,0,acutest
10613,tidings,9,0,0,tide
11505,watsons,1,0,0,watson
8292,questionof,1,0,0,questionof


In [17]:
# Inspect the token table, which now also carries the normalized term_str column.
# NOTE(review): the last recorded rows for book 42671 are tokens such as
# "GUTENBERG" and "EBOOK", which suggests end-of-file boilerplate is inside the
# configured line range for that book -- confirm against the source file.
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
141,49,1,0,0,"(About, IN)",IN,About,about
141,49,1,0,1,"(thirty, CD)",CD,thirty,thirty
141,49,1,0,2,"(years, NNS)",NNS,years,years
141,49,1,0,3,"(ago, RB)",RB,ago,ago
141,49,1,0,4,"(Miss, NNP)",NNP,Miss,miss
...,...,...,...,...,...,...,...,...
42671,61,19,0,4,"(GUTENBERG, NNP)",NNP,GUTENBERG,gutenberg
42671,61,19,0,5,"(EBOOK, NNP)",NNP,EBOOK,ebook
42671,61,19,0,6,"(PRIDE, NNP)",NNP,PRIDE,pride
42671,61,19,0,7,"(AND, NNP)",NNP,AND,and


In [18]:
# For every term, find the POS tag it is assigned most often.
# count() tallies rows per (term_str, pos) pair. The stable sort keeps pairs with
# equal counts in group-key order, so a tie always resolves to the same tag instead
# of depending on the sort algorithm's arbitrary ordering.
pos_max = (
    TOKEN.groupby(['term_str', 'pos']).count()
    .sort_values('token_str', ascending=False, kind='mergesort')
    .groupby(level=0).head(1)   # most frequent pair per term
    .reset_index()
    .set_index('term_str')
)
VOCAB['pos_max'] = VOCAB.term_str.map(pos_max.pos)
VOCAB

Unnamed: 0_level_0,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,922,0,0,,NN
1,1,1,1,0,1,JJ
2,10,3,1,0,10,NN
3,14th,2,1,0,14th,CD
4,15th,1,1,0,15th,CD
...,...,...,...,...,...,...
11878,youths,1,0,0,youth,NNS
11879,youwill,1,0,0,youwil,VB
11880,zeal,5,0,0,zeal,NN
11881,zealous,1,0,0,zealou,JJ


Save CSVs

In [19]:
# Write each table to a CSV file in the working directory, in the same order as before.
for table, csv_path in (
    (DOC, 'DOC.csv'),
    (LIB, 'LIB.csv'),
    (VOCAB, 'VOCAB.csv'),
    (TOKEN, 'TOKEN.csv'),
):
    table.to_csv(csv_path)