In [1]:
import requests
import re
from nltk.corpus import stopwords
import random
import pandas as pd
from nltk.stem import *
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def get_book(url):
    book = requests.get(url).content.decode("utf-8")
    return book

In [3]:
def remove_punct(book):
    # Remove Punctuation using regex
    return re.sub(r'[^\w\s]','', book)

In [19]:
def remove_numbers(book):
    # Remove Punctuation using regex
    return re.sub('[^A-Za-z]+', ' ', book)

In [4]:
def remove_stopwords(book):
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    return pattern.sub('', book.lower())

In [5]:
def split_book(book):
    # Split book to 100 words
    book_words = book.split()
    n = 100
    parts = [(book_words[i*n : (i+1)*n]) for i in range((len(book_words)+ n-1)//n)]
    
    # Split book to paragraphs
    book_list = []
    for part in parts:
        book_list.append([" ".join(part)])
        
    # Select 200 random paragraphs from the book
    random_200 = random.sample(book_list, 200)
    
    return random_200

In [6]:
def parts_into_df(parts_list, label):
    df = pd.DataFrame(parts_list, columns=['Paragraphs'])
    # add label
    df['Label'] = label
    # add index
    df['Index'] = range(1, len(df)+1)
    
    return df

### The 5 books (Adventure Genre) https://www.gutenberg.org/ebooks/bookshelf/82
- The Three Musketeers by AlexandreDumas
    - https://www.gutenberg.org/ebooks/1257
    - https://www.gutenberg.org/ebooks/1257.txt.utf-8    
- Tarzan of the Apes by Edgar Rice Burroughs  
    - https://www.gutenberg.org/ebooks/78 
    - https://www.gutenberg.org/files/78/78-0.txt    
- The Thirty-Nine Steps by John Buchan  
    - https://www.gutenberg.org/ebooks/558 
    - https://www.gutenberg.org/files/558/558-0.txt    
- The Prisoner of Zenda by Anthony Hope 
    - https://www.gutenberg.org/ebooks/95 
    - https://www.gutenberg.org/files/95/95-0.txt    
- Captain Blood by Rafael Sabatini 
    - https://www.gutenberg.org/ebooks/1965 
    - https://www.gutenberg.org/files/1965/1965-0.txt

In [7]:
urls = ['https://www.gutenberg.org/ebooks/1257.txt.utf-8', 
 'https://www.gutenberg.org/files/78/78-0.txt', 
 'https://www.gutenberg.org/files/558/558-0.txt', 
 'https://www.gutenberg.org/files/95/95-0.txt', 
 'https://www.gutenberg.org/files/1965/1965-0.txt']

lables = ['a', 'b', 'c', 'd', 'e']

url_label = list(zip(urls, lables))
[(url, label) for url, label in url_label]

[('https://www.gutenberg.org/ebooks/1257.txt.utf-8', 'a'),
 ('https://www.gutenberg.org/files/78/78-0.txt', 'b'),
 ('https://www.gutenberg.org/files/558/558-0.txt', 'c'),
 ('https://www.gutenberg.org/files/95/95-0.txt', 'd'),
 ('https://www.gutenberg.org/files/1965/1965-0.txt', 'e')]

In [20]:
remove_numbers(remove_stopwords(remove_punct(get_book(urls[0]))))



In [22]:
books_dict={}
books_dict={"book"+str(index+1):parts_into_df(split_book(remove_stopwords(remove_numbers(remove_punct(get_book(url))))), label= label) 
        for index,(url, label) in enumerate(url_label)} 

In [23]:
books_dict["book4"].head(3)

Unnamed: 0,Paragraphs,Label,Index
0,long could without risk soon door danger force...,d,1
1,strelsau shall caught like rats trap stay sapt...,d,2
2,said fritz perhaps theyll cut mine suggested n...,d,3


In [24]:
# concatenate all dataframe books
books_df = pd.concat([books_dict[book] for book in books_dict])

In [25]:
books_df.head(3)

Unnamed: 0,Paragraphs,Label,Index
0,mme bonacieux knocked shutter three light regu...,a,1
1,dear dartagnan counsel give always lose seemed...,a,2
2,knob door noise de tr villes entrance turned r...,a,3


In [26]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 199
Data columns (total 3 columns):
Paragraphs    1000 non-null object
Label         1000 non-null object
Index         1000 non-null int32
dtypes: int32(1), object(2)
memory usage: 19.5+ KB


In [27]:
books_df.Label.unique()

array(['a', 'b', 'c', 'd', 'e'], dtype=object)

In [28]:
# save to csv file
books_df.to_csv('books_df.csv', index=False)

In [29]:
books_df = pd.read_csv('books_df.csv')
books_df.tail(3)

Unnamed: 0,Paragraphs,Label,Index
997,aside imprecation stepping forward tore palmet...,e,198
998,increase rancour beg observe brought entirely ...,e,199
999,wings exclusion world less fortuitous liberty ...,e,200
