In [12]:
import time
from nltk.corpus import stopwords
from nltk.tokenize import regexp
from nltk.stem import PorterStemmer
from IPython.display import Markdown, display

stop_words = set(stopwords.words('english'))

def preprocess(text, stem=False):
    start = time.perf_counter()

    ps = PorterStemmer()
    word_tokens = regexp.regexp_tokenize(text.lower(), r'\W+', gaps=True)

    result = ' '.join([ps.stem(w) if stem else w for w in word_tokens if not w in stop_words])

    stop = time.perf_counter()
    return result, stop-start

In [13]:
result, time_elapsed = preprocess('This is a sample sentence, showing off the stop words filtration.', stem=False)
result_stemmed, time_elapsed = preprocess('This is a sample sentence, showing off the stop words filtration.', stem=True)

print(result)
print(result_stemmed)

sample sentence showing stop words filtration
sampl sentenc show stop word filtrat


In [15]:
import os

files = os.listdir('data/gutenberg')
files_count = len(files)

print(files_count)

for i, file_name in enumerate(files):
    
    if '.txt' not in file_name:
        continue
    
    with open(f'data/gutenberg/{file_name}', 'r', encoding= 'utf-8', errors='ignore') as file:
        original_text = file.read()

        preprocessed_text, time_elapsed = preprocess(original_text)
        stemmed_text, time_elapsed_stemming = preprocess(original_text, stem=True)

        file_without_extension = os.path.splitext(file_name)[0]

        with open(f'preprocessed/gutenberg/{file_without_extension}.txt', 'w', encoding= 'utf-8', errors='ignore') as f:
            f.write(preprocessed_text)
        
        with open(f'stemmed/gutenberg/{file_without_extension}.txt', 'w', encoding= 'utf-8', errors='ignore') as f:
            f.write(stemmed_text)

        display(Markdown((f'**{i+1}/{files_count} file processed** - time elapsed {time_elapsed:0.4f} seconds (with stemming {time_elapsed_stemming:0.4f} seconds)')))

90


**2/90 file processed** - time elapsed 0.0129 seconds (with stemming 0.2776 seconds)

**3/90 file processed** - time elapsed 0.4760 seconds (with stemming 9.1965 seconds)

**4/90 file processed** - time elapsed 0.5631 seconds (with stemming 12.6457 seconds)

**5/90 file processed** - time elapsed 0.0300 seconds (with stemming 0.7452 seconds)

**6/90 file processed** - time elapsed 0.0389 seconds (with stemming 0.8835 seconds)

**7/90 file processed** - time elapsed 0.0994 seconds (with stemming 2.2820 seconds)

**8/90 file processed** - time elapsed 0.0972 seconds (with stemming 1.8732 seconds)

**9/90 file processed** - time elapsed 0.0219 seconds (with stemming 0.4978 seconds)

**10/90 file processed** - time elapsed 0.1245 seconds (with stemming 2.4143 seconds)

**11/90 file processed** - time elapsed 0.0659 seconds (with stemming 1.4428 seconds)

**12/90 file processed** - time elapsed 0.0613 seconds (with stemming 1.3587 seconds)

**13/90 file processed** - time elapsed 0.0028 seconds (with stemming 0.0572 seconds)

**14/90 file processed** - time elapsed 0.1246 seconds (with stemming 2.7232 seconds)

**15/90 file processed** - time elapsed 0.0236 seconds (with stemming 0.5668 seconds)

**16/90 file processed** - time elapsed 0.0521 seconds (with stemming 1.2523 seconds)

**17/90 file processed** - time elapsed 0.0617 seconds (with stemming 1.3331 seconds)

**18/90 file processed** - time elapsed 0.1225 seconds (with stemming 2.2696 seconds)

**19/90 file processed** - time elapsed 0.1119 seconds (with stemming 2.2740 seconds)

**20/90 file processed** - time elapsed 0.0960 seconds (with stemming 2.3920 seconds)

**21/90 file processed** - time elapsed 0.0105 seconds (with stemming 0.2142 seconds)

**22/90 file processed** - time elapsed 0.0567 seconds (with stemming 1.1264 seconds)

**23/90 file processed** - time elapsed 0.0493 seconds (with stemming 0.9857 seconds)

**24/90 file processed** - time elapsed 0.0918 seconds (with stemming 2.0761 seconds)

**25/90 file processed** - time elapsed 0.0323 seconds (with stemming 0.7752 seconds)

**26/90 file processed** - time elapsed 0.4511 seconds (with stemming 10.1545 seconds)

**27/90 file processed** - time elapsed 0.0323 seconds (with stemming 0.5292 seconds)

**28/90 file processed** - time elapsed 0.2366 seconds (with stemming 2.6875 seconds)

**29/90 file processed** - time elapsed 0.1169 seconds (with stemming 2.4355 seconds)

**30/90 file processed** - time elapsed 0.0706 seconds (with stemming 1.5559 seconds)

**31/90 file processed** - time elapsed 0.0871 seconds (with stemming 1.9717 seconds)

**32/90 file processed** - time elapsed 0.1038 seconds (with stemming 2.2235 seconds)

**33/90 file processed** - time elapsed 0.2336 seconds (with stemming 5.4062 seconds)

**34/90 file processed** - time elapsed 0.0171 seconds (with stemming 0.4183 seconds)

**35/90 file processed** - time elapsed 0.0507 seconds (with stemming 0.7665 seconds)

**36/90 file processed** - time elapsed 0.1957 seconds (with stemming 4.0445 seconds)

**37/90 file processed** - time elapsed 0.1163 seconds (with stemming 2.5273 seconds)

**38/90 file processed** - time elapsed 0.1496 seconds (with stemming 3.0616 seconds)

**39/90 file processed** - time elapsed 0.1324 seconds (with stemming 2.7095 seconds)

**40/90 file processed** - time elapsed 0.0380 seconds (with stemming 0.8536 seconds)

**41/90 file processed** - time elapsed 0.0547 seconds (with stemming 1.2361 seconds)

**42/90 file processed** - time elapsed 0.1411 seconds (with stemming 2.2779 seconds)

**43/90 file processed** - time elapsed 0.0248 seconds (with stemming 0.6772 seconds)

**44/90 file processed** - time elapsed 0.0809 seconds (with stemming 2.1769 seconds)

**45/90 file processed** - time elapsed 0.0453 seconds (with stemming 1.1152 seconds)

**46/90 file processed** - time elapsed 0.0370 seconds (with stemming 0.8229 seconds)

**47/90 file processed** - time elapsed 0.2065 seconds (with stemming 4.3872 seconds)

**48/90 file processed** - time elapsed 0.0462 seconds (with stemming 0.8242 seconds)

**49/90 file processed** - time elapsed 0.0615 seconds (with stemming 1.4941 seconds)

**50/90 file processed** - time elapsed 0.1026 seconds (with stemming 2.5265 seconds)

**51/90 file processed** - time elapsed 0.1275 seconds (with stemming 2.6859 seconds)

**52/90 file processed** - time elapsed 0.0679 seconds (with stemming 1.7399 seconds)

**53/90 file processed** - time elapsed 0.0599 seconds (with stemming 1.3032 seconds)

**54/90 file processed** - time elapsed 0.1621 seconds (with stemming 3.3843 seconds)

**55/90 file processed** - time elapsed 0.0283 seconds (with stemming 0.6754 seconds)

**56/90 file processed** - time elapsed 0.0649 seconds (with stemming 1.9908 seconds)

**57/90 file processed** - time elapsed 0.0375 seconds (with stemming 1.0171 seconds)

**58/90 file processed** - time elapsed 0.2205 seconds (with stemming 6.1782 seconds)

**59/90 file processed** - time elapsed 0.0328 seconds (with stemming 0.8299 seconds)

**60/90 file processed** - time elapsed 0.0464 seconds (with stemming 1.1795 seconds)

**61/90 file processed** - time elapsed 0.0665 seconds (with stemming 1.5605 seconds)

**62/90 file processed** - time elapsed 0.1119 seconds (with stemming 2.5599 seconds)

**63/90 file processed** - time elapsed 0.0934 seconds (with stemming 1.2718 seconds)

**64/90 file processed** - time elapsed 0.0525 seconds (with stemming 0.8306 seconds)

**65/90 file processed** - time elapsed 0.0559 seconds (with stemming 1.3115 seconds)

**66/90 file processed** - time elapsed 0.0124 seconds (with stemming 0.3330 seconds)

**67/90 file processed** - time elapsed 0.0207 seconds (with stemming 0.6228 seconds)

**68/90 file processed** - time elapsed 0.1807 seconds (with stemming 2.6636 seconds)

**69/90 file processed** - time elapsed 0.0770 seconds (with stemming 2.3736 seconds)

**70/90 file processed** - time elapsed 0.0256 seconds (with stemming 0.6758 seconds)

**71/90 file processed** - time elapsed 0.0807 seconds (with stemming 1.6011 seconds)

**72/90 file processed** - time elapsed 0.0563 seconds (with stemming 1.3966 seconds)

**73/90 file processed** - time elapsed 0.0306 seconds (with stemming 0.8065 seconds)

**74/90 file processed** - time elapsed 0.0673 seconds (with stemming 2.0167 seconds)

**75/90 file processed** - time elapsed 0.0883 seconds (with stemming 1.7815 seconds)

**76/90 file processed** - time elapsed 0.0093 seconds (with stemming 0.1892 seconds)

**77/90 file processed** - time elapsed 0.0367 seconds (with stemming 0.9153 seconds)

**78/90 file processed** - time elapsed 0.0540 seconds (with stemming 1.2515 seconds)

**79/90 file processed** - time elapsed 0.0949 seconds (with stemming 1.8585 seconds)

**80/90 file processed** - time elapsed 0.0952 seconds (with stemming 1.9329 seconds)

**81/90 file processed** - time elapsed 0.0659 seconds (with stemming 1.5539 seconds)

**82/90 file processed** - time elapsed 0.1408 seconds (with stemming 3.4505 seconds)

**83/90 file processed** - time elapsed 0.0690 seconds (with stemming 1.6860 seconds)

**84/90 file processed** - time elapsed 0.0620 seconds (with stemming 1.6140 seconds)

**85/90 file processed** - time elapsed 0.1004 seconds (with stemming 2.1098 seconds)

**86/90 file processed** - time elapsed 0.0262 seconds (with stemming 0.5347 seconds)

**87/90 file processed** - time elapsed 0.0184 seconds (with stemming 0.3901 seconds)

**88/90 file processed** - time elapsed 0.0224 seconds (with stemming 0.6750 seconds)

**89/90 file processed** - time elapsed 0.0491 seconds (with stemming 1.2108 seconds)

**90/90 file processed** - time elapsed 0.1167 seconds (with stemming 2.5950 seconds)