In [8]:
import pandas as pd
import spacy
from collections import Counter

Reading the text

In [5]:
with open('Jane_Austen_Emma.txt', 'r', encoding='utf-8') as file:
    text = file.read()

print(text)

The Project Gutenberg EBook of Emma, by Jane Austen

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: Emma

Author: Jane Austen

Release Date: August, 1994  [Etext #158]
Posting Date: January 21, 2010
Last Updated: March 10, 2018

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK EMMA ***




Produced by An Anonymous Volunteer





EMMA

By Jane Austen




VOLUME I



CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings of
existence; and had lived nearly twenty-one years in the world with very
little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's m

Loading the default model for the English language

In [6]:
nlp = spacy.load("en_core_web_sm")

In [None]:
Tokenization 

In [20]:
doc = nlp(text)

In [9]:
#Loading the stop words 
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

Defining the functions for the preprossing 

In [10]:
def calculate_word_frequencies(doc):
    tokens = [token.text for token in doc if not token.is_punct]
    word_freq = Counter(tokens)
    return word_freq

In [28]:
def preprocessing(doc, word_freq,frequent_threshold,rare_threshold):

    # Removing punctuation
    tokens = [token for token in doc if not token.is_punct]

    # Removing stop words
    tokens = [token for token in tokens if token not in spacy_stopwords]

    # Removing frequent words
    tokens = [token for token in tokens if word_freq[token.text] <= frequent_threshold]

    # Removing rare words 
    tokens = [token for token in tokens if word_freq[token.text] > rare_threshold]

    # Lemmatization 
    cleaned_tokens = [token.lemma_.lower() for token in tokens]

    return cleaned_tokens

Preprocessing the text 

In [29]:
# Calculating word frequencies
word_freq = calculate_word_frequencies(doc)

# Preprocessing the document
preprocessed_doc = preprocessing(doc, word_freq,frequent_threshold=100, rare_threshold=1)

In [30]:
print(preprocessed_doc)
#print("Word Frequencies:", word_freq)

['project', 'gutenberg', 'ebook', 'austen', 'this', 'ebook', 'use', 'anyone', 'anywhere', 'cost', 'almost', 'restriction', 'whatsoever', ' ', 'copy', 're', 'use', 'under', 'term', 'project', 'gutenberg', 'license', 'include', 'ebook', 'online', 'www.gutenberg.org', '\n\n\n', 'austen', 'date', 'august', ' ', 'date', 'january', 'update', 'march', 'english', 'set', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'emma', '\n\n\n\n\n', 'produce', 'an', 'anonymous', 'volunteer', 'emma', 'by', 'austen', '\n\n\n\n\n', 'volume', '\n\n\n\n', 'chapter', '\n\n\n', 'handsome', 'clever', 'rich', 'comfortable', 'disposition', 'unite', 'good', 'blessing', 'existence', 'live', 'nearly', 'twenty', 'year', 'world', 'distress', 'young', 'daughter', 'affectionate', 'indulgent', 'consequence', 'sister', 'marriage', 'mistress', 'house', 'early', 'period', 'her', 'mother', 'die', 'ago', 'remembrance', 'place', 'supply', 'excellent', 'governess', 'fall', 'short', 'mother', 'affection', 'sixteen', 'year

Saving the preprocessed text into a txt file 

In [31]:
# Saving the work in txt file 

output_file_path = 'preprocessed_document.txt'

preprocessed_text = ' '.join(preprocessed_doc)

with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(preprocessed_text)

print(f'Preprocessed document saved to {output_file_path}')

Preprocessed document saved to preprocessed_document.txt


Another exemple 

In [35]:
document = nlp(
    u'''Sixteen years had Miss Taylor been in Mr. Woodhouse's family, less as a
    governess than a friend, very fond of both daughters, but particularly
    of Emma. Between them it was more the intimacy of sisters. Even before
    Miss Taylor had ceased to hold the nominal office of governess, the
    mildness of her temper had hardly allowed her to impose any restraint;
    and the shadow of authority being now long passed away, they had been
    living together as friend and friend very mutually attached, and Emma
    doing just what she liked; highly esteeming Miss Taylor's judgment, but
    directed chiefly by her own.'''
)

In [36]:
# Calculating word frequencies
word_freq = calculate_word_frequencies(document)

# Preprocessing the document
preprocessed_document = preprocessing(document, word_freq,frequent_threshold=5, rare_threshold=1)

#Printing the result
print("Word Frequencies:", word_freq)
print(preprocessed_document)

Word Frequencies: Counter({'\n    ': 8, 'of': 6, 'had': 4, 'the': 4, 'Miss': 3, 'Taylor': 3, 'friend': 3, 'her': 3, 'and': 3, 'been': 2, "'s": 2, 'as': 2, 'a': 2, 'governess': 2, 'very': 2, 'but': 2, 'Emma': 2, 'to': 2, 'Sixteen': 1, 'years': 1, 'in': 1, 'Mr.': 1, 'Woodhouse': 1, 'family': 1, 'less': 1, 'than': 1, 'fond': 1, 'both': 1, 'daughters': 1, 'particularly': 1, 'Between': 1, 'them': 1, 'it': 1, 'was': 1, 'more': 1, 'intimacy': 1, 'sisters': 1, 'Even': 1, 'before': 1, 'ceased': 1, 'hold': 1, 'nominal': 1, 'office': 1, 'mildness': 1, 'temper': 1, 'hardly': 1, 'allowed': 1, 'impose': 1, 'any': 1, 'restraint': 1, 'shadow': 1, 'authority': 1, 'being': 1, 'now': 1, 'long': 1, 'passed': 1, 'away': 1, 'they': 1, 'living': 1, 'together': 1, 'mutually': 1, 'attached': 1, 'doing': 1, 'just': 1, 'what': 1, 'she': 1, 'liked': 1, 'highly': 1, 'esteeming': 1, 'judgment': 1, 'directed': 1, 'chiefly': 1, 'by': 1, 'own': 1})
['have', 'miss', 'taylor', 'be', "'s", 'as', 'a', 'governess', 'a', 'f