# Text preprocessing using spaCy

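In spaCy, there is a separate model for each language, and it has to be downloaded before use. To process text in a given language, we first load that language's model. The API is organized as a pipeline of processing components ("pipes"), each performing one task. By default, all the pipes that come with the model are loaded.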

In [1]:
import spacy

# Load the small English model; all the pipes it ships with are active by default.
model_name = "en_core_web_sm"
nlp = spacy.load(model_name)

# Display the names of the currently active pipes.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [2]:
# Disable every pipe that was loaded with the model. Disabled pipes can be
# switched back on later with nlp.enable_pipe(name).
pipes_to_disable = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
nlp.select_pipes(disable=pipes_to_disable)
nlp.pipe_names

[]

## I. Sentence tokenization

In [3]:
text = 'This is a text written by Mr. Aries. It uses U.S. english to illustrate sentence tokenization.'

# add_pipe raises an error when a component with the same name is already in
# the pipeline, so guard the call to keep this cell safe to re-run.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')

doc = nlp(text)

# doc.sents yields one span per detected sentence; keep only its raw text.
sents_list = [sent.text for sent in doc.sents]

sents_list

['This is a text written by Mr. Aries.',
 'It uses U.S. english to illustrate sentence tokenization.']

## II. Word tokenization

Word tokenization is performed automatically when calling **nlp(text)**; iterating over the resulting document yields its tokens.

In [4]:
# Iterating over the document yields its tokens; collect the text of each one.
tokens = [token.text for token in doc]

tokens

['This',
 'is',
 'a',
 'text',
 'written',
 'by',
 'Mr.',
 'Aries',
 '.',
 'It',
 'uses',
 'U.S.',
 'english',
 'to',
 'illustrate',
 'sentence',
 'tokenization',
 '.']

## III. Stop-word filtering

In [5]:
# Keep the text of every token that is not flagged as a stop word.
filtered_tokens = [token.text for token in doc if not token.is_stop]

filtered_tokens

['text',
 'written',
 'Mr.',
 'Aries',
 '.',
 'uses',
 'U.S.',
 'english',
 'illustrate',
 'sentence',
 'tokenization',
 '.']

## IV. Lemmatization

In [6]:
text = 'This is a text written by Mr. Aries. It uses U.S. english to illustrate sentence tokenization.'

# The lemmatizer already ships with the loaded model, so it only has to be
# re-enabled rather than added with nlp.add_pipe('lemmatizer').
# The lemmatizer must use tagger + attribute ruler OR morphologizer;
# apparently it uses tok2vec as well.
for pipe_name in ('tagger', 'tok2vec', 'attribute_ruler', 'lemmatizer'):
    nlp.enable_pipe(pipe_name)

print(nlp.pipe_names)

# Re-process the text so the newly enabled pipes are applied to it.
doc = nlp(text)

# Pair each token's surface form with its lemma.
lemmas_list = [(token.text, token.lemma_) for token in doc]

lemmas_list

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'sentencizer']


[('This', 'this'),
 ('is', 'be'),
 ('a', 'a'),
 ('text', 'text'),
 ('written', 'write'),
 ('by', 'by'),
 ('Mr.', 'Mr.'),
 ('Aries', 'Aries'),
 ('.', '.'),
 ('It', 'it'),
 ('uses', 'use'),
 ('U.S.', 'U.S.'),
 ('english', 'english'),
 ('to', 'to'),
 ('illustrate', 'illustrate'),
 ('sentence', 'sentence'),
 ('tokenization', 'tokenization'),
 ('.', '.')]