# Demonstration of the effects of text pre-processing

In [1]:
from __future__ import print_function

import os
import logging
import json
import warnings
import gensim
import nltk

import pyLDAvis.gensim
# Put pyLDAvis into notebook mode right after importing it.
# NOTE(review): presumably this makes visualizations render inline in the notebook — confirm.
pyLDAvis.enable_notebook()

import numpy as np

from gensim.models import CoherenceModel, LdaModel
from gensim.corpora import Dictionary

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Silence every Python warning for the rest of the notebook so that the cell
# outputs only show the pre-processing results.
warnings.filterwarnings('ignore')  # Ignore all warnings raised in this notebook to keep the output readable

# Configure the root logger: each record is printed as "time : level : message".
# The threshold is DEBUG, so library log messages (e.g. gensim progress lines)
# appear in the cell outputs below.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

import load_lee_background_corpus as load_texts

### Loading the corpus

First, we load the raw texts without any pre-processing, so that the effect of each method can be shown on the original text.

In [2]:
# Load the raw, un-processed documents through the load_lee_background_corpus helper module.
texts = load_texts.get_raw_texts()
# The document at index 5 is the running example used by the following cells.
sample_text = texts[5]
print(sample_text)

/Users/home/Desenvolvimento/anaconda3/lib/python3.6/site-packages/gensim/test/test_data
The Federal Government says it should be safe for Afghani asylum seekers in Australia to return home when the environment becomes secure. The Government has suspended their applications while the interim government is established in Kabul. The Foreign Affairs Minister Alexander Downer has refused to say for how long the claims process has been put on hold. But he says the major threat to most people seeking asylum is no longer there. "Many Afghans who have tried to get into Australia or for that matter into Britain and other countries in north-west Europe have claimed that they are fleeing the Taliban," he said. "Well, the Taliban is no longer in power in Afghanistan, the Taliban is finished." Meanwhile, there has been a mass airlift of detainees from Christmas Island to the Pacific Island of Nauru. In total, more than 300 people have been flown from the island in two operations using chartered airc

### Lowercasing, punctuation and accent removal, and filtering by word length

In [3]:
# Tokenize the sample document with gensim's one-step pre-processor.
# The result is a list of lowercase tokens without punctuation.
simple_sample_text = gensim.utils.simple_preprocess(
    sample_text,
    deacc=True,   # strip accent marks from characters
    min_len=3,    # discard tokens shorter than 3 characters
    max_len=10,   # discard tokens longer than 10 characters
)
print(simple_sample_text)

['the', 'federal', 'government', 'says', 'should', 'safe', 'for', 'afghani', 'asylum', 'seekers', 'australia', 'return', 'home', 'when', 'the', 'becomes', 'secure', 'the', 'government', 'has', 'suspended', 'their', 'while', 'the', 'interim', 'government', 'kabul', 'the', 'foreign', 'affairs', 'minister', 'alexander', 'downer', 'has', 'refused', 'say', 'for', 'how', 'long', 'the', 'claims', 'process', 'has', 'been', 'put', 'hold', 'but', 'says', 'the', 'major', 'threat', 'most', 'people', 'seeking', 'asylum', 'longer', 'there', 'many', 'afghans', 'who', 'have', 'tried', 'get', 'into', 'australia', 'for', 'that', 'matter', 'into', 'britain', 'and', 'other', 'countries', 'north', 'west', 'europe', 'have', 'claimed', 'that', 'they', 'are', 'fleeing', 'the', 'taliban', 'said', 'well', 'the', 'taliban', 'longer', 'power', 'the', 'taliban', 'finished', 'meanwhile', 'there', 'has', 'been', 'mass', 'airlift', 'detainees', 'from', 'christmas', 'island', 'the', 'pacific', 'island', 'nauru', 'tota

### Stopwords removal

In [4]:
# Split the raw sample into word/punctuation tokens; the original casing is kept.
tokens = word_tokenize(sample_text)
print(tokens)
stops = set(stopwords.words('english'))  # nltk stopwords list (entries are lowercase)
# Compare case-insensitively: the stopword list is lowercase, so a plain
# membership test would let capitalized stopwords such as 'The' or 'But'
# slip through. The surviving tokens keep their original casing.
filtered_tokens = [word for word in tokens if word.lower() not in stops]
print(filtered_tokens)

['The', 'Federal', 'Government', 'says', 'it', 'should', 'be', 'safe', 'for', 'Afghani', 'asylum', 'seekers', 'in', 'Australia', 'to', 'return', 'home', 'when', 'the', 'environment', 'becomes', 'secure', '.', 'The', 'Government', 'has', 'suspended', 'their', 'applications', 'while', 'the', 'interim', 'government', 'is', 'established', 'in', 'Kabul', '.', 'The', 'Foreign', 'Affairs', 'Minister', 'Alexander', 'Downer', 'has', 'refused', 'to', 'say', 'for', 'how', 'long', 'the', 'claims', 'process', 'has', 'been', 'put', 'on', 'hold', '.', 'But', 'he', 'says', 'the', 'major', 'threat', 'to', 'most', 'people', 'seeking', 'asylum', 'is', 'no', 'longer', 'there', '.', '``', 'Many', 'Afghans', 'who', 'have', 'tried', 'to', 'get', 'into', 'Australia', 'or', 'for', 'that', 'matter', 'into', 'Britain', 'and', 'other', 'countries', 'in', 'north-west', 'Europe', 'have', 'claimed', 'that', 'they', 'are', 'fleeing', 'the', 'Taliban', ',', "''", 'he', 'said', '.', '``', 'Well', ',', 'the', 'Taliban',

### Removal of rare and frequent terms

In [5]:
# Load the training texts and build a gensim Dictionary (token -> id mapping) from them.
train_texts = load_texts.get_train_texts()
dictionary = Dictionary(train_texts)
print(dictionary)
# Filter out words that occur in fewer than 5 documents or in more than 50% of the documents.
# filter_extremes changes `dictionary` in place; the second print shows the reduced vocabulary.
dictionary.filter_extremes(no_below=5, no_above=0.5)
print(dictionary)

/Users/home/Desenvolvimento/anaconda3/lib/python3.6/site-packages/gensim/test/test_data


2017-11-22 00:40:54,052 : INFO : collecting all words and their counts
2017-11-22 00:40:54,055 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-11-22 00:40:54,098 : INFO : collected 20429 word types from a corpus of 19878 words (unigram + bigrams) and 300 sentences
2017-11-22 00:40:54,100 : INFO : using 20429 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2017-11-22 00:40:56,966 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-11-22 00:40:56,998 : INFO : built Dictionary(4431 unique tokens: ['hundreds', 'people', 'homes', 'southern', 'highlands']...) from 300 documents (total 18861 corpus positions)
2017-11-22 00:40:57,007 : INFO : discarding 3659 tokens: [('southern', 1), ('highlands', 3), ('bushfire', 2), ('hill', 3), ('blaze', 4), ('goulburn', 1), ('closure', 4), ('hume', 2), ('deterioration', 1), ('cell', 2)]...
2017-11-22 00:40:57,009 : INFO : keeping 772 tokens which were in no less than 5 and

Returning 300 training texts
Dictionary(4431 unique tokens: ['hundreds', 'people', 'homes', 'southern', 'highlands']...)
Dictionary(772 unique tokens: ['hundreds', 'people', 'homes', 'new_south', 'wales']...)


### Word removal according to POS tagging

In [6]:
# Tokenize the sample document and attach a part-of-speech tag to every token.
tokens = word_tokenize(sample_text)
tagged = nltk.pos_tag(tokens)
print(tagged)
# Each entry is a (word, tag) pair; keep the words whose tag starts with 'NN'
# (the noun tags NN, NNS, NNP and NNPS seen in the output).
nouns_only = [word for word, pos in tagged if pos.startswith('NN')]
print(nouns_only)

[('The', 'DT'), ('Federal', 'NNP'), ('Government', 'NNP'), ('says', 'VBZ'), ('it', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('safe', 'JJ'), ('for', 'IN'), ('Afghani', 'NNP'), ('asylum', 'NN'), ('seekers', 'NNS'), ('in', 'IN'), ('Australia', 'NNP'), ('to', 'TO'), ('return', 'VB'), ('home', 'NN'), ('when', 'WRB'), ('the', 'DT'), ('environment', 'NN'), ('becomes', 'VBZ'), ('secure', 'NN'), ('.', '.'), ('The', 'DT'), ('Government', 'NNP'), ('has', 'VBZ'), ('suspended', 'VBN'), ('their', 'PRP$'), ('applications', 'NNS'), ('while', 'IN'), ('the', 'DT'), ('interim', 'JJ'), ('government', 'NN'), ('is', 'VBZ'), ('established', 'VBN'), ('in', 'IN'), ('Kabul', 'NNP'), ('.', '.'), ('The', 'DT'), ('Foreign', 'NNP'), ('Affairs', 'NNPS'), ('Minister', 'NNP'), ('Alexander', 'NNP'), ('Downer', 'NNP'), ('has', 'VBZ'), ('refused', 'VBN'), ('to', 'TO'), ('say', 'VB'), ('for', 'IN'), ('how', 'WRB'), ('long', 'JJ'), ('the', 'DT'), ('claims', 'NNS'), ('process', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('put',

### Adding n-grams

In [7]:
# Tokenize every raw document: `tokens` becomes a list of token lists, one per document.
tokens = list(map(word_tokenize, texts))
print(tokens[5])
# Learn bigram collocations from the whole tokenized corpus.
bigram = gensim.models.Phrases(tokens)
# Apply the learned phrase model to each document's token list.
tokens_with_bigrams = [bigram[doc_tokens] for doc_tokens in tokens]
print(tokens_with_bigrams[5])

2017-11-22 00:40:57,820 : INFO : collecting all words and their counts
2017-11-22 00:40:57,822 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2017-11-22 00:40:57,949 : INFO : collected 45122 word types from a corpus of 67065 words (unigram + bigrams) and 300 sentences
2017-11-22 00:40:57,952 : INFO : using 45122 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


['The', 'Federal', 'Government', 'says', 'it', 'should', 'be', 'safe', 'for', 'Afghani', 'asylum', 'seekers', 'in', 'Australia', 'to', 'return', 'home', 'when', 'the', 'environment', 'becomes', 'secure', '.', 'The', 'Government', 'has', 'suspended', 'their', 'applications', 'while', 'the', 'interim', 'government', 'is', 'established', 'in', 'Kabul', '.', 'The', 'Foreign', 'Affairs', 'Minister', 'Alexander', 'Downer', 'has', 'refused', 'to', 'say', 'for', 'how', 'long', 'the', 'claims', 'process', 'has', 'been', 'put', 'on', 'hold', '.', 'But', 'he', 'says', 'the', 'major', 'threat', 'to', 'most', 'people', 'seeking', 'asylum', 'is', 'no', 'longer', 'there', '.', '``', 'Many', 'Afghans', 'who', 'have', 'tried', 'to', 'get', 'into', 'Australia', 'or', 'for', 'that', 'matter', 'into', 'Britain', 'and', 'other', 'countries', 'in', 'north-west', 'Europe', 'have', 'claimed', 'that', 'they', 'are', 'fleeing', 'the', 'Taliban', ',', "''", 'he', 'said', '.', '``', 'Well', ',', 'the', 'Taliban',

### Stemming and Lemmatizing

In [8]:
# Stem a single word with NLTK's Porter stemmer.
stemmer = PorterStemmer()
# Bare last expression: the stem is displayed as the cell's output.
stemmer.stem("finished")

'finish'

In [9]:
# WordNetLemmatizer is already imported in the notebook's first cell, so it is
# not imported again here.
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize a handful of example words, one result per line.
# lemmatize() is called without a part-of-speech argument, so words are treated
# as nouns: the plural nouns are reduced to their singular form, while the verb
# forms "are" and "is" come back unchanged (pass pos='v' to lemmatize verbs).
for word in ("dogs", "churches", "aardwolves", "abaci", "hardrock", "are", "is"):
    print(wordnet_lemmatizer.lemmatize(word))

dog
church
aardwolf
abacus
hardrock
are
is


In [10]:
# Show the tokenized sample document before lemmatization.
print(tokens[5])
lemmatizer = WordNetLemmatizer()
# Lemmatize every token of every document, preserving the list-of-lists shape.
lemmas = [list(map(lemmatizer.lemmatize, doc)) for doc in tokens]
print(lemmas[5])

['The', 'Federal', 'Government', 'says', 'it', 'should', 'be', 'safe', 'for', 'Afghani', 'asylum', 'seekers', 'in', 'Australia', 'to', 'return', 'home', 'when', 'the', 'environment', 'becomes', 'secure', '.', 'The', 'Government', 'has', 'suspended', 'their', 'applications', 'while', 'the', 'interim', 'government', 'is', 'established', 'in', 'Kabul', '.', 'The', 'Foreign', 'Affairs', 'Minister', 'Alexander', 'Downer', 'has', 'refused', 'to', 'say', 'for', 'how', 'long', 'the', 'claims', 'process', 'has', 'been', 'put', 'on', 'hold', '.', 'But', 'he', 'says', 'the', 'major', 'threat', 'to', 'most', 'people', 'seeking', 'asylum', 'is', 'no', 'longer', 'there', '.', '``', 'Many', 'Afghans', 'who', 'have', 'tried', 'to', 'get', 'into', 'Australia', 'or', 'for', 'that', 'matter', 'into', 'Britain', 'and', 'other', 'countries', 'in', 'north-west', 'Europe', 'have', 'claimed', 'that', 'they', 'are', 'fleeing', 'the', 'Taliban', ',', "''", 'he', 'said', '.', '``', 'Well', ',', 'the', 'Taliban',