# 7. Text Analytics
1. Extract Sample document and apply following document preprocessing
methods: Tokenization, POS Tagging, stop words removal, Stemming and
Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse
Document Frequency.

In [2]:
import nltk

In [4]:
doc = """
C is an imperative, procedural language in the ALGOL tradition. It has a static type system. In C, all executable code is contained within subroutines (also called "functions", though not in the sense of functional programming). Function parameters are passed by value, although arrays are passed as pointers, i.e. the address of the first item in the array. Pass-by-reference is simulated in C by explicitly passing pointers to the thing being referenced. 
"""
doc

'\nC is an imperative, procedural language in the ALGOL tradition. It has a static type system. In C, all executable code is contained within subroutines (also called "functions", though not in the sense of functional programming). Function parameters are passed by value, although arrays are passed as pointers, i.e. the address of the first item in the array. Pass-by-reference is simulated in C by explicitly passing pointers to the thing being referenced. \n'

In [26]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/student/nltk_data...


True

## 7.1. Document Preprocessing

### 7.1.1. Tokenization

In [17]:
# word tokenization
word_tokens = nltk.word_tokenize( doc )
print( word_tokens )

['C', 'is', 'an', 'imperative', ',', 'procedural', 'language', 'in', 'the', 'ALGOL', 'tradition', '.', 'It', 'has', 'a', 'static', 'type', 'system', '.', 'In', 'C', ',', 'all', 'executable', 'code', 'is', 'contained', 'within', 'subroutines', '(', 'also', 'called', '``', 'functions', "''", ',', 'though', 'not', 'in', 'the', 'sense', 'of', 'functional', 'programming', ')', '.', 'Function', 'parameters', 'are', 'passed', 'by', 'value', ',', 'although', 'arrays', 'are', 'passed', 'as', 'pointers', ',', 'i.e', '.', 'the', 'address', 'of', 'the', 'first', 'item', 'in', 'the', 'array', '.', 'Pass-by-reference', 'is', 'simulated', 'in', 'C', 'by', 'explicitly', 'passing', 'pointers', 'to', 'the', 'thing', 'being', 'referenced', '.']


In [10]:
# sentence tokenization 
sent_tokens = nltk.sent_tokenize( doc )
print( sent_tokens )

['\nC is an imperative, procedural language in the ALGOL tradition.', 'It has a static type system.', 'In C, all executable code is contained within subroutines (also called "functions", though not in the sense of functional programming).', 'Function parameters are passed by value, although arrays are passed as pointers, i.e.', 'the address of the first item in the array.', 'Pass-by-reference is simulated in C by explicitly passing pointers to the thing being referenced.']


### 7.1.2. Stop word removal


In [18]:
stop_words = set(nltk.corpus.stopwords.words('english'))
word_tokens = [ token for token in word_tokens if token not in stop_words ]
print( word_tokens )

['C', 'imperative', ',', 'procedural', 'language', 'ALGOL', 'tradition', '.', 'It', 'static', 'type', 'system', '.', 'In', 'C', ',', 'executable', 'code', 'contained', 'within', 'subroutines', '(', 'also', 'called', '``', 'functions', "''", ',', 'though', 'sense', 'functional', 'programming', ')', '.', 'Function', 'parameters', 'passed', 'value', ',', 'although', 'arrays', 'passed', 'pointers', ',', 'i.e', '.', 'address', 'first', 'item', 'array', '.', 'Pass-by-reference', 'simulated', 'C', 'explicitly', 'passing', 'pointers', 'thing', 'referenced', '.']


### 7.1.3. Parts of Speech Tagging

In [23]:
tags = nltk.pos_tag( word_tokens )
print( tags )

[('C', 'NNP'), ('imperative', 'NN'), (',', ','), ('procedural', 'JJ'), ('language', 'NN'), ('ALGOL', 'NNP'), ('tradition', 'NN'), ('.', '.'), ('It', 'PRP'), ('static', 'JJ'), ('type', 'NN'), ('system', 'NN'), ('.', '.'), ('In', 'IN'), ('C', 'NNP'), (',', ','), ('executable', 'JJ'), ('code', 'NN'), ('contained', 'VBD'), ('within', 'IN'), ('subroutines', 'NNS'), ('(', '('), ('also', 'RB'), ('called', 'VBN'), ('``', '``'), ('functions', 'NNS'), ("''", "''"), (',', ','), ('though', 'IN'), ('sense', 'JJ'), ('functional', 'JJ'), ('programming', 'NN'), (')', ')'), ('.', '.'), ('Function', 'NN'), ('parameters', 'NNS'), ('passed', 'VBD'), ('value', 'NN'), (',', ','), ('although', 'IN'), ('arrays', 'NNS'), ('passed', 'VBN'), ('pointers', 'NNS'), (',', ','), ('i.e', 'NN'), ('.', '.'), ('address', 'NN'), ('first', 'JJ'), ('item', 'NN'), ('array', 'NN'), ('.', '.'), ('Pass-by-reference', 'NN'), ('simulated', 'VBD'), ('C', 'NNP'), ('explicitly', 'RB'), ('passing', 'VBG'), ('pointers', 'NNS'), ('thin

### 7.1.4. Lemmatization

In [27]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [ lemmatizer.lemmatize( token ) for token in word_tokens ]
print( lemmatized_tokens )

['C', 'imperative', ',', 'procedural', 'language', 'ALGOL', 'tradition', '.', 'It', 'static', 'type', 'system', '.', 'In', 'C', ',', 'executable', 'code', 'contained', 'within', 'subroutine', '(', 'also', 'called', '``', 'function', "''", ',', 'though', 'sense', 'functional', 'programming', ')', '.', 'Function', 'parameter', 'passed', 'value', ',', 'although', 'array', 'passed', 'pointer', ',', 'i.e', '.', 'address', 'first', 'item', 'array', '.', 'Pass-by-reference', 'simulated', 'C', 'explicitly', 'passing', 'pointer', 'thing', 'referenced', '.']


### 7.1.5. Stemming