# Traditional Text Representation and Feature Engineering

## Common Imports

In [None]:
import pandas as pd
import re
import numpy as np

# Show full cell text and every column when pandas renders a DataFrame.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import nltk
from nltk import sent_tokenize, word_tokenize
# English stopword list; handed to the scikit-learn vectorizers via stop_words=STOPWORDS.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus available locally — confirm.
STOPWORDS = nltk.corpus.stopwords.words('english')

## One-Hot Representation

In [None]:
# Weather-report excerpt used as the demo corpus for every section of the notebook.
# The literal is hard-wrapped, so the string contains embedded newline characters.
sample = '''This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands. Temperatures 
could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded 
at Kildare’s Clongowes Wood College on September 1st, 1906. Tuesday, however, will be the last day of the sunshine 
with rain arriving across the country on Wednesday morning. Temperatures will remain as high as 24 degrees with the 
warmth punctuated by heavy showers.'''

In [None]:
# Normalise the sample before tokenising:
#   1. collapse every whitespace run (including the hard line breaks) into one space,
#      so words on either side of a line break can never be fused together;
#   2. lower-case the text;
#   3. drop every character that is not a letter, digit, whitespace or full stop
#      (full stops are kept so sentence splitting still has boundaries to work with).
# Both patterns are raw strings: "\s" in a normal string is an invalid escape sequence.
cleaned_sample = re.sub(r"[^A-Za-z0-9\s.]", "", re.sub(r"\s+", " ", sample).lower())
cleaned_sample

In [None]:
# Treat each sentence as one document: split into sentences, then tokenise every sentence.
tokens_docs = list(map(word_tokenize, sent_tokenize(cleaned_sample)))
print(tokens_docs)

In [None]:
# Vocabulary lookup: every distinct token of the cleaned text gets an integer id.
# The distinct tokens are sorted first so the ids are reproducible; iterating a bare
# set of strings can yield a different order (and therefore different ids) on every
# kernel restart because of string hash randomisation.
word_to_id = {token: idx for idx, token in enumerate(sorted(set(word_tokenize(cleaned_sample))))}
word_to_id

In [None]:
# Replace every token of every sentence with its vocabulary id.
token_ids = []
for sentence_tokens in tokens_docs:
    token_ids.append([word_to_id[tok] for tok in sentence_tokens])
print(token_ids)

In [None]:
# Vocabulary size; this is the width of every one-hot vector built below.
num_words = len(word_to_id)
num_words

In [None]:
# One-hot encode every token id: row i of the identity matrix is the one-hot vector for id i.
# Indexing with a one-element list keeps the (1, num_words) shape and yields an independent copy.
identity = np.eye(num_words)
encoded_sequences = [
    [identity[[token_id]] for token_id in sentence_ids]
    for sentence_ids in token_ids
]

In [None]:
# Display the nested list: one inner list per sentence, one (1, num_words) array per token.
encoded_sequences

## Bag-of-Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counter that ignores the NLTK English stopwords.
vectorizer = CountVectorizer(stop_words=STOPWORDS)

In [None]:
# Learn the vocabulary from a corpus containing a single document: the whole sample.
vectorizer.fit([sample])

In [None]:
# Inspect the learned term -> feature-index mapping.
vectorizer.vocabulary_ 

**Note:**

A mapping of terms to feature indices. The logic is similar to word_to_id in the one-hot encoding. Since the count vectorizer was initialised with an instruction to remove stopwords, the vocabulary size decreased.

In [None]:
# Number of distinct terms kept after stopword removal.
len(vectorizer.vocabulary_)

In [None]:
# Encode the sample as term counts over the learned vocabulary.
cv_vector = vectorizer.transform([sample])

In [None]:
# Expected to be (number of documents, vocabulary size); a single document was transformed.
cv_vector.shape

In [None]:
# Convert the count matrix to a dense NumPy array so the individual counts are visible.
cv_vector.toarray()

The length of the array equals the final vocabulary size (obtained from vectorizer.vocabulary_), and each entry is the number of times the corresponding token occurs in the text. For example, the token at index 14 appears twice: index 14 in the vocabulary is the word 'day', which occurs twice in the sample text. Similarly, the word 'degrees' occurs three times and sits at index 15, so the cv_vector array shows '3' at that index.

**Sample Text**

'This will be followed by more of the same with the mist and fog clearing to give a **day** of unbroken sunshine everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands. Temperatures could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded at Kildare’s Clongowes Wood College on September 1st, 1906. Tuesday, however, will be the last **day** of the sunshine with rain arriving across the country on Wednesday morning. Temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers.'

## N-Grams

In [None]:
# ngram_range=(2, 2): count bigrams only (min n = max n = 2). No stopword list is passed here.
# Note that this rebinds both `vectorizer` and `cv_vector` from the Bag-of-Words section.
vectorizer = CountVectorizer(ngram_range=(2,2))
cv_vector = vectorizer.fit_transform([sample])

In [None]:
# Show the bigram counts as a one-row table whose columns are the bigrams themselves.
# The dense array is rebuilt from the fitted vectorizer rather than from `cv_vector`
# itself, so the cell can be re-run safely (an ndarray has no .toarray()).
cv_vector = vectorizer.transform([sample]).toarray()
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
vocab = vectorizer.get_feature_names_out()
pd.DataFrame(cv_vector, columns=vocab)

In [None]:
from nltk import ngrams
n = 3

# Slide a window of n tokens over the raw (uncleaned) sample and print each n-gram tuple.
trigram_stream = ngrams(word_tokenize(sample), n)
for trigram in trigram_stream:
    print(trigram)

## Term Frequency - Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer that ignores the NLTK English stopwords.
# NOTE(review): min_df=0. and max_df=1. are floats, which scikit-learn presumably reads as
# document-frequency proportions, so no term should be filtered by frequency — confirm.
tf_idf = TfidfVectorizer(min_df=0., max_df=1., use_idf=True, stop_words=STOPWORDS)

In [None]:
# Learn the vocabulary and IDF weights from the single-document corpus.
# fit() returns the estimator itself, not a vector, so its result is not bound to a
# vector-like name; the learned term -> index mapping is read straight from tf_idf.
tf_idf.fit([sample])
tf_idf.vocabulary_

In [None]:
# TF-IDF weights of the sample over the learned vocabulary, shown as a dense array.
tfidf_vector = tf_idf.transform([sample])
tfidf_vector.toarray()

In [None]:
# Label each TF-IDF weight with its term and round to two decimals for readability.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
vocab = tf_idf.get_feature_names_out()
pd.DataFrame(np.round(tfidf_vector.toarray(), 2), columns=vocab)

## Count Vectorizer and Tf-idf on sent_tokenized(sample)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Stopwords are deliberately kept here so the demo shows a token that occurs
# several times within a single document.
vectorizer = CountVectorizer()

In [None]:
# Each sentence returned by sent_tokenize is treated as a separate document.
vectorizer.fit(sent_tokenize(cleaned_sample))

In [None]:
# Count matrix with one row per sentence, shown as a dense array.
vectorizer.transform(sent_tokenize(cleaned_sample)).toarray()

In [None]:
# Keep the dense per-sentence counts and check the length of the first sentence's vector.
encoded_vector = vectorizer.transform(sent_tokenize(cleaned_sample)).toarray()
len(encoded_vector[0])

In [None]:
# Vocabulary size, for comparison with the vector length computed in the previous cell.
len(vectorizer.vocabulary_)

In [None]:
# Term -> column-index mapping; use it to read the count matrix column by column.
vectorizer.vocabulary_

**Note:**

Every document vector has the same length as the vocabulary.

Explanation - 
1. 1st document, 1st word or position 0 = 0 ; because the word at index 0 is 1906 in the vocabulary. 1906 is not there in this document. Thus, it is zero.

2. 1st document, 2nd word or position 1 = 0 ; because the word at index 1 is 1st in the vocabulary. 1st is not there in this document. Thus, it is zero.

3. 1st document, 3rd word or position 2 = 1 ; because the word at index 2 is 22 in the vocabulary. 22 is there in this document. Thus, it is 1.

4. 1st document, 8th word or position 7 = 3 ; because the word at index 7 is 'and' in the vocabulary. 'and' occurs three times in this document. Thus, it is 3.

**Ref**

'This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on Tuesday and temperatures of between 22 and 27 degrees.',

 'It will warmest in the midlands.',
 
 'Temperatures \ncould reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded \nat Kildare’s Clongowes Wood College on September 1st, 1906.',
 
 'Tuesday, however, will be the last day of the sunshine \nwith rain arriving across the country on Wednesday morning.',
 'Temperatures will remain as high as 24 degrees with the \nwarmth punctuated by heavy showers.'

In [None]:
# Fresh TF-IDF model with default settings, fitted on the individual sentences.
# fit() hands back the estimator, so construction and fitting are chained and the
# fitted object is echoed as the cell output.
tf_idf = TfidfVectorizer().fit(sent_tokenize(cleaned_sample))
tf_idf

In [None]:
# TF-IDF weights with one row per sentence, shown as a dense array.
tf_idf.transform(sent_tokenize(cleaned_sample)).toarray()

In [None]:
# Per-sentence TF-IDF weights as a table: one row per sentence, one column per term,
# rounded to two decimals.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
vocab = tf_idf.get_feature_names_out()
pd.DataFrame(np.round(tf_idf.transform(sent_tokenize(cleaned_sample)).toarray(), 2), columns=vocab)