# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [34]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Submissions/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [35]:
# Defining documents (=sentenses)
d1 = 'The main ingredients of pasta are eggs and flour.'
d2 = 'Knead the flower and eggs into a dough.'
d3 = 'Run your dough through the pasta maker.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The main ingredients of pasta are eggs and flour. Knead the flower and eggs into a dough. Run your dough through the pasta maker.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [36]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the main ingredients of pasta are eggs and flour. knead the flower and eggs into a dough. run your dough through the pasta maker.'

### Removing punctuation

In [37]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the main ingredients of pasta are eggs and flour knead the flower and eggs into a dough run your dough through the pasta maker'

### Tokenize text & removal of stopwords

In [38]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'ourselves', 'm', 'each', "wouldn't", 'their', 'above', 'few', "haven't", 'from', 'ma', "aren't", 'why', 'them', "she's", 'do', 'there', 'o', 'its', 'themselves', 'our', 'at', 'where', 'same', 'herself', 'just', "mustn't", 's', "won't", 'theirs', 'a', 'doesn', 'll', 'yourself', 'of', 'such', 'isn', 'after', 'other', "hadn't", 'the', 'both', 'hasn', "you've", 'very', 'or', 'those', 'than', 'some', 'be', 'but', 'off', 'itself', 'are', 'doing', 'as', 'she', 'up', 'here', 'her', 'will', 'only', 'and', 'having', 'had', 'an', 'under', 'these', 'your', 'yourselves', 'during', 'own', 'yours', 'by', "don't", "doesn't", "you'd", "mightn't", 'was', 'i', "you'll", 'nor', 'wasn', 'while', 'now', 'out', 've', "it's", 'you', 'no', 'too', 'can', "shouldn't", "that'll", 'did', 'into', 'they', 'which', "needn't", 'me', 'most', 'am', 'needn', 'to', 'wouldn', 'should', 'in', 'that', 'he', "wasn't", "isn't", 're', "weren't", 'hers', 'how', 'ours', 'y', 'mustn', 'been', 'any', 'm

In [39]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['main', 'ingredients', 'pasta', 'eggs', 'flour', 'knead', 'flower', 'eggs', 'dough', 'run', 'dough', 'pasta', 'maker']

### Lemmatization

In [40]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['main', 'ingredients', 'pasta', 'eggs', 'flour', 'knead', 'flower', 'eggs', 'dough', 'run', 'dough', 'pasta', 'maker'] 

After lemmatization:
['main', 'ingredients', 'pasta', 'egg', 'flour', 'knead', 'flower', 'egg', 'dough', 'run', 'dough', 'pasta', 'maker']

## Redefine the text corpus (pre-processed)

In [49]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['main', 'ingredients', 'pasta', 'egg', 'flour', 'knead', 'flower', 'egg', 'dough', 'run', 'dough', 'pasta', 'maker']

## Document-term matrix with ngram_range=(1,1)

In [50]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
    dough  egg  flour  flower  ingredients  knead  main  maker  pasta  run
0       0    0      0       0            0      0     1      0      0    0
1       0    0      0       0            1      0     0      0      0    0
2       0    0      0       0            0      0     0      0      1    0
3       0    1      0       0            0      0     0      0      0    0
4       0    0      1       0            0      0     0      0      0    0
5       0    0      0       0            0      1     0      0      0    0
6       0    0      0       1            0      0     0      0      0    0
7       0    1      0       0            0      0     0      0      0    0
8       1    0      0       0            0      0     0      0      0    0
9       0    0      0       0            0      0     0      0      0    1
10      1    0      0       0            0      0     0      0      0    0
11      0    0      0       0            0      0     0      0      1    0
12  

## Document-term matrix with ngram_range=(2,2)

In [53]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

corpus = ['main ingredients', 'pasta egg', 'flour knead', 'flower egg', 'dough run', 'dough pasta maker']


# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   dough pasta  dough run  flour knead  flower egg  main ingredients  \
0            0          0            0           0                 1   
1            0          0            0           0                 0   
2            0          0            1           0                 0   
3            0          0            0           1                 0   
4            0          1            0           0                 0   
5            1          0            0           0                 0   

   pasta egg  pasta maker  
0          0            0  
1          1            0  
2          0            0  
3          0            0  
4          0            0  
5          0            1  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [54]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 10 

The words in the corpus: 
 {'main', 'ingredients', 'egg', 'flour', 'dough', 'knead', 'flower', 'run', 'pasta', 'maker'}

Term Frequency (TF):
   main  ingredients  egg  flour   dough  knead  flower  run   pasta   maker
0   0.5          0.5  0.0    0.0  0.0000    0.0     0.0  0.0  0.0000  0.0000
1   0.0          0.0  0.5    0.0  0.0000    0.0     0.0  0.0  0.5000  0.0000
2   0.0          0.0  0.0    0.5  0.0000    0.5     0.0  0.0  0.0000  0.0000
3   0.0          0.0  0.5    0.0  0.0000    0.0     0.5  0.0  0.0000  0.0000
4   0.0          0.0  0.0    0.0  0.5000    0.0     0.0  0.5  0.0000  0.0000
5   0.0          0.0  0.0    0.0  0.3333    0.0     0.0  0.0  0.3333  0.3333


### Inverse Document Frequency (IDF)

In [55]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
           main:     0.7782
    ingredients:     0.7782
            egg:     0.4771
          flour:     0.7782
          dough:     0.4771
          knead:     0.7782
         flower:     0.7782
            run:     0.7782
          pasta:     0.4771
          maker:     0.7782


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [56]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     main  ingredients     egg   flour   dough   knead  flower     run  \
0  0.3891       0.3891  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000   
1  0.0000       0.0000  0.2386  0.0000  0.0000  0.0000  0.0000  0.0000   
2  0.0000       0.0000  0.0000  0.3891  0.0000  0.3891  0.0000  0.0000   
3  0.0000       0.0000  0.2386  0.0000  0.0000  0.0000  0.3891  0.0000   
4  0.0000       0.0000  0.0000  0.0000  0.2386  0.0000  0.0000  0.3891   
5  0.0000       0.0000  0.0000  0.0000  0.1590  0.0000  0.0000  0.0000   

    pasta   maker  
0  0.0000  0.0000  
1  0.2386  0.0000  
2  0.0000  0.0000  
3  0.0000  0.0000  
4  0.0000  0.0000  
5  0.1590  0.2594  


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [58]:
text = '''One morning, when Gregor Samsa woke from troubled
            dreams, he found himself transformed in his bed into a
            horrible vermin.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('One', 'CD', 'O'),
 ('morning', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('when', 'WRB', 'O'),
 ('Gregor', 'NNP', 'O'),
 ('Samsa', 'NNP', 'O'),
 ('woke', 'VBD', 'O'),
 ('from', 'IN', 'O'),
 ('troubled', 'JJ', 'O'),
 ('dreams', 'NNS', 'O'),
 (',', ',', 'O'),
 ('he', 'PRP', 'O'),
 ('found', 'VBD', 'O'),
 ('himself', 'PRP', 'O'),
 ('transformed', 'VBN', 'O'),
 ('in', 'IN', 'O'),
 ('his', 'PRP$', 'O'),
 ('bed', 'NN', 'B-NP'),
 ('into', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('horrible', 'JJ', 'I-NP'),
 ('vermin', 'NN', 'I-NP'),
 ('.', '.', 'O')]


### Task 1f
| POS                       | Desc                                          |
|---------------------------|-----------------------------------------------|
| ('One', 'CD', 'O')        | Cardinal Number; Outside                      |
| ('morning', 'NN', 'B-NP') | Noun, singular or mass; Beginning-Noun Phrase |
| (',', ',', 'O')           | Punctuation; Outside                          |
| ('when', 'WRB', 'O')      | Wh adverb; Outside                            |
| ('Gregor', 'NNP', 'O')    | Proper noun, singular; Outside                |

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [60]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.2.0-1016-azure
Datetime: 2023-11-28 08:12:35
Python Version: 3.10.13
-----------------------------------
