Name : Riza James Peter

Assignment no:07  
Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.

In [1]:
#Step1: Importing the packages
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
#Step2: Defining paragraph
paragraph = """I was lucky to have worked with all three of them closely and consider this the great opportunity of my life.
               I see four milestones in my career"""

In [3]:
#Step3: Tokenization
tokens = word_tokenize(paragraph)
tokens

['I',
 'was',
 'lucky',
 'to',
 'have',
 'worked',
 'with',
 'all',
 'three',
 'of',
 'them',
 'closely',
 'and',
 'consider',
 'this',
 'the',
 'great',
 'opportunity',
 'of',
 'my',
 'life',
 '.',
 'I',
 'see',
 'four',
 'milestones',
 'in',
 'my',
 'career']

In [4]:
# Step 4: POS Tagging
pos_tags = pos_tag(tokens)
pos_tags

[('I', 'PRP'),
 ('was', 'VBD'),
 ('lucky', 'JJ'),
 ('to', 'TO'),
 ('have', 'VB'),
 ('worked', 'VBN'),
 ('with', 'IN'),
 ('all', 'DT'),
 ('three', 'CD'),
 ('of', 'IN'),
 ('them', 'PRP'),
 ('closely', 'RB'),
 ('and', 'CC'),
 ('consider', 'VB'),
 ('this', 'DT'),
 ('the', 'DT'),
 ('great', 'JJ'),
 ('opportunity', 'NN'),
 ('of', 'IN'),
 ('my', 'PRP$'),
 ('life', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ('see', 'VBP'),
 ('four', 'CD'),
 ('milestones', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('career', 'NN')]

In [5]:
# Step 5: Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
tokens= word_tokenize(paragraph)
filtered_tokens =[]
for token in tokens:
  if (token not in stop_words):
    filtered_tokens.append(token)
print(filtered_tokens)

['I', 'lucky', 'worked', 'three', 'closely', 'consider', 'great', 'opportunity', 'life', '.', 'I', 'see', 'four', 'milestones', 'career']


In [6]:
#Step 6: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
stemmed_tokens

['i',
 'lucki',
 'work',
 'three',
 'close',
 'consid',
 'great',
 'opportun',
 'life',
 '.',
 'i',
 'see',
 'four',
 'mileston',
 'career']

In [7]:
#Step7 : Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word, pos='v') for word in filtered_tokens]  # specifying 'v' for verb lemmatization
lemmatized_tokens

['I',
 'lucky',
 'work',
 'three',
 'closely',
 'consider',
 'great',
 'opportunity',
 'life',
 '.',
 'I',
 'see',
 'four',
 'milestones',
 'career']

In [8]:
# Step 8: Calculate TF (Term Frequency)
def calculate_tf(document):
    words = document.split()
    word_count = len(words)
    term_frequency = {}
    for word in words:
        term_frequency[word] = term_frequency.get(word, 0) + 1 / word_count
    return term_frequency

tf = calculate_tf(paragraph)


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from collections import Counter

In [33]:
paragraph = ["""I was lucky to have worked with all three of them closely and consider this the great opportunity of my life.
               I see four milestones in my career"""]

In [34]:
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(paragraph)
vectorizer.vocabulary_

{'was': 21,
 'lucky': 10,
 'to': 20,
 'have': 7,
 'worked': 23,
 'with': 22,
 'all': 0,
 'three': 19,
 'of': 13,
 'them': 17,
 'closely': 3,
 'and': 1,
 'consider': 4,
 'this': 18,
 'the': 16,
 'great': 6,
 'opportunity': 14,
 'my': 12,
 'life': 9,
 'see': 15,
 'four': 5,
 'milestones': 11,
 'in': 8,
 'career': 2}

In [35]:
# Example list of documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [36]:
# Tokenize each document
tokenized_documents = [document.lower().split() for document in documents]
# Calculate TF for each document
tf = [Counter(document) for document in tokenized_documents]


In [38]:
# Calculate DF for each term
df = Counter()
for document in tokenized_documents:
    df.update(set(document))


In [39]:
# Calculate IDF for each term
idf = {}
total_documents = len(documents)
for term in df:
    idf[term] = math.log(total_documents / (df[term] + 1))

In [41]:
# Calculate TF-IDF for each term in each document
tfidf_representation = [{term: tf_document.get(term, 0) * idf[term] for term in set(document)} for tf_document, document in zip(tf, tokenized_documents)]


# Print TF-IDF representation
print("TF-IDF Representation of the Documents:")
for i, tfidf_document in enumerate(tfidf_representation):
    print(f"Document {i + 1}:")
    print(tfidf_document)

TF-IDF Representation of the Documents:
Document 1:
{'first': 0.28768207245178085, 'is': -0.2231435513142097, 'document.': 0.28768207245178085, 'the': -0.2231435513142097, 'this': -0.2231435513142097}
Document 2:
{'is': -0.2231435513142097, 'document': 0.6931471805599453, 'document.': 0.28768207245178085, 'the': -0.2231435513142097, 'second': 0.6931471805599453, 'this': -0.2231435513142097}
Document 3:
{'is': -0.2231435513142097, 'one.': 0.6931471805599453, 'this': -0.2231435513142097, 'and': 0.6931471805599453, 'the': -0.2231435513142097, 'third': 0.6931471805599453}
Document 4:
{'first': 0.28768207245178085, 'is': -0.2231435513142097, 'the': -0.2231435513142097, 'document?': 0.6931471805599453, 'this': -0.2231435513142097}
