In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
import math

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
file_path = 'sample.txt'
with open(file_path, 'r') as file:
    text = file.read()

In [9]:
df = pd.DataFrame({'text': [text]})

In [10]:
document = df['text'][0]

In [11]:
# Tokenization
tokens = word_tokenize(document)

In [12]:
print("Tokenization:", tokens)

Tokenization: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'analyzing', 'unstructured', 'text', 'data', 'to', 'derive', 'meaningful', 'insights', '.', 'It', 'involves', 'various', 'preprocessing', 'steps', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.', 'Once', 'the', 'text', 'is', 'preprocessed', ',', 'we', 'can', 'calculate', 'term', 'frequency', 'and', 'inverse', 'document', 'frequency', 'to', 'represent', 'the', 'document', '.']


In [13]:
# POS Tagging
pos_tags = nltk.pos_tag(tokens)


In [14]:
print("POS Tagging:", pos_tags)

POS Tagging: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('analyzing', 'VBG'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('to', 'TO'), ('derive', 'VB'), ('meaningful', 'JJ'), ('insights', 'NNS'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('various', 'JJ'), ('preprocessing', 'VBG'), ('steps', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.'), ('Once', 'IN'), ('the', 'DT'), ('text', 'NN'), ('is', 'VBZ'), ('preprocessed', 'VBN'), (',', ','), ('we', 'PRP'), ('can', 'MD'), ('calculate', 'VB'), ('term', 'NN'), ('frequency', 'NN'), ('and', 'CC'), ('inverse', 'JJ'), ('document', 'NN'), ('frequency', 'NN'), ('to', 'TO'), ('represent', 'VB'), ('the', 'DT'), ('document', 'NN'), ('.', '.')]


In [15]:
# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

In [16]:
print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['Text', 'analytics', 'process', 'analyzing', 'unstructured', 'text', 'data', 'derive', 'meaningful', 'insights', '.', 'involves', 'various', 'preprocessing', 'steps', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'lemmatization', '.', 'text', 'preprocessed', ',', 'calculate', 'term', 'frequency', 'inverse', 'document', 'frequency', 'represent', 'document', '.']


In [17]:
# Stemming
ps = PorterStemmer()
stemmed_tokens = [ps.stem(word) for word in filtered_tokens]

In [18]:
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['text', 'analyt', 'process', 'analyz', 'unstructur', 'text', 'data', 'deriv', 'meaning', 'insight', '.', 'involv', 'variou', 'preprocess', 'step', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', ',', 'lemmat', '.', 'text', 'preprocess', ',', 'calcul', 'term', 'frequenc', 'invers', 'document', 'frequenc', 'repres', 'document', '.']


In [19]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [20]:
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['Text', 'analytics', 'process', 'analyzing', 'unstructured', 'text', 'data', 'derive', 'meaningful', 'insight', '.', 'involves', 'various', 'preprocessing', 'step', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'word', 'removal', ',', 'stemming', ',', 'lemmatization', '.', 'text', 'preprocessed', ',', 'calculate', 'term', 'frequency', 'inverse', 'document', 'frequency', 'represent', 'document', '.']


In [23]:
# Calculate Term Frequency (TF)
tf = FreqDist(stemmed_tokens)

In [24]:
print("TF:", tf)

TF: <FreqDist with 29 samples and 40 outcomes>


In [25]:
# Calculate Inverse Document Frequency (IDF)
idf = {}
total_documents = 1  # Assuming we have only one document
for term in tf.keys():
    doc_freq = sum(1 for doc in [stemmed_tokens] if term in doc)
    idf[term] = math.log(total_documents / (doc_freq + 1))

In [26]:
print("IDF:", idf)

IDF: {'text': -0.6931471805599453, 'analyt': -0.6931471805599453, 'process': -0.6931471805599453, 'analyz': -0.6931471805599453, 'unstructur': -0.6931471805599453, 'data': -0.6931471805599453, 'deriv': -0.6931471805599453, 'meaning': -0.6931471805599453, 'insight': -0.6931471805599453, '.': -0.6931471805599453, 'involv': -0.6931471805599453, 'variou': -0.6931471805599453, 'preprocess': -0.6931471805599453, 'step': -0.6931471805599453, 'token': -0.6931471805599453, ',': -0.6931471805599453, 'po': -0.6931471805599453, 'tag': -0.6931471805599453, 'stop': -0.6931471805599453, 'word': -0.6931471805599453, 'remov': -0.6931471805599453, 'stem': -0.6931471805599453, 'lemmat': -0.6931471805599453, 'calcul': -0.6931471805599453, 'term': -0.6931471805599453, 'frequenc': -0.6931471805599453, 'invers': -0.6931471805599453, 'document': -0.6931471805599453, 'repres': -0.6931471805599453}


In [27]:
# Calculate TF-IDF
tfidf = {}
for term, freq in tf.items():
    tfidf[term] = freq * idf[term]

In [28]:
print("TF-IDF:", tfidf)

TF-IDF: {'text': -2.0794415416798357, 'analyt': -0.6931471805599453, 'process': -0.6931471805599453, 'analyz': -0.6931471805599453, 'unstructur': -0.6931471805599453, 'data': -0.6931471805599453, 'deriv': -0.6931471805599453, 'meaning': -0.6931471805599453, 'insight': -0.6931471805599453, '.': -2.0794415416798357, 'involv': -0.6931471805599453, 'variou': -0.6931471805599453, 'preprocess': -1.3862943611198906, 'step': -0.6931471805599453, 'token': -0.6931471805599453, ',': -3.4657359027997265, 'po': -0.6931471805599453, 'tag': -0.6931471805599453, 'stop': -0.6931471805599453, 'word': -0.6931471805599453, 'remov': -0.6931471805599453, 'stem': -0.6931471805599453, 'lemmat': -0.6931471805599453, 'calcul': -0.6931471805599453, 'term': -0.6931471805599453, 'frequenc': -1.3862943611198906, 'invers': -0.6931471805599453, 'document': -1.3862943611198906, 'repres': -0.6931471805599453}
