In [None]:
pip install nltk



In [None]:
import nltk
# Fetch the 'punkt' tokenizer package into the local NLTK data directory.
# The call is left as the last expression so its return value is shown as
# the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Tokenizer data package (unzipped under tokenizers/); presumably the resource
# word_tokenize loads in recent NLTK releases -- confirm for the installed version.
nltk.download('punkt_tab')
# Stop-word lists read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Toy corpus: three short sentences compared against each other below.
documents = [
    "This is sample document for testing the text similarity",
    "we will use nltk for computing similarity of text",
    "NLTK is powerful library for natural language processing",
]

In [5]:
def preprocess_and_stem(text, stemmer=None, stop_words=None):
    """Tokenize one document, filter its tokens, and stem what remains.

    A token is kept only if it is purely alphabetic (``str.isalpha``) and its
    lowercase form is not a stop word. Kept tokens are lowercased and stemmed.

    Args:
        text: The raw document string.
        stemmer: Optional object with a ``stem(word)`` method. Defaults to a
            new ``PorterStemmer``.
        stop_words: Optional container of lowercase words to drop. Defaults to
            NLTK's English stop-word list.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Both defaults are comparatively expensive to build (the stop-word list
    # is loaded from the NLTK corpus), so callers that process many documents
    # can pass shared instances instead.
    if stemmer is None:
        stemmer = PorterStemmer()
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    processed_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()  # lowercase once; reused for filter and stem
        if token.isalpha() and lowered not in stop_words:
            processed_tokens.append(stemmer.stem(lowered))

    return processed_tokens


def preprocess_documents(documents):
    """Preprocess every document in an iterable of raw text strings.

    Args:
        documents: Iterable of document strings.

    Returns:
        A list with one list of stemmed tokens per input document.
    """
    docs = list(documents)
    if not docs:
        # Nothing to process, so do not touch the stop-word corpus at all.
        return []

    # Build the stemmer and stop-word set once and share them across all
    # documents rather than re-creating them for each one.
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    return [preprocess_and_stem(doc, stemmer, stop_words) for doc in docs]


In [6]:
preprocessed_doc = preprocess_documents(documents)

# Show the stemmed tokens of each document, numbered from 1.
for doc_number, tokens in enumerate(preprocessed_doc, start=1):
    print(f"Document {doc_number}: {tokens}")


Document 1: ['sampl', 'document', 'test', 'text', 'similar']
Document 2: ['use', 'nltk', 'comput', 'similar', 'text']
Document 3: ['nltk', 'power', 'librari', 'natur', 'languag', 'process']


In [7]:
import pandas as pd

# TfidfVectorizer works on strings, so join each token list back into one
# space-separated document.
processed_documents_str = list(map(" ".join, preprocessed_doc))
print(processed_documents_str)

# Fit the vectorizer on the corpus and transform it in a single step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_documents_str)

# Vocabulary terms, used below as the column labels.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Wrap the dense matrix in a DataFrame for readability.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

# Show the TF-IDF weights (one row per document).
print("TF-IDF Matrix:")
print(tfidf_df)


['sampl document test text similar', 'use nltk comput similar text', 'nltk power librari natur languag process']
TF-IDF Matrix:
    comput  document   languag   librari     natur      nltk     power  \
0  0.00000  0.490479  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.51742  0.000000  0.000000  0.000000  0.000000  0.393511  0.000000   
2  0.00000  0.000000  0.423394  0.423394  0.423394  0.322002  0.423394   

    process     sampl   similar      test      text      use  
0  0.000000  0.490479  0.373022  0.490479  0.373022  0.00000  
1  0.000000  0.000000  0.393511  0.000000  0.393511  0.51742  
2  0.423394  0.000000  0.000000  0.000000  0.000000  0.00000  


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the TF-IDF matrix.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Label rows and columns with 1-based document numbers for readability.
doc_numbers = range(1, len(processed_documents_str) + 1)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=doc_numbers,
    columns=doc_numbers,
)

# Show the resulting matrix.
print("Cosine Similarity Matrix:")
print(cosine_sim_df)

Cosine Similarity Matrix:
          1         2         3
1  1.000000  0.293577  0.000000
2  0.293577  1.000000  0.126712
3  0.000000  0.126712  1.000000


In [None]:
from sklearn.metrics import jaccard_score
import numpy as np

# Convert tokenized documents to binary bag-of-words representation
def binary_bag_of_words(documents, vocabulary):
    """Encode tokenized documents as binary presence vectors.

    Args:
        documents: Iterable of tokenized documents (each a collection of
            tokens supporting ``in``).
        vocabulary: Iterable of words; one output column is produced per
            word, in iteration order.

    Returns:
        A 2-D integer ``numpy`` array of shape
        ``(number of documents, vocabulary size)`` where entry ``[d, w]`` is 1
        if vocabulary word ``w`` occurs in document ``d`` and 0 otherwise.
    """
    # The vocabulary is scanned once per document, so materialize it first:
    # a one-shot iterator would otherwise be exhausted after the first row.
    vocab = list(vocabulary)
    rows = [[1 if word in doc else 0 for word in vocab] for doc in documents]

    # Explicit dtype and shape keep the result a 2-D integer matrix even when
    # there are no documents or the vocabulary is empty.
    return np.array(rows, dtype=int).reshape(len(rows), len(vocab))

# Vocabulary: every distinct stem that occurs in any document.
unique_words = set(word for doc in preprocessed_doc for word in doc)

# Sets of strings have no stable iteration order across kernel restarts, so
# sort the vocabulary to make the column order of the binary matrix reproducible.
vocabulary = sorted(unique_words)

# Convert documents to binary bag-of-words format (one row per document).
binary_matrix = binary_bag_of_words(preprocessed_doc, vocabulary)

# Compute Jaccard similarity for each pair of documents.
num_docs = len(preprocessed_doc)
jaccard_sim_matrix = np.zeros((num_docs, num_docs))

# Jaccard similarity is symmetric, so score each unordered pair once
# (diagonal included) and mirror the value into the other triangle.
for i in range(num_docs):
    for j in range(i, num_docs):
        score = jaccard_score(binary_matrix[i], binary_matrix[j])
        jaccard_sim_matrix[i, j] = score
        jaccard_sim_matrix[j, i] = score

# Convert to a DataFrame labelled with 1-based document numbers.
jaccard_sim_df = pd.DataFrame(
    jaccard_sim_matrix,
    index=range(1, num_docs + 1),
    columns=range(1, num_docs + 1)
)

# Display the Jaccard Similarity Matrix
print("Jaccard Similarity Matrix:")
print(jaccard_sim_df)


Jaccard Similarity Matrix:
      1     2    3
1  1.00  0.25  0.0
2  0.25  1.00  0.1
3  0.00  0.10  1.0
