In [1]:
# Import libraries
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK resources
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) make word_tokenize load its models from
# the 'punkt_tab' package instead of 'punkt'; fetch both so this notebook
# runs on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample documents: the small corpus every later step (preprocessing,
# TF-IDF, similarity matrices) is computed over.
documents = [
    'This is sample document for testing the text similarity',
    'we will use nltk for computing similarity of text',
    'NLTK is powerful library for natural language processing'
]

# Preprocessing: tokenization, lowercase, stopword removal, stemming
def preprocess_and_stem(text):
    """Tokenize ``text`` and return the Porter stems of its content words.

    Tokens are produced by ``word_tokenize``. A token is kept only when it is
    purely alphabetic and its lowercase form is not in NLTK's English
    stopword list; each kept token is lowercased and stemmed.

    Args:
        text: The raw document string to process.

    Returns:
        A list of stemmed, lowercased tokens in their original order.
    """
    porter = PorterStemmer()
    english_stopwords = set(stopwords.words('english'))

    stems = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Skip punctuation/numeric tokens and stopwords.
        if not token.isalpha() or lowered in english_stopwords:
            continue
        stems.append(porter.stem(lowered))
    return stems

# Apply preprocessing to all documents. Each document becomes a single
# space-joined string of stems, which is the text form passed to the
# vectorizer below.
preprocessed_docs = [" ".join(preprocess_and_stem(doc)) for doc in documents]

# TF-IDF vectorization: one row per document, one column per stem.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("✅ TF-IDF Matrix:")
print(tfidf_df)

# Cosine similarity calculation between every pair of documents.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)
# Label rows/columns 1..N from the actual corpus size rather than a
# hard-coded range, so the frame stays valid when `documents` changes.
doc_labels = range(1, len(documents) + 1)
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=doc_labels, columns=doc_labels)

print("\n🔍 Cosine Similarity Matrix:")
print(cosine_sim_df)


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ TF-IDF Matrix:
    comput  document   languag   librari     natur      nltk     power  \
0  0.00000  0.490479  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.51742  0.000000  0.000000  0.000000  0.000000  0.393511  0.000000   
2  0.00000  0.000000  0.423394  0.423394  0.423394  0.322002  0.423394   

    process     sampl   similar      test      text      use  
0  0.000000  0.490479  0.373022  0.490479  0.373022  0.00000  
1  0.000000  0.000000  0.393511  0.000000  0.393511  0.51742  
2  0.423394  0.000000  0.000000  0.000000  0.000000  0.00000  

🔍 Cosine Similarity Matrix:
          1         2         3
1  1.000000  0.293577  0.000000
2  0.293577  1.000000  0.126712
3  0.000000  0.126712  1.000000


In [None]:
from sklearn.metrics import jaccard_score
import numpy as np

# Convert tokenized documents to binary bag-of-words representation
def binary_bag_of_words(documents, vocabulary):
    """Encode each document as a 0/1 presence vector over ``vocabulary``.

    Args:
        documents: Iterable of documents; each one must support the ``in``
            operator (e.g. a list of tokens).
        vocabulary: Iterable of words. Its iteration order defines the
            column order of the result.

    Returns:
        A NumPy array with one row per document, where entry ``[d][w]`` is 1
        when vocabulary word ``w`` is contained in document ``d`` and 0
        otherwise.
    """
    rows = []
    for doc in documents:
        presence = []
        for term in vocabulary:
            presence.append(int(term in doc))
        rows.append(presence)
    return np.array(rows)

# Rebuild per-document token lists from the space-joined stem strings in
# `preprocessed_docs`. Working on token lists (not the joined strings) matters:
# iterating a string yields characters, and `word in string` is a substring
# test, either of which would give a wrong vocabulary / wrong matches.
tokenized_docs = [doc.split() for doc in preprocessed_docs]

# Get unique words across all documents (vocabulary)
unique_words = set(word for doc in tokenized_docs for word in doc)

# Convert documents to binary bag-of-words format. The vocabulary is sorted
# so the column order is the same on every run (set order is not guaranteed).
binary_matrix = binary_bag_of_words(tokenized_docs, sorted(unique_words))

# Compute Jaccard Similarity for each pair of documents
num_docs = len(tokenized_docs)
jaccard_sim_matrix = np.zeros((num_docs, num_docs))

for i in range(num_docs):
    for j in range(num_docs):
        # On binary presence vectors this is |A ∩ B| / |A ∪ B| of the two
        # documents' stem sets.
        jaccard_sim_matrix[i][j] = jaccard_score(binary_matrix[i], binary_matrix[j])

# Convert to a DataFrame for better readability (documents labelled 1..N)
jaccard_sim_df = pd.DataFrame(
    jaccard_sim_matrix,
    index=range(1, num_docs + 1),
    columns=range(1, num_docs + 1)
)

# Display the Jaccard Similarity Matrix
print("Jaccard Similarity Matrix:")
print(jaccard_sim_df)


Jaccard Similarity Matrix:
      1     2    3
1  1.00  0.25  0.0
2  0.25  1.00  0.1
3  0.00  0.10  1.0
