In [25]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [26]:
def preprocess_text(text):
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return text

In [27]:
def remove_stopwords(text):
    # Split text into individual words
    words = text.split()
    
    # Get the set of English stopwords
    english_stopwords = set(stopwords.words('english'))
    
    # Remove stopwords
    words = [word for word in words if word not in english_stopwords]
    
    # Join the remaining words back into a single string
    processed_text = ' '.join(words)
    
    return processed_text

In [37]:
def calculate_similarity(text1, text2, ngram_range=(1, 2)):
    # Preprocess the texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)
    
    # Combine the preprocessed texts into a list
    texts = [text1, text2]
    
    # Remove stopwords
    texts = [remove_stopwords(text) for text in texts]
    
    # Create an instance of TfidfVectorizer with n-gram support
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    
    # Fit and transform the texts to obtain the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Calculate the cosine similarity between the TF-IDF vectors
    similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    
    return similarity_matrix[0][0]

In [47]:
def do_analyze_texts(pText1, pText2, pText3, pNgram=1):

    text1 = pText1
    text2 = pText2
    text3 = pText3

    # Tokenize and preprocess the texts
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    tokens1 = tokenizer.tokenize(text1.lower())
    tokens2 = tokenizer.tokenize(text2.lower())
    tokens3 = tokenizer.tokenize(text3.lower())

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2, text3])

    # Calculate cosine similarity between text1 and text2
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    print(f"Similarity between text1 and text2: {similarity[0][0]}")

    # Calculate cosine similarity between text1 and text3
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[2])
    print(f"Similarity between text1 and text3: {similarity[0][0]}")

    print("similarity ngram t1 and t2")
    similarity = calculate_similarity(text1, text2, ngram_range=(1, pNgram))
    print(similarity)
    
    print("similarity ngram t1 and t3")
    similarity = calculate_similarity(text1, text3, ngram_range=(1, pNgram))
    print(similarity)

    return tokens1, tokens2, tokens3


In [58]:
# Sample texts
text1 = "The old lazy cat is on the mat"
text2 = "The cat is not on the big old dirty mat"
text3 = "The big fucking cat and dog are friends"
tokens1, tokens2, tokens3 = do_analyze_texts (text1, text2, text3)
print(tokens1)
print(tokens2) 
print(tokens3)


Similarity between text1 and text2: 0.7004030841944571
Similarity between text1 and text3: 0.1857421473897179
similarity ngram t1 and t2
0.5101490193104813
similarity ngram t1 and t3
0.1273595297947935
['the', 'old', 'lazy', 'cat', 'is', 'on', 'the', 'mat']
['the', 'cat', 'is', 'not', 'on', 'the', 'big', 'old', 'dirty', 'mat']
['the', 'big', 'fucking', 'cat', 'and', 'dog', 'are', 'friends']


In [62]:
# Sample texts
text1 = "The cat is on the mat"
text2 = "The cat is not on the mat"
text3 = "The cat and dog are friends"
tokens1, tokens2, tokens3 = do_analyze_texts (text1, text2, text3, pNgram=4)
print(tokens1)
print(tokens2) 
print(tokens3)


Similarity between text1 and text2: 0.8813356859409095
Similarity between text1 and text3: 0.2588470024866933
similarity ngram t1 and t2
1.0000000000000002
similarity ngram t1 and t3
0.13627634143908643
['the', 'cat', 'is', 'on', 'the', 'mat']
['the', 'cat', 'is', 'not', 'on', 'the', 'mat']
['the', 'cat', 'and', 'dog', 'are', 'friends']


In [55]:
pre_proc_1 = remove_stopwords(preprocess_text(text1))
pre_proc_2 = remove_stopwords(preprocess_text(text2))
pre_proc_3 = remove_stopwords(preprocess_text(text3))
tokens1, tokens2, tokens3 = do_analyze_texts (pre_proc_1, pre_proc_2, pre_proc_3)

print(tokens1)
print(tokens2) 
print(tokens3)

Similarity between text1 and text2: 1.0
Similarity between text1 and text3: 0.23636982151884145
similarity ngram t1 and t2
1.0000000000000002
similarity ngram t1 and t3
0.26055567105626243
['cat', 'mat']
['cat', 'mat']
['cat', 'dog', 'friends']


In [57]:
pre_proc_1 = remove_stopwords(preprocess_text(text1))
pre_proc_2 = remove_stopwords(preprocess_text(text2))
pre_proc_3 = remove_stopwords(preprocess_text(text3))
tokens1, tokens2, tokens3 = do_analyze_texts (pre_proc_1, pre_proc_2, pre_proc_3, pNgram=3)

print(tokens1)
print(tokens2) 
print(tokens3)

Similarity between text1 and text2: 1.0
Similarity between text1 and text3: 0.23636982151884145
similarity ngram t1 and t2
1.0000000000000002
similarity ngram t1 and t3
0.13627634143908643
['cat', 'mat']
['cat', 'mat']
['cat', 'dog', 'friends']


In [13]:
tokens1

['the', 'cat', 'is', 'on', 'the', 'mat']

In [14]:
tokens2

['the', 'cat', 'is', 'not', 'on', 'the', 'mat']

In [18]:
tfidf_matrix

<3x10 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [6]:
!{sys.executable} -m pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m313.8 kB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.1 smart-open-6.3.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [12]:


# Sample texts
text1 = "The cat is on the mat"
text2 = "The cat is not on the mat"

# Tokenize the texts
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokens1 = tokenizer.tokenize(text1.lower())
tokens2 = tokenizer.tokenize(text2.lower())

# Build vocabulary
sentences = [tokens1, tokens2]

# Train Word2Vec model
model = Word2Vec(sentences, min_count=1, vector_size=100)

# Calculate cosine similarity between text1 and text2 using Word2Vec embeddings
similarity = model.wv.similarity('not', 'on')  # Negated word: 'not'
print(f"Similarity between text1 and text2: {similarity}")

Similarity between text1 and text2: 0.06408979743719101
