In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity


corpus = ["the cat sat on the mat", "the dog sat on the rug"]

tokens = [sentence.split() for sentence in corpus]

window_size = 2

vocab = set(word for sentence in tokens for word in sentence)
vocab = list(vocab)
co_occurrence_matrix = np.zeros((len(vocab), len(vocab)), dtype=int)

word_to_index = {word: i for i, word in enumerate(vocab)}

for sentence in tokens:
    for i, word in enumerate(sentence):
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(sentence))
        window_words = sentence[start:end]
        for window_word in window_words:
            if window_word != word:
                co_occurrence_matrix[word_to_index[word], word_to_index[window_word]] += 1

co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=vocab, columns=vocab)

print("Co-occurrence Matrix:")
print(co_occurrence_df)

similarity_matrix = cosine_similarity(co_occurrence_matrix)

print("Cosine Similarity Matrix:")
print(pd.DataFrame(similarity_matrix, index=vocab, columns=vocab))

Co-occurrence Matrix:
     the  rug  cat  sat  on  mat  dog
the    0    1    1    4   2    1    1
rug    1    0    0    0   1    0    0
cat    1    0    0    1   1    0    0
sat    4    0    1    0   2    0    1
on     2    1    1    2   0    1    1
mat    1    0    0    0   1    0    0
dog    1    0    0    1   1    0    0
Cosine Similarity Matrix:
          the       rug       cat       sat        on       mat       dog
the  1.000000  0.288675  0.707107  0.261116  0.707107  0.288675  0.707107
rug  0.288675  1.000000  0.816497  0.904534  0.408248  1.000000  0.816497
cat  0.707107  0.816497  1.000000  0.738549  0.666667  0.816497  1.000000
sat  0.261116  0.904534  0.738549  1.000000  0.615457  0.904534  0.738549
on   0.707107  0.408248  0.666667  0.615457  1.000000  0.408248  0.666667
mat  0.288675  1.000000  0.816497  0.904534  0.408248  1.000000  0.816497
dog  0.707107  0.816497  1.000000  0.738549  0.666667  0.816497  1.000000


Co-occurrence Matrix:
     on  mat  dog  the  sat  cat  rug
on    0    1    1    2    2    1    1
mat   1    0    0    1    0    0    0
dog   1    0    0    1    1    0    0
the   2    1    1    0    4    1    1
sat   2    0    1    4    0    1    0
cat   1    0    0    1    1    0    0
rug   1    0    0    1    0    0    0
Cosine Similarity Matrix:
           on       mat       dog       the       sat       cat       rug
on   1.000000  0.408248  0.666667  0.707107  0.615457  0.666667  0.408248
mat  0.408248  1.000000  0.816497  0.288675  0.904534  0.816497  1.000000
dog  0.666667  0.816497  1.000000  0.707107  0.738549  1.000000  0.816497
the  0.707107  0.288675  0.707107  1.000000  0.261116  0.707107  0.288675
sat  0.615457  0.904534  0.738549  0.261116  1.000000  0.738549  0.904534
cat  0.666667  0.816497  1.000000  0.707107  0.738549  1.000000  0.816497
rug  0.408248  1.000000  0.816497  0.288675  0.904534  0.816497  1.000000
