In [62]:
from gensim.models import Word2Vec
import nltk 

# Assuming `stem_tokens` and `non_stem_tokens` are lists of lists of tokens
# For example, stem_tokens = [["word1_doc1", "word2_doc1", ...], ["word1_doc2", ...], ...]

stem_documents = [
    "STEM Student 1.docx", "STEM Student 2.docx", "STEM Student 3.docx",
    "STEM Student 4.docx", "STEM Student 5.docx", "STEM Student 6.docx",
    "STEM Student 7.docx", "STEM Student 8.docx", "STEM Student 9.docx",
    "STEM Student 10.docx", "STEM Student 11.docx", "STEM Student 12.docx",
    "STEM Student 13.docx", "STEM Student 14.docx", "STEM Student 15.docx",
    "STEM Student 16.docx", "STEM Student 17.docx", "STEM Student 18.docx",
    "STEM Student 19.docx", "STEM Student 20.docx"
]

non_stem_documents = [
    "non-STEM Student 1.docx", "non-STEM Student 2.docx", "non-STEM Student 3.docx",
    "non-STEM Student 4.docx", "non-STEM Student 5.docx", "non-STEM Student 6.docx",
    "non-STEM Student 7.docx", "non-STEM Student 8.docx", "non-STEM Student 9.docx",
    "non-STEM Student 10.docx", "non-STEM Student 11.docx", "non-STEM Student 12.docx",
    "non-STEM Student 13.docx", "non-STEM Student 14.docx", "non-STEM Student 15.docx",
    "non-STEM Student 16.docx", "non-STEM Student 17.docx", "non-STEM Student 18.docx",
    "non-STEM Student 19.docx"
]


# Tokenization and preprocessing
stem_documents_tokenized = [nltk.word_tokenize(doc.lower()) for doc in stem_documents]
non_stem_documents_tokenized = [nltk.word_tokenize(doc.lower()) for doc in non_stem_documents]

# Train a Word2Vec model on all documents or use pre-trained
model = Word2Vec(stem_documents_tokenized + non_stem_documents_tokenized, vector_size=100, window=5, min_count=1, workers=4)


In [63]:
# from gensim.models import Word2Vec

# from docx import Document

# def read_docx(file_path):
#     doc = Document(file_path)
#     return " ".join([paragraph.text for paragraph in doc.paragraphs])

# # Example usage
# stem_documents = [read_docx(file_path) for file_path in stem_document_paths]
# non_stem_documents = [read_docx(file_path) for file_path in non_stem_document_paths]


In [64]:
import numpy as np

def document_vector(doc, model):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in model.wv]
    if not doc:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

stem_doc_vectors = [document_vector(doc, model) for doc in stem_documents_tokenized]
non_stem_doc_vectors = [document_vector(doc, model) for doc in non_stem_documents_tokenized]


In [65]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert lists of vectors into matrices for cosine similarity calculation
stem_vector_matrix = np.array(stem_doc_vectors)
non_stem_vector_matrix = np.array(non_stem_doc_vectors)

# Cosine similarity within STEM documents
stem_cosine_sim = cosine_similarity(stem_vector_matrix)

# Cosine similarity within non-STEM documents
non_stem_cosine_sim = cosine_similarity(non_stem_vector_matrix)

# Cosine similarity between STEM and non-STEM documents
stem_non_stem_cosine_sim = cosine_similarity(stem_vector_matrix, non_stem_vector_matrix)


In [66]:
stem_labels = [f"STEM_Doc_{i+1}" for i in range(60)]  # STEM document labels
non_stem_labels = [f"Non-STEM_Doc_{i+1}" for i in range(60)]  # Non-STEM document labels


In [67]:
def print_similarity_scores(cosine_sim_matrix, labels1, labels2=None):
    if labels2 is None:
        labels2 = labels1
    for i, label1 in enumerate(labels1):
        for j, label2 in enumerate(labels2):
            if i < j:  # For within-group similarity, print each pair once
                print(f"Similarity between {label1} and {label2}: {cosine_sim_matrix[i, j]:.4f}")
            elif labels1 != labels2:  # For between-group, print all
                print(f"Similarity between {label1} and {label2}: {cosine_sim_matrix[i, j-len(labels1)]:.4f}")
        if i == 19: 
            break

# Print similarity within STEM documents
print("Similarity within STEM Documents:")
print_similarity_scores(stem_cosine_sim, stem_documents_tokenized)

# Print similarity within Non-STEM documents
print("\nSimilarity within Non-STEM Documents:")
print_similarity_scores(non_stem_cosine_sim, non_stem_documents_tokenized)

# # Print similarity between STEM and Non-STEM documents
print("\nSimilarity between STEM and Non-STEM Documents:")
print_similarity_scores(stem_non_stem_cosine_sim, stem_documents_tokenized, non_stem_documents_tokenized)





Similarity within STEM Documents:
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '2.docx']: 0.6630
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '3.docx']: 0.6720
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '4.docx']: 0.6790
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '5.docx']: 0.6689
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '6.docx']: 0.6535
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '7.docx']: 0.6659
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '8.docx']: 0.6704
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '9.docx']: 0.6729
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '10.docx']: 0.6995
Similarity between ['stem', 'student', '1.docx'] and ['stem', 'student', '11.docx']: 0.6161
Similarity between ['stem', 'student', '1.docx'] and [

IndexError: index -20 is out of bounds for axis 1 with size 19

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Example: Visualize STEM document similarity
plt.figure(figsize=(10, 8))
sns.heatmap(stem_cosine_sim, annot=True, fmt=".2f", xticklabels=stem_labels, yticklabels=stem_labels, cmap='coolwarm')
plt.title("Cosine Similarity among STEM Documents")
plt.show()
