In [4]:
import docx
import re
import os
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np

# Build the English stopword set once at module level; preprocess_text()
# checks every token against it, so a set keeps that membership test cheap.
stop_words = set(stopwords.words('english'))

def read_docx(file_path):
    """
    Load a .docx file and return the text of its paragraphs as one string.

    Args:
        file_path: Path of the document, passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, in order, joined
        with newline characters (an empty string if there are no paragraphs).
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def preprocess_text(text):
    """
    Clean a raw transcript and split it into tokenized sentences.

    Steps, in order:
      1. If the text contains "Duration:", discard everything before its
         first occurrence (the marker itself is kept).
      2. Remove H:MM:SS / HH:MM:SS timestamps.
      3. Remove the speaker labels "I:" and "R:"
         (presumably interviewer / respondent -- confirm against the transcripts).
      4. Split into sentences with ``nltk.sent_tokenize``.
      5. Tokenize each sentence with gensim's ``simple_preprocess`` (deacc=True)
         and drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: Raw document text.

    Returns:
        A list with one token list per sentence. A sentence whose tokens are
        all filtered out still contributes an empty list, so the result lines
        up one-to-one with the sentence split.
    """
    duration_index = text.find("Duration:")
    if duration_index != -1:
        text = text[duration_index:]

    text = re.sub(r'\b\d{1,2}:\d{2}:\d{2}\b', '', text)
    # Speaker labels. The previous pattern r'\b[I|R]:\b' had two problems:
    # '|' inside a character class is a literal pipe, and the trailing \b
    # needs a word character right after the colon, so "I: Hello" (colon
    # followed by a space) was never removed. The leading \b still prevents
    # matches inside longer words such as "DR:".
    text = re.sub(r'\b[IR]:', '', text)

    return [
        [token for token in simple_preprocess(sentence, deacc=True)
         if token not in stop_words]
        for sentence in nltk.sent_tokenize(text)
    ]

def process_directory(directory_path):
    """
    Read and preprocess every .docx file directly inside a directory.

    Files whose names start with "~$" are skipped (these look like Word's
    temporary lock files -- confirm), as is anything not ending in ".docx".

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        One flat list of token lists: the output of ``preprocess_text`` for
        each file, concatenated in sorted filename order.
    """
    all_cleaned_sentences = []
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of the returned sentences (and any index-based use of them) repeatable.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".docx") or filename.startswith("~$"):
            continue
        file_path = os.path.join(directory_path, filename)
        all_cleaned_sentences.extend(preprocess_text(read_docx(file_path)))
    return all_cleaned_sentences

# Directory containing the interview .docx files.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing it.
directory_path = '/Users/sunmoon/Desktop/interviews'

# Read and preprocess every transcript into one flat list of token lists
# (one list per sentence).
all_cleaned_sentences = process_directory(directory_path)

# Train a Word2Vec model on the tokenized sentences.
# NOTE(review): per the gensim docs, min_count=2 should drop words seen fewer
# than two times and workers=4 trains multi-threaded, which can make results
# vary slightly between runs -- confirm if reproducibility matters.
model = Word2Vec(all_cleaned_sentences, vector_size=100, window=5, min_count=2, workers=4)

# Persist the trained model to disk (another absolute, user-specific path).
model_save_path = '/Users/sunmoon/Desktop/word2vec_model.model'
model.save(model_save_path)

# Example query: the ten words whose vectors are closest to 'title'.
# NOTE(review): presumably raises KeyError if 'title' is missing from the
# trained vocabulary -- verify against the corpus.
similar_words = model.wv.most_similar('title', topn=10)
print(similar_words)

# Analogy-style queries: combine 'title' with 'wife' / 'husband' as positive
# and negative terms and look at the nearest words to the resulting vector.
result_conditional = model.wv.most_similar(positive=['wife', 'title'], negative=['husband'], topn=1)
print(result_conditional)
result = model.wv.most_similar(positive=['husband', 'title'], negative=['wife'], topn=10)
print(result)

[nltk_data] Downloading package punkt to /Users/sunmoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sunmoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('names', 0.8297750353813171), ('could', 0.7967355847358704), ('tittle', 0.7862040400505066), ('signed', 0.7795634865760803), ('refused', 0.7756777405738831), ('name', 0.768436849117279), ('purpose', 0.7683154940605164), ('papers', 0.7647914290428162), ('automatic', 0.754810631275177), ('accept', 0.7531217932701111)]
[('sign', 0.7202320098876953)]
[('government', 0.6904069781303406), ('worried', 0.6890232563018799), ('know', 0.6562948822975159), ('saying', 0.6535388827323914), ('stolen', 0.6529539227485657), ('fake', 0.6437405347824097), ('chance', 0.63899827003479), ('nobody', 0.6382966041564941), ('scared', 0.6379045844078064), ('push', 0.6355583071708679)]


In [16]:
def document_vector(doc):
    """
    Average the word vectors of a tokenized document.

    Reads the notebook-level ``model``; tokens that are not in
    ``model.wv.key_to_index`` are ignored.

    Args:
        doc: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors of the known tokens, or an empty
        list ``[]`` when none of the tokens are in the vocabulary. Callers
        must handle that empty-list case before doing vector maths.
    """
    known_words = list(filter(lambda token: token in model.wv.key_to_index, doc))
    if not known_words:
        return []
    return np.mean(model.wv[known_words], axis=0)

# Build one averaged vector per cleaned sentence (each "document" here is a
# single tokenized sentence). Entries are [] for sentences that contain no
# in-vocabulary words, so the list is not guaranteed to be uniformly shaped.
doc_vectors = [document_vector(doc) for doc in all_cleaned_sentences]

from scipy import spatial

def cosine_similarity(vec1, vec2):
    """
    Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance, so both arguments must be
    acceptable to ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(vec1, vec2)
    return 1 - cosine_distance

# Example: cosine similarity between the first two entries of doc_vectors
# (i.e. the first two cleaned sentences).
# NOTE(review): document_vector() returns [] for a sentence with no
# in-vocabulary words; this call presumably fails or yields NaN in that
# case -- confirm and guard if needed.
similarity = cosine_similarity(doc_vectors[0], doc_vectors[1])
print(f"Similarity between document 1 and document 2: {similarity}")


Similarity between document 1 and document 2: 0.5571809717740717
