In [45]:
# 1. Import Statements
import docx
import re
import os
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import numpy as np


# 2. Variable and Function Definitions

# Setup NLTK stopwords
# stopwords.words() raises LookupError when the corpus has not been downloaded
# yet (e.g. on a fresh environment), so fetch it once and retry in that case.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def read_docx(file_path):
    """Return the text of a .docx file, one line per paragraph.

    The document is opened with ``docx.Document`` and the ``text`` of every
    entry in ``paragraphs`` is joined with newline characters.
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def preprocess_text(text):
    """Clean raw text and tokenise it sentence by sentence.

    Steps, in order:
      1. If the marker "Duration:" occurs, everything before its first
         occurrence is discarded (the marker itself is kept).
      2. Timestamps of the form H:MM:SS / HH:MM:SS are removed.
      3. The speaker labels "I:" and "R:" are removed.
      4. The text is split into sentences with ``nltk.sent_tokenize``; each
         sentence is tokenised with ``simple_preprocess(..., deacc=True)`` and
         tokens found in the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        A list with one list of tokens per sentence. A sentence whose tokens
        are all filtered out is kept as an empty list.
    """
    duration_index = text.find("Duration:")
    if duration_index != -1:
        text = text[duration_index:]

    text = re.sub(r'\b\d{1,2}:\d{2}:\d{2}\b', '', text)

    # The previous pattern r'\b[I|R]:\b' had two problems: '|' inside a
    # character class is a literal pipe, and the trailing \b only matches when
    # a word character directly follows the colon, so the usual "I: text"
    # form (colon followed by a space) was never removed.
    text = re.sub(r'\b[IR]:', '', text)

    cleaned_sentences = []
    for sentence in nltk.sent_tokenize(text):
        tokens = simple_preprocess(sentence, deacc=True)
        cleaned_sentences.append([word for word in tokens if word not in stop_words])
    return cleaned_sentences

def process_directory(directory_path):
    """Preprocess every .docx file found directly inside ``directory_path``.

    Files whose name starts with "~$" are skipped. Files are visited in
    sorted name order so that the corpus and ``doc_names`` are reproducible
    from run to run.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A tuple ``(all_cleaned_sentences, doc_names)``:
        ``all_cleaned_sentences`` is one flat list holding the token lists of
        every sentence of every processed file, and ``doc_names`` lists the
        processed file names in the order they were read.
    """
    all_cleaned_sentences = []
    doc_names = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".docx") or filename.startswith("~$"):
            continue
        file_path = os.path.join(directory_path, filename)
        all_cleaned_sentences.extend(preprocess_text(read_docx(file_path)))
        doc_names.append(filename)
    return all_cleaned_sentences, doc_names

# 3. Execution of Main Process

# Directory containing your .docx files
directory_path = '/Users/sunmoon/Desktop/interviews'

# Build the training corpus (one token list per sentence) and keep the file names
all_cleaned_sentences, doc_names = process_directory(directory_path)

# Train a Word2Vec model with the cleaned sentences
model = Word2Vec(
    sentences=all_cleaned_sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Save the model
model_save_path = '/Users/sunmoon/Desktop/word2vec_model.model'
model.save(model_save_path)


# Utilize the model
def _analogy(positive, negative, topn=10):
    """Run one most_similar query on the trained model, print it and return it."""
    neighbours = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
    print(neighbours)
    return neighbours


result_conditional = _analogy(['wife', 'title'], ['man'], topn=1)

result = _analogy(['wife', 'land'], ['husband'])
result1 = _analogy(['man', 'title'], ['power'])

result2 = _analogy(['wife', 'condition'], ['husband'])

result3 = _analogy(['wife', 'power'], ['husband'])

# NOTE: result3 is rebound by the next query, so the wife/power result above
# survives only in the printed output.
result3 = _analogy(['husband', 'power'], ['wife'])
result4 = _analogy(['husband', 'title'], ['wife'])

result5 = _analogy(['husband', 'land'], ['wife'])

[('signed', 0.7266936898231506)]
[('consult', 0.7582036256790161), ('need', 0.7334017753601074), ('approval', 0.7002829909324646), ('sign', 0.696032702922821), ('buyer', 0.6958451867103577), ('seller', 0.6936970353126526), ('without', 0.6869668364524841), ('agreement', 0.6738719344139099), ('signature', 0.6664913296699524), ('permission', 0.6537427306175232)]
[('name', 0.7067655324935913), ('names', 0.6978930830955505), ('include', 0.6301841735839844), ('included', 0.6283405423164368), ('wife', 0.6223018169403076), ('sign', 0.6200464367866516), ('reason', 0.6187673807144165), ('signature', 0.613850474357605), ('woman', 0.6068125367164612), ('agreement', 0.5849668979644775)]
[('agreed', 0.8761587142944336), ('discuss', 0.8604610562324524), ('signing', 0.8531066179275513), ('signs', 0.847666323184967), ('signatures', 0.8463988304138184), ('discussed', 0.8458548188209534), ('papers', 0.8434476256370544), ('accept', 0.843253493309021), ('offer', 0.8392332792282104), ('putting', 0.837442040

In [9]:
def document_vector(doc, keyed_vectors=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Iterable of token strings.
        keyed_vectors: Object exposing ``key_to_index``, ``vector_size`` and
            indexing by a list of words (e.g. a gensim ``KeyedVectors``).
            Defaults to the notebook-level ``model.wv``.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``vector_size``. When none of the
        tokens is in the vocabulary a zero vector is returned, so the result
        always has the same type and shape (previously an empty list was
        returned in that case).
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    words = [word for word in doc if word in keyed_vectors.key_to_index]
    if not words:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[words], axis=0)

# Example usage
# Each item of all_cleaned_sentences is the token list of a single sentence,
# so this builds one averaged vector per sentence.
doc_vectors = list(map(document_vector, all_cleaned_sentences))

from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector (array-like of numbers).

    Returns:
        ``1 - scipy.spatial.distance.cosine(vec1, vec2)``. If either vector is
        empty or all zeros the similarity is undefined (division by a zero
        norm); 0.0 is returned in that case instead of NaN or an exception.
    """
    # np.any() is False for both an empty and an all-zero array.
    if not np.any(np.asarray(vec1)) or not np.any(np.asarray(vec2)):
        return 0.0
    return 1 - spatial.distance.cosine(vec1, vec2)

# Example: compare the first two entries of doc_vectors
first_vector = doc_vectors[0]
second_vector = doc_vectors[1]
similarity = cosine_similarity(first_vector, second_vector)
print(f"Similarity between document 1 and document 2: {similarity}")


Similarity between document 1 and document 2: 0.5829067739400201
{'5610031-FA-11072019.docx': array([ 0.06808852,  0.3739277 , -0.09613427,  0.07724917, -0.07074348,
       -0.6362215 ,  0.1344228 ,  0.98651046, -0.24355337,  0.06344256,
       -0.02118757, -0.3730866 , -0.07297059,  0.00403231,  0.06464345,
       -0.07839628,  0.01793848, -0.38785124, -0.39976645, -0.24141943,
        0.04320994,  0.21240267,  0.09645537, -0.47431797,  0.05271796,
        0.31824178, -0.03588119, -0.30342358, -0.4852934 , -0.07240617,
        0.13151558, -0.3687325 ,  0.21095905,  0.3600325 , -0.20343927,
        0.37332317, -0.01724249, -0.05562028, -0.38682687, -0.6085618 ,
       -0.25499982, -0.49554667, -0.01311465,  0.2213327 , -0.06852473,
       -0.46222907, -0.44391903, -0.2657531 , -0.31688222, -0.08440889,
       -0.1176305 , -0.44969052,  0.06463506, -0.2960095 , -0.37443608,
        0.16316113,  0.31304145, -0.19315526, -0.36550707,  0.18223454,
        0.04111025, -0.11232986, -0.427046