In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:

# Load the preprocessed documents from the pandas dataframe
df = pd.read_csv('alldoc_processed.csv')

# Tokenize the documents
documents = df['text2'].apply(lambda x: x.split())

# Calculate the TF-IDF scores for each word
tfidf = TfidfVectorizer()
tfidf.fit(df['text2'])
tfidf_scores = tfidf.transform(df['text2']).toarray()

# Define the parameters for the Word2Vec model
vector_size = 200
window_size = 5
min_count = 1
epochs = 100

# Train the Word2Vec model on the tokenized documents
model = Word2Vec(documents, vector_size=vector_size, window=window_size, min_count=min_count, epochs=epochs)

# Create a dictionary of word to index mapping
word2idx = {word: idx for idx, word in enumerate(model.wv.index_to_key)}

# Create an empty matrix to store the weighted word vectors
word_vectors = np.zeros((len(word2idx), vector_size))

# Calculate the weighted word vectors
for word in word2idx.keys():
    idx = word2idx[word]
    tfidf_score = tfidf.vocabulary_.get(word, 0)
    word_vector = model.wv[word]
    weighted_word_vector = word_vector * tfidf_score
    word_vectors[idx] = weighted_word_vector

# Save the word vectors to a CSV file
df_word_vectors = pd.DataFrame(word_vectors, index=word2idx.keys())
df_word_vectors.to_csv('tfidf_weighted_word_vectors.csv')


In [5]:
 df_word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
section,-405068.7639,402403.442067,-169183.236705,368462.410691,-339972.860615,-223949.440353,561734.26972,167815.397498,134191.401599,37283.403247,...,567345.616958,-789390.870801,135497.140213,581779.084395,268858.175725,241707.902475,-294817.418549,-222935.121415,-212377.770571,-23312.620316
act,-40897.761719,10975.735352,3354.517822,17156.941406,-21851.822266,-18822.019531,54324.941406,62226.785156,-9434.180664,18313.675781,...,11679.283203,-47052.714844,2565.539795,5155.882324,26838.494141,-18553.822266,-24485.322266,-35318.546875,-19155.75,-34392.617188
clause,-52295.632812,-75039.8125,-105058.515625,-1408.360107,-12863.462891,126662.695312,38717.785156,85363.289062,69568.421875,-17281.40625,...,-84457.601562,55445.683594,-46506.035156,68641.65625,32794.351562,-103982.15625,-6232.87793,4570.759766,23101.347656,-32138.695312
case,30813.597656,-81785.085938,26999.462891,86575.25,41625.484375,38154.246094,89857.898438,-10804.091797,-59168.472656,-80045.9375,...,-5171.226562,-24970.029297,-18368.898438,81909.328125,-30888.158203,-11807.948242,-29847.513672,8491.957031,28239.064453,11755.750977
order,-33614.767922,434742.004047,638564.683419,-70808.968698,113318.181577,-167250.704972,2092.799376,-267278.510793,-84420.737242,202132.169201,...,-41205.838491,-3918.790109,-212551.710623,104518.116443,-110942.816133,-252126.682446,9984.058084,245545.500047,-47335.012268,124523.975307


In [10]:
w2v = pd.read_csv('tfidf_weighted_word_vectors.csv',index_col=0)

In [11]:
w2v.loc['section',:]

0     -405068.763900
1      402403.442067
2     -169183.236705
3      368462.410691
4     -339972.860615
           ...      
195    241707.902475
196   -294817.418549
197   -222935.121415
198   -212377.770571
199    -23312.620316
Name: section, Length: 200, dtype: float64