-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_word2vec.py
37 lines (30 loc) · 1.2 KB
/
generate_word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from gensim.models.word2vec import Word2Vec
from collections import Counter
import os
from glob import glob
from zipfile import ZipFile
import sys
import _pickle as pickle
# Root of the cleaned corpus. Every entry that is neither hidden nor a zip
# archive is expected to contain a "maj" sub-directory of pickle files.
DATA_ROOT = '/scratch/bsg348-share/MLCS/data-new/clean_Mar_20'

# File name the trained model is saved under (relative to the working dir).
MODEL_PATH = "word2vec_model_cleaned_data"


def load_sentences(data_root=DATA_ROOT):
    """Collect tokenized paragraphs from every pickle under ``<data_root>/*/maj``.

    Entries of ``data_root`` whose names end in ``zip`` or start with ``.``
    are skipped. Each remaining entry must contain a ``maj`` directory whose
    files unpickle to an iterable of strings (one string per paragraph).
    The name of each processed entry is printed as a progress indicator.

    Args:
        data_root: Directory holding one sub-directory per document group.

    Returns:
        A list of token lists. Each paragraph is stripped, lower-cased and
        split on single spaces; empty tokens produced by repeated spaces are
        removed, and paragraphs that yield no tokens are omitted.

    Raises:
        OSError: If ``data_root`` or an expected ``maj`` directory is missing.
    """
    sentences = []
    # Sorted so the corpus order does not depend on the filesystem's
    # arbitrary listing order.
    for directory in sorted(os.listdir(data_root)):
        if directory.endswith('zip') or directory.startswith('.'):
            continue
        maj_dir = os.path.join(data_root, directory, 'maj')
        file_names = sorted(os.listdir(maj_dir))
        print(directory)
        for file_name in file_names:
            with open(os.path.join(maj_dir, file_name), mode='rb') as f_obj:
                # NOTE: unpickling can execute arbitrary code; only run this
                # on corpus files from a trusted source.
                paragraphs = pickle.load(f_obj)
            for para in paragraphs:
                # Filter out "" tokens, which split(" ") emits for runs of
                # spaces and for blank paragraphs.
                tokens = [w for w in para.strip().lower().split(" ") if w]
                if tokens:
                    sentences.append(tokens)
    return sentences


def main():
    """Build the corpus, train a Word2Vec model on it, and save the model."""
    all_sentences = load_sentences()
    print('training word2vec started')
    # size: length of each word vector (NOTE: gensim >= 4.0 renamed this
    # keyword to ``vector_size``); min_count: words occurring fewer times
    # than this are ignored; workers: number of parallel training threads.
    model = Word2Vec(all_sentences, size=100, window=5, min_count=5, workers=4)
    model.save(MODEL_PATH)


if __name__ == "__main__":
    main()
'''
Usage:
model=Word2Vec.load('word2vec_model_cleaned_data')
model.wv.similarity('man','woman')
'''