-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2vec_main_yelp.py
executable file
·130 lines (108 loc) · 3.86 KB
/
doc2vec_main_yelp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 10 11:30:58 2016
@author: SaeromPark
"""
#import gensim
#from gensim.models.doc2vec import *
import os,sys
cwd = os.getcwd()
from doc2vec_revised import *
import word2vec
import pickle
import numpy as np
import random
import re
import copy
import nltk
import pandas as pd
from IPython import embed
# --- Data loading ---------------------------------------------------------
# Project data directory (only used by the retired Amazon loader below) and
# the Yelp review year to process.
data_path = cwd + '/data'
year = '2013'

# Load the pickled review texts and star-rating labels for the chosen year.
# `with` guarantees the handles are closed even if unpickling raises
# (the original open/close pairs leaked the handle on error).
with open('../data/yelp_reviews_text_'+year, 'rb') as f:
    texts = pickle.load(f)
with open('../data/yelp_reviews_label_'+year+'.pickle', 'rb') as f:
    labels = pickle.load(f)

# Numpy arrays so we can use boolean / fancy indexing below.
texts = np.array(texts)
labels = np.array(labels)
print(year)
# --- Shuffle and build the balanced semi-supervised split -----------------
# BUG FIX: random.shuffle() shuffles in place and returns None, so the
# original `aa = random.shuffle(list(range(...)))` bound aa to None and the
# data was never actually shuffled (indexing with None merely added an axis).
# Build the permutation first, then shuffle it in place.
aa = list(range(labels.shape[0]))
random.shuffle(aa)
texts = texts[aa]
labels = np.array(labels[aa])

# Star ratings: 1-2 -> negative (-1), 4-5 -> positive (+1); 3-star reviews
# match neither mask and are dropped from the text selection below.
nega_idx = labels < 3
posi_idx = labels > 3
labels[nega_idx] = -1
labels[posi_idx] = 1
nega_text = list(texts[nega_idx])
posi_text = list(texts[posi_idx])
# Balance the classes by truncating both to the smaller class size.
len_text = min(len(nega_text), len(posi_text))

semi_ratio = [0.3,0.4,0.5,0.6,0.7] # the ratio of unlabeled data is 0.3
rr = semi_ratio[4]                 # ratio of documents treated as unlabeled
unl_num = round(rr*len_text)       # unlabeled documents taken from each class
print("yelp ",year)
print("The ratio of data: ",rr)
print("The number of unlabeled data: ",unl_num)

# Document order: labeled positives, labeled negatives, then the unlabeled
# portion (first unl_num of each class) at the end. The tagging loop later
# relies on exactly this ordering (POS_*, NEG_*, UNLA_*).
text = posi_text[unl_num:len_text] + nega_text[unl_num:len_text] + posi_text[:unl_num]+ nega_text[:unl_num]
label = list(labels[posi_idx][unl_num:len_text]) + list(labels[nega_idx][unl_num:len_text]) + list(labels[posi_idx][:unl_num])+list(labels[nega_idx][:unl_num])
print("The number of documents: ",len(label))
#text = posi_text + nega_text + unla_text[:1000]
del texts  # free the full review array; only `text`/`label` are needed now

# Split each review into crude sentences: '!', '?', ';', ':' act as sentence
# terminators, newlines become spaces, then split on '.'.
# (Lower-casing / letters-only filtering happens later, per sentence.)
punct_map = str.maketrans({'!': '.', '?': '.', ';': '.', ':': '.', '\n': ' '})
documents = [document.translate(punct_map).strip().split('.') for document in text]
del text

true_labels = label
# Training labels: true labels with the unlabeled pool zeroed out.
# BUG FIX: the unlabeled documents sit at the *end* of the ordering (they are
# tagged UNLA_* below), but the original zeroed the front slice
# (doc_labels[:-2*unl_num] = 0), i.e. it blanked the labeled docs and kept
# labels on the unlabeled pool — also zeroing only 30% of the data while
# printing "ratio of unlabeled data: 0.7". Zero the trailing 2*unl_num
# entries instead.
doc_labels = np.array(label, copy=True)
doc_labels[-2*unl_num:] = 0
#doc_labels = list(doc_labels)
labels = {'train_label': doc_labels, 'true_label': true_labels}
labels = pd.DataFrame(labels)
"""
f = open('../data/yelp'+year+'_semi_ratio_'+str(rr)+'_label.txt','wb')
pickle.dump(labels,f)
f.close()
"""
# --- Tag every sentence with its document-level tag ------------------------
# Documents are ordered labeled-positive, labeled-negative, unlabeled
# (see the split above), which determines the POS_/NEG_/UNLA_ tag ranges.
sentences = []
for uid, doc in enumerate(documents):
    for sen in doc:
        # BUG FIX: str.lower() returns a new string; the original bare
        # `sen.lower()` discarded the result, so sentences were never
        # lower-cased before the letters-only filter.
        sen = sen.lower()
        sen = re.sub("[^a-zA-Z]"," ", sen)  # keep letters only
        if uid < len_text-unl_num:
            sentence = TaggedDocument(words = sen.split(),tags = ['POS_%s' %uid])
        elif uid <2*(len_text-unl_num):
            sentence = TaggedDocument(words = sen.split(),tags = ['NEG_%s' %(uid-len_text+unl_num)])
        else:
            sentence = TaggedDocument(words = sen.split(),tags = ['UNLA_%s' %(uid-2*(len_text-unl_num))])
        sentences.append(sentence)
print("length of sentences = ",len(sentences))
print("start to training")
del documents
# --- Train one Doc2Vec model per neighbor-regularization weight ------------
d_size = 200  # document/word embedding dimensionality
#beta = [0.1]
beta = [0.01,0.02,0.03,0.05,0.1]  # regularization weights to sweep
for be in beta:
    #embed()
    print('beta = ',be)
    # Doc2Vec here is the project-local `doc2vec_revised` variant; doc_labels
    # supplies the (semi-supervised) training labels per document.
    model_neighbor = Doc2Vec(sentences,doc_labels,beta = be,learn_unlabel = 0, size=d_size, window = 3, min_count=3, workers = 8, dbow_neighbor = 1,iter = 10)
    file_name = '../results/yelp_'+year+'_doc2vec_dbow_neighbor_'+str(rr)+'_beta_'+str(be)+ '.doc2vec'
    model_neighbor.save(file_name)
    # Persist the learned document vectors together with both label columns
    # for the downstream classification experiments.
    doctag = list(model_neighbor.docvecs.doctag_syn0)
    doc2vec = {'train_label':doc_labels,'true_label':true_labels,'docvec':doctag}
    doc2vec = pd.DataFrame(doc2vec)
    # `with` closes the handle even if pickling raises (the original
    # open/dump/close leaked the handle on error).
    with open('../results/yelp_'+year+'_doc2vec_dbow_neighbor_'+str(rr)+'_beta_'+str(be)+'_data.pickle','wb') as f:
        pickle.dump(doc2vec,f)
#embed()
#model_neighbor.save('results/doc2vec_dbow_neighbor_yelp_test_beta0.025.doc2vec')