-
Notifications
You must be signed in to change notification settings - Fork 3
/
fasion_dataset.py
67 lines (61 loc) · 2.88 KB
/
fasion_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from PIL import Image
import numpy as np
from stop_words import *
from tqdm import tqdm
from train_model import text_w2model
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import multiprocessing
path = r'E:\数据集\Fashion-200k' # root directory of the Fashion-200k dataset
label_path = "labels/labels" # label directory, joined onto the dataset root when loading
def Fashion_200k(path, label_path, split="train"):
    """Load the Fashion-200k label files that belong to one split.

    Every file in ``os.path.join(path, label_path)`` whose name carries
    ``split`` as its second underscore-separated field (for example
    ``dress_train_detect_all``) is read line by line.  Each non-blank line
    is split on whitespace: field 0 is joined onto ``path`` as the image
    path and fields 2 onward form the caption (field 1 is not used here).
    In the caption every entry of ``symbol`` is replaced by a space and
    every token found in ``stop_words`` is dropped; both names come from
    the ``stop_words`` module imported at the top of the file.

    Args:
        path: Root directory of the dataset.
        label_path: Label directory, relative to ``path``.
        split: Split name matched against the label file names.

    Returns:
        A tuple ``(all_img_path, all_txt, label_data, all_words)`` of four
        parallel lists: image paths, cleaned captions joined by single
        spaces, integer class labels, and the token list of each caption
        (kept in reading order).  The label is the 0-based index of the
        matching label file, counted in sorted file-name order.
    """
    all_label_path = os.path.join(path, label_path)
    all_img_path = []  # one image path per caption line
    all_txt = []  # cleaned caption strings
    label_data = []  # class label per caption line
    all_words = []  # token lists, ready for word2vec training
    class_id = 0  # advanced once per matching label file
    count = 1  # running line number, used only for the progress message

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # file -> label mapping identical on every run and every machine.
    for name in sorted(os.listdir(all_label_path)):
        fields = name.split("_")
        # Stray files without a second "_" field are skipped instead of
        # raising IndexError.
        if len(fields) < 2 or fields[1] != split:
            continue
        print(f"正在操作{name}...")
        with open(os.path.join(all_label_path, name), "r", encoding='utf-8') as f:
            for content in f:
                print(f"正在读取Fasion-200k 第 {count} 行...")
                count += 1
                value = content.split()
                if not value:
                    # Blank line: there is no image path to record.
                    continue
                txt = " ".join(value[2:])
                for j in symbol:
                    txt = txt.replace(j, " ")
                # split() without an argument drops the empty tokens that
                # the symbol replacement leaves behind.
                tmp = [word for word in txt.split() if word not in stop_words]
                all_words.append(tmp)
                all_txt.append(" ".join(tmp))
                label_data.append(class_id)
                all_img_path.append(os.path.join(path, value[0]))
        class_id += 1
    return all_img_path, all_txt, label_data, all_words
# Script entry point: load the default (train) split from the configured paths.
if __name__ == "__main__":
    # cpu_count is only consumed by the commented-out Word2Vec setup below.
    cpu_count = multiprocessing.cpu_count()
    img_path, txt_data, label, all_words = Fashion_200k(path, label_path)
    # Disabled experiment: train word2vec on the tokens and print dataset statistics.
    # NOTE(review): `size=` / `iter=` look like pre-4.0 gensim keyword names - confirm
    # against the installed gensim version before re-enabling.
    # NOTE(review): `max_len` is not defined at module scope in this file, so the
    # first print below would fail if uncommented as-is.
    # model = Word2Vec(size=500, # create an empty model object (the original note said dimension 100, but size is 500)
    # min_count=5, # minimum word frequency
    # window=3, #
    # workers=cpu_count,
    # iter=5)
    # w2indx, w2vec, text_data, power_text_data = text_w2model(model, all_words, max_len=15)
    #
    # print("最大长度为", max_len)
    # print("img_path", len(img_path))
    # print(f"txt {len(txt_data)}")
    # print("label", len(label))
    # print("词", len(all_words))
    # print(all_words[:3])