bag_of_words.py
import nltk
import numpy as np
import re
import heapq
import pandas as pd
from nltk.corpus import stopwords

# NLTK resources: 'punkt' is required by sent_tokenize/word_tokenize;
# 'stopwords' is downloaded but not applied later in the script.
nltk.download('punkt')
nltk.download('stopwords')
# Each row of datacrawler.csv describes one sub-feature of a feature.
datas = pd.read_csv('datacrawler.csv')
s = datas['Feature'].unique()

list_text_per_feature = []
um_text = ""
wb_text = ""
ac_text = ""
vc_text = ""
ss_text = ""
cb_text = ""
ls_text = ""

# Concatenate feature name, description, sub-feature name and sub-description
# into one running text per feature.
for a, b, c, d in zip(datas['Feature'], datas['Desc'], datas['Sub Features'], datas['Sub Desc']):
    if a == s[0]:
        um_text = um_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[1]:
        ls_text = ls_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[2]:
        vc_text = vc_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[3]:
        ac_text = ac_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[4]:
        cb_text = cb_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[5]:
        ss_text = ss_text + " " + a + " " + b + " " + c + " " + d
    elif a == s[6]:
        wb_text = wb_text + " " + a + " " + b + " " + c + " " + d
# Collect the per-feature texts in a fixed order.
list_text_per_feature.append(um_text)
list_text_per_feature.append(ac_text)
list_text_per_feature.append(vc_text)
list_text_per_feature.append(wb_text)
list_text_per_feature.append(ss_text)
list_text_per_feature.append(ls_text)
list_text_per_feature.append(cb_text)
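# A more compact alternative for the accumulation above would be a dict keyed by
# feature name (a sketch only, assuming the same CSV columns; the script keeps the
# explicit per-feature variables instead):
#
#   text_per_feature = {name: "" for name in s}
#   for a, b, c, d in zip(datas['Feature'], datas['Desc'], datas['Sub Features'], datas['Sub Desc']):
#       text_per_feature[a] += " " + a + " " + b + " " + c + " " + d
#   # list_text_per_feature could then be built by picking the desired order from the dict.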
# Split each feature text into sentences and normalise them.
list_corpus = []
for a in list_text_per_feature:
    corpus = nltk.sent_tokenize(a)
    print(corpus)
    for i in range(len(corpus)):
        corpus[i] = corpus[i].lower()
        corpus[i] = re.sub(r'\W', ' ', corpus[i])   # drop punctuation and other non-word characters
        corpus[i] = re.sub(r'\s+', ' ', corpus[i])  # collapse runs of whitespace
    list_corpus.append(corpus)
    print(corpus)
print(list_corpus)
# Count how often each token appears across all corpora.
wordfreq = {}
for corpus in list_corpus:
    for sentence in corpus:
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            if token not in wordfreq:
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
print(wordfreq)

# Keep the 200 most frequent tokens as the bag-of-words vocabulary.
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)
print(most_freq)
# Build a binary bag-of-words vector per sentence:
# 1 if the vocabulary token occurs in the sentence, 0 otherwise.
list_sentence_vectors = []
for corpus in list_corpus:
    sentence_vectors = []
    for sentence in corpus:
        sentence_tokens = nltk.word_tokenize(sentence)
        sent_vec = []
        for token in most_freq:
            if token in sentence_tokens:
                sent_vec.append(1)
            else:
                sent_vec.append(0)
        sentence_vectors.append(sent_vec)
    list_sentence_vectors.append(sentence_vectors)

# The per-feature corpora have different sentence counts, so the result is ragged;
# dtype=object keeps numpy from trying to build a rectangular array.
sentence_vectors = np.asarray(list_sentence_vectors, dtype=object)
print(sentence_vectors)
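# For reference, a comparable binary bag-of-words matrix per corpus could be built
# with scikit-learn's CountVectorizer (a sketch, assuming scikit-learn is available;
# it is not a dependency of this script and its tokenizer differs from NLTK's):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   vectorizer = CountVectorizer(vocabulary=most_freq, binary=True)
#   bow_per_feature = [vectorizer.transform(corpus).toarray() for corpus in list_corpus]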
# Optionally write the vectors to a text file, one sentence vector per line:
# with open("output.txt", "w") as txt_file:
#     for feature_vectors in list_sentence_vectors:
#         for line in feature_vectors:
#             txt_file.write(" ".join(str(x) for x in line) + "\n")