-
Notifications
You must be signed in to change notification settings - Fork 1
/
LangProcessor.py
67 lines (55 loc) · 1.64 KB
/
LangProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import nltk
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# One-time fetch of the NLTK data packages used below; nltk.download() is
# cached, so these are no-ops when the data is already present locally.
nltk.download('averaged_perceptron_tagger')  # POS tagger model for nltk.pos_tag
nltk.download('punkt')                       # tokenizer models backing word_tokenize
nltk.download('stopwords')                   # stopword lists for corpus filtering
# English stopwords as a set for O(1) membership tests in addtoCorpus.
stop_words = set(stopwords.words('english'))
class LangProcessor():
    '''
    Process tokenized tweets for a slang word: infer its most common part of
    speech, append tweet text to a corpus file, train a Word2Vec model to
    find similar words, and compose a human-readable definition.
    '''

    @staticmethod
    def _most_frequent_key(counts):
        '''
        Return the key with the highest count in the dict *counts*,
        or '' when *counts* is empty.
        '''
        if not counts:
            return ''
        return max(counts, key=counts.get)

    def get_pos(self, query, tweets):
        '''
        Takes in a list of tokenized tweets and finds the most commonly
        assigned part of speech (Penn Treebank tag) for *query*.

        Returns '' when *query* appears in none of the tweets.
        '''
        pos_dict = {}
        for tweet in tweets:
            if query in tweet:
                query_index = tweet.index(query)
                sent_pos = nltk.pos_tag(tweet)
                query_pos = sent_pos[query_index][1]
                # Count tag occurrences; dict.get avoids the if/else branch.
                pos_dict[query_pos] = pos_dict.get(query_pos, 0) + 1
        # BUGFIX: the original loop assigned `pos_dict[key] = freq_val`
        # (zeroing counts) instead of updating the running maximum, so it
        # returned the last tag seen rather than the most frequent one.
        return self._most_frequent_key(pos_dict)

    def addtoCorpus(self, textfile, tweets):
        '''
        Append each tweet's tokens to *textfile* (one tweet per line),
        skipping English stopwords and purely numeric tokens.
        '''
        # The with-statement closes the file; the original's trailing
        # file.close() after the block was redundant and removed.
        with open(textfile, "a") as file:
            for tweet in tweets:
                for word in tweet:
                    if word not in stop_words and not word.isdigit():
                        file.write(word + ' ')
                file.write('\n')

    def build_model(self, corpus, query):
        '''
        Train a Word2Vec model on *corpus* (an iterable of token lists)
        and return the words most similar to *query* as (word, score) pairs.

        Raises KeyError if *query* is not in the trained vocabulary.
        '''
        model = Word2Vec(corpus)  # train a model from the corpus
        # BUGFIX: Model-level most_similar() was removed in gensim 4;
        # the keyed vectors live on model.wv on both gensim 3 and 4.
        return model.wv.most_similar(query)

    def generateDefinition(self, word, pos, synonyms):
        '''
        Compose a one-line definition of *word* from up to its first three
        synonyms. *pos* is accepted for interface compatibility but unused.

        The original indexed synonyms[0..2] unconditionally and raised
        IndexError on shorter lists; this degrades gracefully instead.
        '''
        top = list(synonyms[:3])
        if not top:
            return word + ' relates to no known synonyms'
        if len(top) == 1:
            return word + ' relates to ' + top[0]
        # For three synonyms this reproduces the original output exactly:
        # "<word> relates to a, b, or c".
        return word + ' relates to ' + ', '.join(top[:-1]) + ', or ' + top[-1]
def main():
    """Script entry point; currently a no-op placeholder."""
    return


if __name__ == "__main__":
    main()