-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
59 lines (50 loc) · 1.86 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from nltk import tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stop_words = None
def preprocessing_en(text):
    """Tokenize, lemmatize and filter an English text.

    Parameters
    ----------
    text : str
        Raw English text.

    Returns
    -------
    list[str]
        Lowercased lemmas with stopwords, punctuation, non-alphabetic
        tokens and words of length <= 3 removed.
    """
    # Word tokenization (lowercased up front so stopword matching works)
    tokenizer = tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    # Normalization: lemmatize each token, mapping its Penn Treebank POS
    # tag into the WordNet tagset the lemmatizer expects.
    lemmatizer = WordNetLemmatizer()
    lemma_list = [
        lemmatizer.lemmatize(word, pos=map_postag_into_wordnet(tag))
        for word, tag in pos_tag(tokens)
    ]
    # Removing stopwords and punctuation.
    # Build the stopword collection lazily, once per process, and keep it
    # as a set so each membership test below is O(1) instead of a linear
    # scan of the whole wordlist per token.
    global stop_words
    if stop_words is None:
        words = set(stopwords.words("english"))
        # Project-specific extra stopwords; a missing file still raises
        # (as before) rather than silently degrading the filtering.
        with open('stop_words_english.txt', encoding='utf-8') as f:
            words.update(f.read().splitlines())
        stop_words = words
    return [
        word for word in lemma_list
        if word not in stop_words and word.isalpha() and len(word) > 3
    ]
def map_postag_into_wordnet(postag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Tags beginning with J are adjectives, V are verbs, R are adverbs;
    N — and any tag that matches none of the above — maps to noun,
    which is also the WordNet lemmatizer's own default.

    Parameters
    ----------
    postag : str
        A tag as produced by ``nltk.pos_tag``.

    Returns
    -------
    str
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV``,
        ``wordnet.NOUN``.
    """
    # Only the first letter of the tag carries the coarse word class.
    initial = postag[0].lower()
    if initial == "j":
        return wordnet.ADJ
    if initial == "v":
        return wordnet.VERB
    if initial == "r":
        return wordnet.ADV
    # Nouns and every unrecognized tag fall through to NOUN.
    return wordnet.NOUN