/
toponly.py
executable file
·53 lines (46 loc) · 1.88 KB
/
toponly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# considers only the words in the top 9,989, 5000,
# 1000, or 100 most frequently used words in
# the corpus.
from sources import *
import re
# Load the most-frequent-word lists, one entry per line, into plain lists.
# The path names (topcorpuswords10000, ...) are not defined in this file;
# presumably they are provided by `from sources import *` -- confirm there.
#
# Only the line terminator is stripped from each entry.  Slicing off the
# last character unconditionally would truncate the final word whenever a
# list file does not end with a newline.
with open(topcorpuswords10000) as top10kfile:
    top10kraw = top10kfile.readlines()
top10k = [line.rstrip('\n') for line in top10kraw]

with open(topcorpuswords5000) as top5kfile:
    top5kraw = top5kfile.readlines()
top5k = [line.rstrip('\n') for line in top5kraw]

with open(topcorpuswords1000) as top1kfile:
    top1kraw = top1kfile.readlines()
top1k = [line.rstrip('\n') for line in top1kraw]

with open(topcorpuswords100) as top100file:
    top100raw = top100file.readlines()
top100 = [line.rstrip('\n') for line in top100raw]
# Tokeniser used for the 5000 and 1000 cut-offs: a run of word characters
# and apostrophes, or one single punctuation character.
_TOKEN_RE = re.compile(r"[\w']+|[!\"#$%&()*+,-./:;<=>?@\[\]^_`{|}~]")


def _keep_listed(tokens, wordlist):
    """Return the tokens that occur in wordlist, joined by single spaces."""
    # Build the set once per call so each membership test is O(1) instead
    # of a linear scan over a list of up to ~10k entries.
    allowed = set(wordlist)
    return ' '.join(w for w in tokens if w in allowed)


def top(fulltext, howmany = None):
    """Return fulltext reduced to the words found in a top-N word list.

    The text is lower-cased, tokenised, filtered against the chosen list
    and re-joined with single spaces.

    Parameters:
        fulltext: the text to filter (a string).
        howmany: which word list to filter against -- 10000, 5000, 1000
            or 100.  With None (the default) fulltext is returned
            unchanged (not lower-cased, not re-joined).

    Returns:
        The filtered text as a string, or fulltext itself when howmany is
        None.  Any other howmany value yields None.

    Tokenisation differs by cut-off and is kept as-is for backward
    compatibility: 10000 and 100 split on whitespace, so a word with
    punctuation attached (e.g. "word,") only survives if that exact string
    is in the list; 5000 and 1000 use a regex that separates punctuation
    into tokens of their own.
    """
    if howmany is None:
        return fulltext
    if howmany == 10000:
        return _keep_listed(fulltext.lower().split(), top10k)
    if howmany == 5000:
        return _keep_listed(_TOKEN_RE.findall(fulltext.lower()), top5k)
    if howmany == 1000:
        return _keep_listed(_TOKEN_RE.findall(fulltext.lower()), top1k)
    if howmany == 100:
        return _keep_listed(fulltext.lower().split(), top100)
    # Unsupported cut-off: callers have always received None here.
    return None