main.py
#install dependencies first: pip install -r requirements.txt
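#requirements.txt is assumed to contain the packages the imports below
#need (listed here with their usual PyPI names for convenience):
#beautifulsoup4
#lxml
#requests
#tabulate
#stop-words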
#BeautifulSoup pulls data out of HTML for parsing
from bs4 import BeautifulSoup
#requests pulls and pushes data over the web
import requests
#re performs regular expression matching
import re
#operator wraps built-in operations (e.g. indexing) in named functions,
#which makes sort keys more readable
import operator
#json parses the JSON data format
import json
#tabulate takes a list of lists and displays a nicely formatted table
from tabulate import tabulate
#sys gives access to command line arguments
import sys
#stop words are common words that carry little meaning, e.g. at, the, to
from stop_words import get_stop_words
##functions
#fetch a page and return a list of the cleaned words on it
def getWordList(url):
    wordList = []
    #get raw data; time out so the Timeout handler below can fire
    sourceCode = requests.get(url, timeout=10)
    #convert data to text
    plainText = sourceCode.text
    #parse the page with the lxml parser
    soup = BeautifulSoup(plainText, 'lxml')
    #grab the words paragraph by paragraph; iterating over every tag
    #(findAll() with no arguments) would count nested text repeatedly
    for article in soup.find_all('p'):
        #content in this paragraph
        content = article.text
        #saves the article to a text file
        #for checking purposes
        #file = open('batmanarticle.txt', 'w', encoding='utf-8')
        #file.write(content)
        #file.close()
        #convert the text to lowercase and split it into a list of words
        words = content.lower().split()
        #put cleaned words into the list
        for word in words:
            #remove non-letter characters
            cleanedWord = cleanWord(word)
            if len(cleanedWord) > 0:
                wordList.append(cleanedWord)
    return wordList
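#e.g. getWordList("https://en.wikipedia.org/wiki/Batman") would return
#something like ['batman', 'is', 'a', 'superhero', ...] -- the exact
#words depend on the live article, so this is illustrative only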
def cleanWord(word):
    #regex to strip any characters that are not letters
    cleanedWord = re.sub('[^A-Za-z]+', '', word)
    return cleanedWord
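#a quick sanity check of the regex, kept as comments so the script still
#runs; the inputs are made-up examples:
#cleanWord("batman's") returns "batmans"
#cleanWord("1939,") returns ""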
def createFrequencyTable(wordList):
    wordCount = {}
    for word in wordList:
        #the word is the key, its count is the value
        if word in wordCount:
            wordCount[word] += 1
        else:
            wordCount[word] = 1
    return wordCount
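#for reference, collections.Counter in the standard library builds the
#same word -> count mapping in one call; shown as a comment to keep the
#original logic intact:
#from collections import Counter
#wordCount = dict(Counter(wordList))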
#remove stop words
def removeStopWords(frequencyList):
    stopWords = get_stop_words('en')
    tempList = []
    for key, value in frequencyList:
        if key not in stopWords:
            tempList.append([key, value])
    return tempList
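#e.g. removeStopWords([('the', 120), ('batman', 87)]) returns
#[['batman', 87]] -- the counts here are made up for illustration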
#Wikipedia search API endpoint (JSON format) and base article URL
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"
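#the search endpoint returns JSON shaped roughly like this (abridged,
#with illustrative values):
#{"query": {"search": [{"title": "Batman", ...}, ...]}}
#the code below only reads data['query']['search'][0]['title']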
#require a search term on the command line
if len(sys.argv) < 2:
    print("Enter a valid string")
    exit()
##run the program from the terminal, e.g.: python main.py batman yes
##any second argument turns on stop-word removal
#get the search word
string_query = sys.argv[1]
if len(sys.argv) > 2:
    search_mode = True
else:
    search_mode = False
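#worked example: "python main.py batman yes" gives
#sys.argv == ['main.py', 'batman', 'yes'], so string_query is 'batman'
#and search_mode is True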
#create the URL
#this url returns all the wikipedia articles related to the search query
url = wikipedia_api_link + string_query
#make the request inside a try/except block
#to handle any exceptions that occur
try:
    #use the requests library to fetch the search results
    response = requests.get(url, timeout=10)
    #decode the response as UTF-8 and parse the JSON
    data = json.loads(response.content.decode('utf-8'))
    #bail out if the search returned no articles
    if not data['query']['search']:
        print("No Wikipedia results for '" + string_query + "'")
        exit()
    #wikipediaPageTag stores the title of the first result wikipedia provides
    wikipediaPageTag = data['query']['search'][0]['title']
    #create the new url that leads to that first article
    url = wikipedia_link + wikipediaPageTag
    #getWordList() gets the words from the wikipedia page
    pageWordList = getWordList(url)
    #debug check of the first extracted word
    #print(pageWordList[0])
    #createFrequencyTable() counts how often each word appears
    pageWordCount = createFrequencyTable(pageWordList)
    #sort the frequency table from largest count to smallest
    sortedWordFrequencyList = sorted(pageWordCount.items(), key=operator.itemgetter(1), reverse=True)
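    #sorted() with operator.itemgetter(1) orders the (word, count) pairs
    #by their second element; e.g. with made-up values,
    #sorted([('a', 2), ('b', 5)], key=operator.itemgetter(1), reverse=True)
    #returns [('b', 5), ('a', 2)]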
    #remove stop words if that option was specified in command line args
    if search_mode:
        #removeStopWords() filters the pairs against the stop-word list
        #and returns a list of [word, count] lists
        sortedWordFrequencyList = removeStopWords(sortedWordFrequencyList)
    #sum all the counts so percentages can be calculated
    totalWordSum = 0
    for key, value in sortedWordFrequencyList:
        totalWordSum = totalWordSum + value
    #keep only the top 20 words if the list is longer than 20
    if len(sortedWordFrequencyList) > 20:
        sortedWordFrequencyList = sortedWordFrequencyList[:20]
    #final list: word + frequency + percentage
    finalList = []
    for key, value in sortedWordFrequencyList:
        #how often the word appears, as a percentage of all counted words
        percentageValue = float(value * 100) / totalWordSum
        finalList.append([key, value, round(percentageValue, 4)])
    printHeaders = ["Word", "Frequency", "Frequency Percentage"]
    print(tabulate(finalList, headers=printHeaders, tablefmt='orgtbl'))
#handle the request errors we expect from a flaky network
except requests.exceptions.Timeout:
    print("The server didn't respond. Please try again later.")
except requests.exceptions.ConnectionError:
    print("Connection error. Please check your network and try again.")