In [1]:
#for scraping data from HTML 
from bs4 import BeautifulSoup

#for pulling , pushing and authenticating 
import requests

#for carrying out regular expressions operations
import re

import operator

import json

from tabulate import tabulate
import sys
from stop_words import get_stop_words

In [2]:
#get words
def getWords(link, timeout=10):
    """Download a web page and return the cleaned words of its paragraphs.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up and
        raises a timeout error. Defaults to 10.

    Returns
    -------
    list of str
        Lower-cased, whitespace-separated words from every ``<p>`` element,
        in document order, each passed through ``cleanWord``. Words that
        clean down to an empty string are dropped.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # requests waits indefinitely unless a timeout is given, which would
    # leave the caller's Timeout handling with nothing to catch.
    page = requests.get(link, timeout=timeout)

    # Parse the decoded response body with the lxml parser.
    soup = BeautifulSoup(page.text, 'lxml')

    list_of_words = []
    for paragraph in soup.find_all('p'):
        # A tag's .text is always a string (possibly empty), so no None
        # check is needed before lower-casing and splitting it.
        for each_word in paragraph.text.lower().split():
            clean_word = cleanWord(each_word)
            if clean_word:
                list_of_words.append(clean_word)

    return list_of_words

In [3]:
#function to clean words using regex
def cleanWord(words):
    """Return ``words`` with every character that is not an ASCII letter removed.

    Digits, punctuation, whitespace and non-ASCII characters are all dropped;
    the result may be an empty string.
    """
    non_letters = re.compile('[^A-Za-z]+')
    return non_letters.sub('', words)

In [4]:
#removing stop Words
def remove_stopwords(word_list, stop_words=None):
    """Drop stop words from a list of (word, count) pairs.

    Parameters
    ----------
    word_list : iterable of (word, count) pairs
        Pairs to filter; their order is preserved.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the English list from
        ``get_stop_words('en')`` is used.

    Returns
    -------
    list of [word, count]
        A two-item list for every pair whose word is not a stop word.
    """
    if stop_words is None:
        stop_words = get_stop_words('en')

    # A set makes each membership test constant-time instead of scanning
    # the whole stop-word list once per word.
    stop_set = set(stop_words)

    return [[word, count] for word, count in word_list if word not in stop_set]

In [5]:
def frequencyTable(word_list):
    """Count how many times each word occurs in ``word_list``.

    Returns a dict mapping each distinct word to its number of occurrences.
    """
    tally = {}
    for token in word_list:
        # Unseen words start from zero.
        tally[token] = tally.get(token, 0) + 1
    return tally

In [7]:
# Wikipedia endpoints: the search API resolves free text to a page title,
# and the article prefix is used to build the URL of that page.
api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
website_link = "https://en.wikipedia.org/wiki/"

# Python 2/3 shims, kept local to this cell so it runs on either interpreter.
try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2
try:
    read_input = raw_input  # Python 2
except NameError:
    read_input = input  # Python 3

# Seconds to wait for the search API. requests waits indefinitely unless a
# timeout is given, so without it the Timeout handler below has nothing to catch.
REQUEST_TIMEOUT = 10

# The search term is read interactively; command-line arguments are never
# used, so their count is not checked.
#get the search word
query = read_input("\n Enter term term to be searched in WIKI:   ")

yes_no = read_input("\n DO you want to remove the stop words? 'yes' or 'no' :   ")
# Compare the answer itself rather than its length, so "nope" is not taken
# as yes and "y" is not taken as no.
search_mode = yes_no.strip().lower() in ('yes', 'y')

#URL
# Percent-encode the term so characters such as '&' or '#' cannot alter
# the query string.
url = api_link + quote(query)


try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    data = json.loads(response.content.decode("utf-8"))

    # An empty or rejected search has no usable hits; report it instead of
    # failing on a missing key or on [0].
    search_results = data.get('query', {}).get('search', [])

    if not search_results:
        print("No Wikipedia page found for: " + query)
    else:
        #format data
        page_tag = search_results[0]['title']

        #recreate new URL
        # Titles may contain spaces, '?' or non-ASCII characters; encode to
        # UTF-8 bytes so quote() behaves the same on Python 2 and 3.
        new_url = website_link + quote(page_tag.replace(' ', '_').encode('utf-8'))
        word_list_from_page = getWords(new_url)

        #get word counts
        page_word_count = frequencyTable(word_list_from_page)
        #sort the word freq list
        sorted_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)

        #remove stop words
        if search_mode:
            sorted_list = remove_stopwords(sorted_list)

        # Percentages are relative to the words that remain after the
        # optional stop-word filtering.
        total_words_sum = sum(value for key, value in sorted_list)

        #get top 20 words
        sorted_list = sorted_list[:20]

        final_list = []
        for key, value in sorted_list:
            percent_value = float(value * 100) / total_words_sum
            final_list.append([key, value, round(percent_value, 4)])

        print_headers = ['Word', 'Frequency', 'Frequency Percentage']

        #print the table with tabulate
        print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

except requests.exceptions.Timeout:
    print("The server didn't respond. Please, try again later.")
except requests.exceptions.RequestException as error:
    # Connection failures and HTTP error statuses end up here.
    print("Request to Wikipedia failed: " + str(error))



 Enter term term to be searched in WIKI:   shah rukh khan

 DO you want to remove the stop words? 'yes' or 'no' :   yes
| Word        |   Frequency |   Frequency Percentage |
|-------------+-------------+------------------------|
| khan        |         139 |                 3.5577 |
| film        |          66 |                 1.6893 |
| films       |          30 |                 0.7679 |
| india       |          29 |                 0.7423 |
| indian      |          28 |                 0.7167 |
| performance |          27 |                 0.6911 |
| khans       |          27 |                 0.6911 |
| bollywood   |          26 |                 0.6655 |
| best        |          25 |                 0.6399 |
| role        |          25 |                 0.6399 |
| filmfare    |          24 |                 0.6143 |
| actor       |          21 |                 0.5375 |
| million     |          21 |                 0.5375 |
| first       |          21 |                 0.5375 |

In [None]:
# Debug: show the article URL that was passed to getWords.
print(new_url)

In [None]:
# Debug: show the title of the first search result.
print(page_tag)

In [None]:
# Debug: show the Wikipedia article URL prefix.
print(website_link)