In [10]:
from bs4 import BeautifulSoup
import requests
import re
import operator
import json
from tabulate import tabulate
import sys
from stop_words import get_stop_words
from flask import jsonify
import numpy as np

In [3]:
def getWordList(url):
    word_list = []
    #raw data
    source_code = requests.get(url)
    #convert to text
    plain_text = source_code.text
    #lxml format
    soup = BeautifulSoup(plain_text,'lxml')

    #find the words in paragraph tag
    for text in soup.findAll('p'):
        if text.text is None:
            continue
        #content
        content = text.text
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-chars
            cleaned_word = clean_word(word)
            #if there is still something there
            if len(cleaned_word) > 0:
                #add it to our word list
                word_list.append(cleaned_word)

    return word_list

In [4]:
def clean_word(word):
    cleaned_word = re.sub('[^A-Za-z]+', ' ', word)
    return cleaned_word


def createFrquencyTable(word_list):
    #word count
    word_count = {}
    for word in word_list:
        #index is the word
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    return word_count

#remove stop words
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')

    temp_list = []
    for key,value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])

    return temp_list

In [5]:
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"

In [6]:
def wiki_title_extractor(title= "love"):
    url = wikipedia_api_link + title
    response = requests.get(url)
    data = json.loads(response.content.decode("utf-8"))
    titles = [data['query']['search'][i]['title'] for i in range(len(data['query']['search']))]
    return titles

In [7]:
wiki_title_extractor('isro')

[u'Indian Space Research Organisation',
 u'List of ISRO missions',
 u'ISRO Orbital Vehicle',
 u'ISRO Satellite Integration and Testing Establishment',
 u'NISAR (satellite)',
 u'Indian Space Research Organisation Satellite Centre',
 u'ISRO Telemetry, Tracking and Command Network',
 u'ISRO Propulsion Complex',
 u'ISRO Pad Abort Test',
 u'ISRO Inertial Systems Unit']

In [8]:
def wiki_content_extractor(title="JK rowling"):
    titles = wiki_title_extractor(title=title)
    bucket = []
    for title in range(len(titles)):
        url = wikipedia_link + titles[title]
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text,'lxml')
        for text in soup.findAll('p'):
            bucket.append(text.text.lower())
        moreLine = [clean_word(bucket[i].split('.')[0]) for i in range(len(bucket))]
        spli = [np.array(moreLine[i].split(' ')) for i in range(len(moreLine))]
        frq = createFrquencyTable(np.concatenate(spli))
        sort = sorted(frq.items(), key=operator.itemgetter(1), reverse=True)
        frequency = remove_stop_words(sort)[:30]
    return frequency

In [11]:
bucket_jk = wiki_content_extractor(title='isro')

In [27]:
len(bucket_jk)

30

In [35]:
ss = {'label':[bucket_jk[i][0] for i in range(len(bucket_jk))], 'freq': [bucket_jk[i][1] for i in range(len(bucket_jk))]}

In [39]:
ss['label']

[u'isro',
 u'',
 u'space',
 u'indian',
 u'satellite',
 u'india',
 u's',
 u'launch',
 u'satellites',
 u'launched',
 u'vehicle',
 u'first',
 u'research',
 u'mission',
 u'orbit',
 u'will',
 u'organisation',
 u'earth',
 u'mars',
 u'gslv',
 u'spacecraft',
 u'system',
 u'centre',
 u'orbiter',
 u'missions',
 u'crew',
 u'development',
 u'successfully',
 u'antrix',
 u'known']

In [12]:
aa = [bucket_jk[i][0] for i in range(len(bucket_jk))]
aa

[u'isro',
 u'',
 u'space',
 u'indian',
 u'satellite',
 u'india',
 u's',
 u'launch',
 u'satellites',
 u'launched',
 u'vehicle',
 u'first',
 u'research',
 u'mission',
 u'orbit',
 u'will',
 u'organisation',
 u'earth',
 u'mars',
 u'gslv',
 u'spacecraft',
 u'system',
 u'centre',
 u'orbiter',
 u'missions',
 u'crew',
 u'development',
 u'successfully',
 u'antrix',
 u'known']

In [107]:
spli = [np.array(bucket_jk[i].split(' ')) for i in range(len(bucket_jk))]
frq = createFrquencyTable(np.concatenate(spli))
sort = sorted(frq.items(), key=operator.itemgetter(1), reverse=True)
remove_stop_words(sort)[:20]

[[u'sports', 300],
 [u'sky', 162],
 [u'', 82],
 [u's', 81],
 [u'sport', 52],
 [u'channel', 43],
 [u'games', 40],
 [u'live', 36],
 [u'game', 36],
 [u'football', 35],
 [u'broadcast', 32],
 [u'league', 31],
 [u'rights', 30],
 [u'also', 30],
 [u'news', 28],
 [u'radio', 26],
 [u'launched', 26],
 [u'coverage', 26],
 [u'hd', 26],
 [u'cup', 23]]

In [58]:
xc = clean_word(bucket_jk[0].split('.')[0])
ss = createFrquencyTable(xc.split(' '))
sa = sorted(ss.items(), key=operator.itemgetter(1), reverse=True)
#remove_stop_words(sa)
xc

u'Sport UK or sports US are all usually forms of competitive physical activity or games which through casual or organised participation aim to use maintain or improve physical ability and skills while providing enjoyment to participants and in some cases entertainment for spectators'

In [30]:
string_query = "love"
url = wikipedia_api_link + string_query
url

'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch=love'

In [31]:
response = requests.get(url)
data = json.loads(response.content.decode("utf-8"))

In [32]:
title = data['query']['search'][0]['title']
titles = [data['query']['search'][i]['title'] for i in range(len(data['query']['search']))]
titles

[u'Love',
 u'Love (disambiguation)',
 u'Love Love Love',
 u'Unrequited love',
 u'Love Love Love (Agnes song)',
 u'Love triangle',
 u'Love for Love',
 u'Love song',
 u'Romance (love)',
 u'Puppy love']

In [33]:
page_url = wikipedia_link + title
page_url

u'https://en.wikipedia.org/wiki/Love'

In [34]:
page_word = getWordList(page_url)
len(page_word)

5958

In [35]:
page_word_count = createFrquencyTable(page_word)
len(page_word_count)

1930

In [36]:
sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)
remove_stop_words(sorted_word_frequency_list)

[[u'love', 172],
 [u'love ', 67],
 [u' ', 44],
 [u'god', 27],
 [u'one', 26],
 [u'also', 21],
 [u'can', 18],
 [u'often', 18],
 [u'romantic', 17],
 [u'used', 17],
 [u'god ', 16],
 [u'sexual', 15],
 [u'word', 14],
 [u'human', 14],
 [u' love ', 14],
 [u'people', 13],
 [u'term', 13],
 [u'different', 13],
 [u'three', 11],
 [u'considered', 11],
 [u'many', 10],
 [u'includes', 10],
 [u'christian', 10],
 [u'greek', 9],
 [u'ai', 9],
 [u'interpersonal', 9],
 [u'person', 9],
 [u' love', 9],
 [u'like', 9],
 [u'life', 8],
 [u'form', 8],
 [u'loving', 8],
 [u'you ', 8],
 [u'one s', 8],
 [u'two', 8],
 [u' i', 8],
 [u'another ', 8],
 [u'theories', 7],
 [u'concept', 7],
 [u'attraction', 7],
 [u'life ', 7],
 [u'refers', 7],
 [u'common', 7],
 [u'feeling', 7],
 [u'however ', 7],
 [u' to', 7],
 [u'sufism', 7],
 [u'material', 7],
 [u'others ', 6],
 [u'seen', 6],
 [u'desire', 6],
 [u'corresponding', 6],
 [u'may', 6],
 [u'forms', 6],
 [u'commitment ', 6],
 [u'passionate', 6],
 [u'consider', 6],
 [u'chinese', 6],

In [54]:
source_code = requests.get(page_url)
    #convert to text
plain_text = source_code.text
    #lxml format
soup = BeautifulSoup(plain_text,'lxml')

In [55]:
bucket = []
for text in soup.findAll('p'):
    bucket.append(text.text)

In [60]:
bucket

[u'Love is a variety of different feelings, states, and attitudes that ranges from interpersonal affection ("I love my mother") to pleasure ("I loved that meal"). It can refer to an emotion of a strong attraction and personal attachment.[1] It can also be a virtue representing human kindness, compassion, and affection\u2014"the unselfish loyal and benevolent concern for the good of another".[2] It may also describe compassionate and affectionate actions towards other humans, one\'s self or animals.[3]',
 u'Non-Western traditions have also distinguished variants or symbioses of these states; words like storge, philia, eros, and agape each describe a unique "concept" of love.[4] Love has additional religious or spiritual meaning\u2014notably in Abrahamic religions. This diversity of uses and meanings combined with the complexity of the feelings involved makes love unusually difficult to consistently define, compared to other emotional states.',
 u'Love in its various forms acts as a majo

In [110]:
wordlist = [bucket[0].lower().split() for i in range(len(bucket))]
wordlist[1]

[u'romance',
 u'is',
 u'the',
 u'expressive',
 u'and',
 u'pleasurable',
 u'feeling',
 u'from',
 u'an',
 u'emotional',
 u'attraction',
 u'towards',
 u'another',
 u'person',
 u'often',
 u'associated',
 u'with',
 u'sexual',
 u'attraction.',
 u'it',
 u'is',
 u'eros',
 u'rather',
 u'than',
 u'agape,',
 u'philia,',
 u'or',
 u'storge.']