In [3]:
from urllib import request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import wikipedia
# English stop words held as a set; the cleaning code in this notebook drops
# every lower-cased token that is a member of it.
stop_words = set(stopwords.words('english'))

In [15]:
def _appendNormalizedTokens(data, text):
    """Tokenize ``text`` and append its cleaned tokens to ``data`` in place.

    Each token is lower-cased; tokens that are stop words, or that occur as a
    substring of ``string.punctuation``, are dropped.  A trailing "ies" is
    rewritten to "y" (a crude singularisation, e.g. "cities" -> "city").
    """
    for token in word_tokenize(text):
        token = token.lower()
        if token in stop_words or token in punctuation:
            continue
        if token.endswith("ies"):
            token = token[:-3] + "y"
        data.append(token)


def _appendTopicPages(data, topic, maxPages):
    """Append the tokens of the first ``maxPages`` Wikipedia hits for ``topic``.

    Pages that cannot be fetched are skipped so that one bad search hit does
    not abort the whole corpus build.
    """
    # Search once per topic; the result list does not change between pages.
    try:
        pageTitles = wikipedia.search(topic)
    except Exception:
        return
    # Slicing tolerates searches that return fewer than ``maxPages`` hits.
    for pageTitle in pageTitles[:maxPages]:
        try:
            text = wikipedia.page(pageTitle).content
        except Exception:
            # Only the page lookup is best-effort.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt stop a long scrape.
            continue
        _appendNormalizedTokens(data, text)


def createCorpus(data, labels):
    """Extend ``data`` with cleaned tokens from Wikipedia pages about ``labels``.

    Parameters
    ----------
    data : list
        Token list that is appended to in place.
    labels : list, tuple or str
        Either a collection of search terms (the first 3 hits of each are
        used) or a single search term (the first 4 hits are used).  Any other
        type leaves ``data`` untouched.

    Returns
    -------
    None
    """
    if isinstance(labels, (list, tuple)):
        for label in labels:
            _appendTopicPages(data, label, 3)
    elif isinstance(labels, str):
        _appendTopicPages(data, labels, 4)

In [16]:
#https://medium.com/@ezzatdemnati/web-scraping-news-data-rss-feeds-python-and-google-cloud-platform-7a0df2bafe44

def _parentHierarchy(element):
    """Describe the ancestors of ``element`` as one 'outermost -> innermost' string.

    Every ancestor contributes ``<tag name>id: <id attribute>``; the id part is
    empty when the ancestor has no ``id`` attribute.
    """
    ancestorLabels = []
    for parent in element.parents:
        try:
            parentId = parent['id']
        except KeyError:
            parentId = ''
        ancestorLabels.append(parent.name + 'id: ' + parentId)
    # ``parents`` is walked innermost-first; flip it so the string reads top-down.
    ancestorLabels.reverse()
    return ' -> '.join(ancestorLabels)


def getArticleText(url, timeout=30):
    """Download ``url`` and return the text of its main block of paragraphs.

    All ``<p>`` elements of the first ``<body>`` are grouped by their ancestor
    chain.  The group whose paragraphs hold the most characters in total is
    taken to be the article; its paragraph texts are joined with newlines.
    When several groups tie, the one whose hierarchy string sorts first wins.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Returns
    -------
    str
        The merged article text, or ``''`` when the page has no ``<body>`` or
        no ``<p>`` elements.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    bodies = soup.find_all('body')
    if not bodies:
        return ''
    paragraphs = bodies[0].find_all('p')
    if not paragraphs:
        return ''

    hierarchies = []
    textTotals = {}
    for paragraph in paragraphs:
        hierarchy = _parentHierarchy(paragraph)
        hierarchies.append(hierarchy)
        textTotals[hierarchy] = textTotals.get(hierarchy, 0) + len(str(paragraph.text))

    # ``max`` keeps the first maximal item, so iterating the keys in sorted
    # order resolves ties in favour of the smallest hierarchy string.
    mainHierarchy = max(sorted(textTotals), key=textTotals.get)
    return '\n'.join(
        paragraph.text
        for paragraph, hierarchy in zip(paragraphs, hierarchies)
        if hierarchy == mainHierarchy
    )

In [24]:
def createDataFrame(url, label):
    """Build a DataFrame with one row per ``<item>`` of the RSS feed at ``url``.

    Parameters
    ----------
    url : str
        Address of the RSS (XML) feed.
    label : str
        Value stored in the ``label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``data``, ``link``, ``date`` and ``label``.
        ``data`` holds what ``getArticleText`` returns for the item's link, so
        every item costs one extra HTTP request.
    """
    # The context manager closes the connection even when read() fails.
    with urlopen(url) as response:
        xml_page = response.read()

    page = BeautifulSoup(xml_page, "xml")

    records = []
    for item in page.find_all("item"):
        link = item.link.text
        records.append({
            "title": item.title.text,
            "data": getArticleText(link),
            "link": link,
            "date": item.pubDate.text,
            "label": label,
        })

    # Passing ``columns`` keeps the schema stable for a feed without items.
    return pd.DataFrame(records, columns=["title", "data", "link", "date", "label"])

In [18]:
def createBigrams(bigramCorpus, corpusName):
    """Fill ``bigramCorpus`` with next-word relative frequencies from ``corpusName``.

    ``bigramCorpus`` is a two-level mapping ``first word -> next word -> value``
    that is updated in place: adjacent word pairs of ``corpusName`` are counted,
    then every inner mapping is divided by the sum of its own values.
    Nothing is returned.
    """
    # Tally each adjacent pair of tokens.
    for first, second in bigrams(corpusName):
        bigramCorpus[first][second] += 1

    # Rescale every row by its own total.
    for followers in bigramCorpus.values():
        rowTotal = float(sum(followers.values()))
        followers.update({word: tally / rowTotal for word, tally in followers.items()})

In [19]:
def createTrigrams(trigramCorpus, corpusName):
    """Fill ``trigramCorpus`` with third-word relative frequencies from ``corpusName``.

    ``trigramCorpus`` is a two-level mapping ``(first, second) -> third -> value``
    that is updated in place: consecutive word triples of ``corpusName`` are
    counted, then every inner mapping is divided by the sum of its own values.
    Nothing is returned.
    """
    # Tally each run of three consecutive tokens under its two-word prefix.
    for first, second, third in trigrams(corpusName):
        trigramCorpus[(first, second)][third] += 1

    # Rescale every row by its own total.
    for followers in trigramCorpus.values():
        rowTotal = float(sum(followers.values()))
        followers.update({word: tally / rowTotal for word, tally in followers.items()})

In [32]:
def _countSharedNgrams(articleNgrams, referenceNgrams):
    """Count the article's (context, next word) pairs that the reference table also holds."""
    shared = 0
    for context, nextWords in articleNgrams.items():
        knownNextWords = referenceNgrams.get(context)
        if knownNextWords is None:
            continue
        shared += sum(1 for word in nextWords if word in knownNextWords)
    return shared


def compare(df, index):
    """Predict the topic label of the article stored at ``df.loc[index]``.

    The row's ``title`` and ``data`` strings are lower-cased and split on single
    spaces.  Each topic then receives a score:

    * +2 if the label name itself is one of the title or data tokens,
    * +1 for every title/data token that occurs in the topic corpus,
    * +2 for every distinct bigram shared with the topic's bigram table,
    * +3 for every distinct trigram shared with the topic's trigram table.

    The label with the highest score is returned; ties go to the label that
    comes first in ``labelNames``.

    Reads the module-level corpora and n-gram tables (``newsCorpus``,
    ``bigramNews``, ``trigramNews`` and their siblings), which must exist
    before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``title`` and ``data`` columns; ``df.loc[index]`` must
        address a single row.
    index : hashable
        Row label to classify.

    Returns
    -------
    str
        One of the names in ``labelNames``.
    """
    corpusList = [newsCorpus, busCorpus, healthCorpus, edCorpus, sciCorpus, entCorpus]
    bigramList = [bigramNews, bigramBus, bigramHealth, bigramEd, bigramSci, bigramEnt]
    trigramList = [trigramNews, trigramBus, trigramHealth, trigramEd, trigramSci, trigramEnt]
    labelNames = ["news", "business", "health", "education", "science/environment", "entertainment/arts"]

    row = df.loc[index]
    tokenTitle = row.title.lower().split(" ")
    tokenData = row["data"].lower().split(" ")

    # Only the presence of an n-gram matters for scoring, so sets suffice.
    titleBigrams = defaultdict(set)
    dataBigrams = defaultdict(set)
    titleTrigrams = defaultdict(set)
    dataTrigrams = defaultdict(set)
    for word1, word2 in bigrams(tokenTitle):
        titleBigrams[word1].add(word2)
    for word1, word2 in bigrams(tokenData):
        dataBigrams[word1].add(word2)
    for word1, word2, word3 in trigrams(tokenTitle):
        # The context is the two preceding words, the same key shape that
        # createTrigrams and the data trigrams use.
        titleTrigrams[(word1, word2)].add(word3)
    for word1, word2, word3 in trigrams(tokenData):
        dataTrigrams[(word1, word2)].add(word3)

    similarity = []
    for label, corpus, bigramTable, trigramTable in zip(labelNames, corpusList, bigramList, trigramList):
        count = 0
        # NOTE(review): the composite labels ("science/environment",
        # "entertainment/arts") can never equal a space-separated token, so
        # this bonus only applies to the single-word labels - confirm intent.
        if label in tokenTitle or label in tokenData:
            count += 2

        # Individual words: every occurrence counts.  A set makes each
        # membership test constant-time instead of a scan of the corpus list.
        vocabulary = set(corpus)
        count += sum(1 for w in tokenTitle if w in vocabulary)
        count += sum(1 for w in tokenData if w in vocabulary)

        # Shared bigrams and trigrams.
        count += 2 * _countSharedNgrams(titleBigrams, bigramTable)
        count += 2 * _countSharedNgrams(dataBigrams, bigramTable)
        count += 3 * _countSharedNgrams(titleTrigrams, trigramTable)
        count += 3 * _countSharedNgrams(dataTrigrams, trigramTable)

        similarity.append(count)

    return labelNames[similarity.index(max(similarity))]

In [26]:
# (feed URL, label) pairs.  Several BBC sections share one label: world, UK and
# politics are all "news", and technology is filed under "science/environment".
FEEDS = [
    ("http://feeds.bbci.co.uk/news/world/rss.xml", "news"),
    ("http://feeds.bbci.co.uk/news/uk/rss.xml", "news"),
    ("http://feeds.bbci.co.uk/news/business/rss.xml", "business"),
    ("http://feeds.bbci.co.uk/news/politics/rss.xml", "news"),
    ("http://feeds.bbci.co.uk/news/health/rss.xml", "health"),
    ("http://feeds.bbci.co.uk/news/education/rss.xml", "education"),
    ("http://feeds.bbci.co.uk/news/science_and_environment/rss.xml", "science/environment"),
    ("http://feeds.bbci.co.uk/news/technology/rss.xml", "science/environment"),
    ("http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml", "entertainment/arts"),
]

# DataFrame.append was removed in pandas 2.0; build every per-feed frame first
# and concatenate them once.  Each frame keeps its own 0..n-1 row labels, so
# the combined index contains repeats until it is renumbered.
df = pd.concat([createDataFrame(feedUrl, feedLabel) for feedUrl, feedLabel in FEEDS])

In [27]:
def cleanText(text):
    """Return ``text`` lower-cased, filtered and crudely singularised.

    The string is split on single spaces.  A word is dropped when it is a stop
    word or occurs as a substring of ``string.punctuation`` (this also removes
    the empty strings produced by consecutive spaces).  A trailing "ies" is
    rewritten to "y".  The surviving words are joined with single spaces.
    """
    keptWords = []
    for word in text.split(" "):
        word = word.lower()
        if word in stop_words or word in punctuation:
            continue
        if word.endswith("ies"):
            word = word[:-3] + "y"
        keptWords.append(word)
    return " ".join(keptWords)


# Normalise both text columns with the same rules.  Assigning a list writes
# the values back by position.
for column in ("title", "data"):
    df[column] = [cleanText(text) for text in df[column]]

In [28]:
def _emptyNgramTable():
    """Return a two-level mapping whose missing entries read as 0."""
    return defaultdict(lambda: defaultdict(lambda: 0))


def buildTopicModels(*searchTerms):
    """Build the corpus and n-gram tables for one topic.

    Every search term is passed to ``createCorpus`` in the given order, all
    feeding the same token list; the bigram and trigram tables are then
    computed from that combined corpus.

    Returns
    -------
    tuple
        ``(corpus, bigramTable, trigramTable)``.
    """
    corpus = []
    for term in searchTerms:
        createCorpus(corpus, term)
    bigramTable = _emptyNgramTable()
    trigramTable = _emptyNgramTable()
    createBigrams(bigramTable, corpus)
    createTrigrams(trigramTable, corpus)
    return corpus, bigramTable, trigramTable


# compare() reads these module-level names directly, so they are kept as-is.
newsCorpus, bigramNews, trigramNews = buildTopicModels("news")
busCorpus, bigramBus, trigramBus = buildTopicModels("business")
healthCorpus, bigramHealth, trigramHealth = buildTopicModels("health")
edCorpus, bigramEd, trigramEd = buildTopicModels("education")
sciCorpus, bigramSci, trigramSci = buildTopicModels("science", "environment")
entCorpus, bigramEnt, trigramEnt = buildTopicModels("entertainment", "arts")



  lis = BeautifulSoup(html).find_all('li')


In [33]:
# Renumber the rows 0..n-1 so that df.loc[i] inside compare() addresses
# exactly one article.
df = df.reset_index(drop=True)

# One prediction per article, in row order.
predictedLabels = [compare(df, i) for i in range(len(df))]

table = pd.DataFrame(
    {
        "title": df["title"].tolist(),
        "actual": df["label"].tolist(),
        "predicted": predictedLabels,
    },
    columns=["title", "actual", "predicted"],
)

# Counter of correct predictions; the accuracy cell below reads it.
correct = 0

table

Unnamed: 0,title,actual,predicted
0,delhi factory fire: 40 dead india blaze,news,news
1,pensacola attack: gunman 'played mass-shooting...,news,news
2,north korea carry 'very important test',news,news
3,thousands join largest hk protest rally months,news,news
4,bob hawke 'asked daughter keep rape claim secret',news,news
...,...,...,...
253,original fyre festival?,entertainment/arts,entertainment/arts
254,turner prize: moment four nominees win prestig...,entertainment/arts,entertainment/arts
255,turner prize town builds bridges art,entertainment/arts,entertainment/arts
256,c3po actor: 'i left star wars publicity',entertainment/arts,science/environment


In [35]:
# Recompute the tally from the table itself rather than adding to a counter
# left over from another cell, so re-running this cell cannot inflate it.
correct = int((table["actual"] == table["predicted"]).sum())
totalRows = len(table.index)

# An empty table has no defined accuracy; report NaN instead of dividing by 0.
print("Accuracy: ", correct / totalRows if totalRows else float("nan"))

Accuracy:  0.46511627906976744


In [36]:
# Walk the results row by row: print "match" for a correct prediction,
# otherwise the tab-separated title, actual label and predicted label.
for rowTitle, rowActual, rowPredicted in zip(table["title"], table["actual"], table["predicted"]):
    if rowActual == rowPredicted:
        print("match")
        continue
    print(f"{rowTitle}\t{rowActual}\t{rowPredicted}")

match
match
match
match
match
artist eats $120,000 banana artwork	news	entertainment/arts
mike horn boerge ousland: north pole explorers complete epic trek	news	science/environment
uganda floods: least 16 people dead, red cross says	news	science/environment
match
russia doping: athletes wait fear fresh world ban	news	entertainment/arts
match
vienna opera house stages first opera woman	news	entertainment/arts
panipat: bollywood battle 18th century war	news	entertainment/arts
match
match
seychelles: island nation novel way tackle climate change	news	science/environment
match
match
indian ocean dipole: linked floods bushfires?	news	science/environment
hong kong pro-democracy rally: 'the streets full again'	news	entertainment/arts
match
cop25 climate change conference: giving environment?	news	education
burkina faso crisis: 'soldiers killed seven members family'	news	business
yinka ilori: nigerian narrative art design london	news	entertainment/arts
berea college: us university cracked stud