In [5]:
# Import libraries
from bs4 import BeautifulSoup
import requests as req
from urllib3.util import parse_url

def getHost(url):
    return parse_url(url).host

def getScheme(url):
    return parse_url(url).scheme

def initializeSoup(url):
    request = req.get(url)
    response = request.text
    return BeautifulSoup(response, 'lxml')
    
def getLinks(url):
    square = initializeSoup(url).find_all("div",class_ = "square")
    return [getScheme(url)+"://"+getHost(url)+link.a.get('href')  for link in square]

def preprocessData(link):
    soup = initializeSoup(link)
    finalContent = soup.find(id = 'divTitle').get_text()
    contentList = soup.find_all(class_= 'storyParagraph')
    for content in contentList:
        finalContent += content.get_text()
    return finalContent.replace('\xa0','').replace('\n','').replace('\r','').replace('\t','')

def downloadSummary(url):
    getLinks(url)
    summaryList =[preprocessData(link) for link in getLinks(url)]
    return summaryList
    
# http://www.newser.com/section/7/technology-news-headlines.html
# http://www.newser.com/section/6/science-news-headlines.html

url = input("Enter URL")
articleData = downloadSummary(url)
print(articleData)

Enter URLhttp://www.newser.com/section/7/technology-news-headlines.html
['                            Insulted Congressman Sues Twitter for $250M                                                                    (Newser)                                        –                                        Republican Rep. Devin Nunes is tired of being insulted by Twitter accounts like Devin Nunes\' Cow, which calls him a "lil\' treasonous cowpoke," and the now-suspended Devin Nunes\' Mom. In a $250 million lawsuit against the website and some of its users, including the "Cow" and "Mom" accounts, the California lawmaker accuses Twitter of ignoring complaints about defamatory material, the Verge reports. He accuses the website of censoring conservative viewpoints and "shadow-banning conservatives" by causing their names to not appear as auto-suggestions. Engadget reports that Nunes blames Twitter for making his 2018 re-election closer than it should have been.                                  

In [80]:
# Clustering

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 2, stop_words = 'english')
x = vectorizer.fit_transform(articleData)
x

<12x167 sparse matrix of type '<class 'numpy.float64'>'
	with 415 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)

In [16]:
km.fit(x)

Initialization complete
Iteration  0, inertia 13.452
Iteration  1, inertia 7.228
Converged at iteration 1: center shift 0.000000e+00 within tolerance 4.875873e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [19]:
import numpy as np
np.unique(km.labels_, return_counts = True)

(array([0, 1, 2]), array([3, 3, 6], dtype=int64))

In [83]:
text = {}
for i, cluster in enumerate(km.labels_):
    oneDocument = articleData[i]
    if cluster not in text.keys():
        text[cluster] = oneDocument
    else:
        text[cluster] += oneDocument
        
print(text)

{2: '                            Insulted Congressman Sues Twitter for $250M                                                                    (Newser)                                        –                                        Republican Rep. Devin Nunes is tired of being insulted by Twitter accounts like Devin Nunes\' Cow, which calls him a "lil\' treasonous cowpoke," and the now-suspended Devin Nunes\' Mom. In a $250 million lawsuit against the website and some of its users, including the "Cow" and "Mom" accounts, the California lawmaker accuses Twitter of ignoring complaints about defamatory material, the Verge reports. He accuses the website of censoring conservative viewpoints and "shadow-banning conservatives" by causing their names to not appear as auto-suggestions. Engadget reports that Nunes blames Twitter for making his 2018 re-election closer than it should have been.                                                                            The complaint filed in a Vi

In [25]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest

In [60]:

# Remove stop word
from nltk.corpus import stopwords 
from string import punctuation

stopwords = set(stopwords.words('english')+list(punctuation)+[ 'say', 'coming', 'could','million',"''",'``',"'d","n't","'s",'”','...','“', '’','–',"'ve",'millions','billion','billions','year','years','month','ago','including','cook','suv','months','day','days'])

In [62]:

# Tokenizing / Freq Dist 

keywords = {}
counts = {}
for cluster in range(3):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent = [word for word in word_sent if word not in stopwords]
    freq =FreqDist(word_sent)
    counts[cluster] = freq
    keywords[cluster] = nlargest(20, freq, key =freq.get)


In [65]:
unique_keys = {}
for cluster in range(3):
    other_clusters = list(set(range(3))-set([cluster]))
    keys_other_clusters = set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))                                                                 
    unique = set(keywords[cluster])- keys_other_clusters  
    unique_keys[cluster] = nlargest(10, unique, key = counts[cluster].get)

In [98]:
# Classification

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3)

classifier.fit(x, km.labels_)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [111]:
# Clustering

data = ['                            Trump: Here\'s Why I Said \'Tim Apple\'                                                                    (Newser)                                        –                                        President Trump is doubling down and insisting that he did not, in fact, screw up the name of Apple CEO Tim Cook at a White House function. You can start by watching the original remarks here. On Twitter Monday, Trump provided an explanation of why he said "Tim Apple" during his remarks. "At a recent round table meeting of business executives, & long after formally introducing Tim Cook of Apple, I quickly referred to Tim + Apple as Tim/Apple as an easy way to save time & words," he wrote. "The Fake News was disparagingly all over this, & it became yet another bad Trump story!"                                                                            The tweet comes after a report in Axios that Trump offered a somewhat different explanation to Republican donors at his Mar-a-Lago club in Florida last week. The donors say Trump claimed he said "Tim Cook Apple" quickly and spoke the "Cook" part softly. Jonathan Swan at Axios says one donor told him: "I just thought, why would you lie about that? It doesn\'t even matter!" (Cook made light of the incident, changing his name on Twitter to Tim Apple.)']
y = vectorizer.transform(data)
y

<1x167 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [112]:
classifier.predict(y)

array([2])

In [115]:
# Classification

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()

classifier.fit(x, km.labels_)

nextdata = [ '                            From Amazon Warehouses, 189 Disturbing Calls                                                                    (Newser)                                        –                                        Stories about working conditions in Amazon warehouses have been circulating for years, and the Daily Beast is out with an investigation that adds a disturbing tangible element. Between October 2013 and October 2018, it found that 189 emergency calls had been placed from Amazon warehouses across the US because of "suicide attempts, suicidal thoughts, and other mental-health episodes." And the examination of 911 logs and such covered only 46 warehouses in 17 states, or only about 25% of the total number of warehouses. A quote from one former employee sums things up: “It’s this isolating colony of hell where people having breakdowns is a regular occurrence,” says Jace Crouch, who worked at a facility in Lakeland, Florida. It’s “mentally taxing to do the same task super fast for 10-hour shifts, four or five days a week.”                                                                            The investigation does not compare the numbers at Amazon to those at other companies or make the case that working conditions in the Amazon facilities are any worse than elsewhere. Also, some of the affected employees acknowledged having mental health issues before taking their jobs at Amazon, though they say the work exacerbated their issues. Still, the multitude of calls offers "a visceral, real-time glimpse of employees on the edge," per the story. Amazon responds that the focus on the total number of calls is an “overgeneralization” that “doesn’t take into account the total of our associate population, hours worked, or our growing network.” Click for the full story, which includes 911 transcripts of some of the actual calls.  (Read more Longform stories.)']
new_y = vectorizer.transform(nextdata)
classifier.predict(new_y)

array([2])