In [1]:
import lxml
import newspaper
import mitie
import pandas as pd

In [2]:
usatoday = newspaper.build("https://www.usatoday.com/")
washpost = newspaper.build("https://www.washingtonpost.com/")
atlantic = newspaper.build("https://www.theatlantic.com/")
newyorker = newspaper.build("https://www.newyorker.com/")

In [3]:
articles = []
for i in range(10):
    articles.append(usatoday.articles[i])
    articles.append(washpost.articles[i])
    articles.append(atlantic.articles[i])
    articles.append(newyorker.articles[i])

In [4]:
texts = []
for article in articles:
    article.download()
    article.parse()
    texts.append((article.title, article.text))

In [5]:
import sys, os

In [6]:
sys.path.append(os.getenv('HOME') + '/Packages/MITIE/mitielib')

In [8]:
tokens = [mitie.tokenize(text[1]) for text in texts]


In [9]:
ner = mitie.named_entity_extractor(os.getenv('HOME') + '/Packages/MITIE/MITIE-models/english/ner_model.dat')

In [10]:
entities_list = [ner.extract_entities(token) for token in tokens]

In [11]:
print(len(entities_list[0]))
print(entities_list[0])

1
[(range(25, 27), 'PERSON', 1.516026975594067)]


In [12]:
entity_texts = []
for index, entities in enumerate(entities_list):
    token = tokens[index]
    temp = []
    for e in entities:
        width = e[0]
        tag = e[1]
        score = e[2]
        score_text = "{:0.3f}".format(score)
        entity_text = " ".join(token[i].decode() for i in width)
        temp.append(entity_text)
        #print("   Score: " + score_text + ": " + tag + " " + entity_text)
        
    entity_texts.append(temp)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tf = [" ".join(t) for t in entity_texts]

In [15]:
vect = TfidfVectorizer()

In [16]:
tfidf = vect.fit_transform(tf)

In [17]:
terms = vect.get_feature_names()

In [18]:
pairwise_sim = (tfidf * tfidf.T).A

In [19]:
from sklearn.cluster import SpectralClustering

In [32]:
SpectralClustering?

In [33]:
spectral = SpectralClustering(n_clusters = 10, eigen_solver='lobpcg', affinity='precomputed')

In [34]:
spectral.fit(pairwise_sim)



SpectralClustering(affinity='precomputed', assign_labels='kmeans', coef0=1,
          degree=3, eigen_solver='lobpcg', eigen_tol=0.0, gamma=1.0,
          kernel_params=None, n_clusters=10, n_init=10, n_jobs=1,
          n_neighbors=10, random_state=None)

In [22]:
# print("Terms per cluster:")
# for i in __builtins__.range(8):
#     print("Cluster %d:" % i),
#     T=t[spectral.labels_==i].indices
#     for ind in T:
#         print(terms[ind])
#     print("")

In [35]:
spectral.labels_

array([6, 8, 6, 0, 7, 8, 2, 1, 6, 8, 2, 6, 9, 6, 6, 6, 6, 0, 6, 2, 6, 5,
       1, 6, 6, 7, 1, 6, 6, 3, 6, 7, 3, 6, 1, 1, 1, 4, 0, 2], dtype=int32)

In [36]:
mydict = {'title': [text[0] for text in texts], 'label':spectral.labels_}
df = pd.DataFrame(mydict)

In [39]:
df[df.label == 0].title.tolist()

['Why the United States Needs More Immigrants',
 '“I think I was being sent a message”: U.S. warned U.N. report on poverty in America could have consequences',
 'The 3 Reasons the U.S. Health-Care System Is the Worst']

In [40]:
df[df.label == 1].title.tolist()

['The Absurdity of Trump Officials Eating at Mexican Restaurants During an Immigration Crisis',
 'The Atlantic Politics & Policy Daily: Do U Care?',
 "How Do You Know When It's Officially a Trade War?",
 'How Some Immigrant Families Are Avoiding Separation',
 'Interrogating Melania Trump’s Statement Jacket and Its Fast-Fashion Fascism',
 "'Womp, womp': Speaker's bureau dumps Lewandowski after he mocks migrant with Down syndrome"]

In [41]:
df[df.label == 3].title.tolist()

['ABC orders spinoff of ‘Roseanne’ without Roseanne Barr',
 "'Roseanne' spinoff: ABC picks up 'The Conners,' minus Roseanne Barr"]

In [42]:
df[df.label == 6].title.tolist()

['Scientists warn a huge solar storm could send us back to the dark ages',
 'Photos of the Week: Smoggy Santiago, Miniature Taipei, Mermaid Parade',
 "Imagine Dragons' Dan Reynolds on HBO doc, and why he's no longer embarrassed to be Mormon",
 'A Physician in South Texas on an Unnerving Encounter with an Eight-Year-Old Boy in Immigration Detention',
 'Backup driver in fatal self-driving Uber crash was streaming Hulu',
 'Will Erdoğan Cheat His Way to Victory?',
 'Octavia Butler’s Prescient Vision of a Zealot Elected to “Make America Great Again”',
 "Princess Charlotte is 'obsessed' with fashion",
 'The Outrage Cycle, Italian Style',
 "Your sunscreen is probably expired—and it's time to upgrade",
 'Anthony Bourdain’s Moveable Feast',
 'The US used to ship 4,000 recyclable containers a day to China. Where will the banned trash go now?',
 'Patrick Melrose and the Fall of the English Élite',
 'Keystone virus makes first jump from mosquitoes to humans with confirmed case in Florida teen',
 "