In [47]:
import os
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scattertext as st
import spacy
import texthero as hero
from newspaper import Article
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline


In [2]:
# Maps a section heading as it appears in a digest file (key) to the canonical
# category name used in the analysis (value). Several headings collapse onto
# the same canonical category.
# parse_file() tests every key as a substring of each line and does not stop at
# the first hit, so when more than one key occurs in a line the key listed
# LAST here decides the category -- keep the ordering in mind when editing.
CATEGORY_MAPPINGS = {
    "The Hype": "Concerns & Hype",
    "The Panic": "Concerns & Hype",
    "The good coverage": "Advances & Business",
    "Expert Opinions & Discussion within the field": "Expert Opinions & Discussion within the field",
    "Explainers": "Explainers",
    "AI Advances": "Advances & Business",
    "AI Worries": "Concerns & Hype",
    "Advances & Business": "Advances & Business",
    "Concerns & Hype": "Concerns & Hype",
    "Analysis & Policy": "Analysis & Policy",
    "Mini Briefs": "Mini Briefs"
}

In [3]:
def _split_link_line(line):
    """Extract ``(title, url, excerpt)`` from a line containing a link.

    The caller guarantees that ``line`` contains both '[' and '('.
    """
    title = line.split('[')[1].split(']')[0]
    link_start = line.find('](')
    if link_start != -1:
        # Anchor on the "](" that joins link text to link target, so that a
        # parenthesis inside the title is not mistaken for the URL, and only
        # look for the excerpt in what follows the link.
        url, _, trailing = line[link_start + 2:].partition(')')
    else:
        # Not a well-formed markdown link: keep the permissive behaviour of
        # taking whatever sits inside the first pair of parentheses.
        url = line.split('(')[1].split(')')[0]
        trailing = line
    if ' - ' in trailing:
        # Split once only, so an excerpt that itself contains " - " is kept whole.
        excerpt = trailing.split(' - ', 1)[1].strip()
    else:
        excerpt = ''
    return title, url, excerpt


def _fetch_article_fields(url):
    """Download ``url`` with newspaper and return its extracted fields.

    Returns ``(authors, date, text, keywords, summary)``. This is best-effort:
    whatever was extracted before a failure is kept, and the remaining fields
    stay at their defaults (``None`` / ``[]`` / ``''``).
    """
    authors = date = text = None
    keywords = []
    summary = ''
    try:
        article = Article(url)
        article.download()
        article.parse()
        authors = article.authors
        date = article.publish_date
        text = article.text
        # nlp() can fail independently of download/parse; the assignments
        # above are deliberately done first so the article text survives.
        article.nlp()
        keywords = article.keywords
        summary = article.summary
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C still interrupts a long run.
        print('Could not fully fetch %s: %s' % (url, exc))
    return authors, date, text, keywords, summary


def parse_file(file_name):
    """Parse one digest file into a list of article records.

    Lines are scanned in order. A line containing any key of
    ``CATEGORY_MAPPINGS`` switches the current category (the last matching key
    wins). Once a category is active, every line containing both '[' and '('
    is treated as a link; links whose title has fewer than four
    space-separated words are skipped. Each remaining link is downloaded.

    Args:
        file_name: path of the digest file to read (decoded as UTF-8).

    Returns:
        A list of rows
        ``[category, title, date, url, excerpt, authors, keywords, summary, text]``.
    """
    articles = []
    current_category = None
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            for label, canonical in CATEGORY_MAPPINGS.items():
                if label in line:
                    current_category = canonical
            if not (current_category and '[' in line and '(' in line):
                continue
            title, url, excerpt = _split_link_line(line)
            if len(title.split(' ')) < 4:
                continue
            authors, date, text, keywords, summary = _fetch_article_fields(url)
            articles.append([str(current_category),
                             title,
                             date,
                             url,
                             excerpt,
                             authors,
                             keywords,
                             summary,
                             text])
    return articles

In [None]:
# Collect the articles of every digest file in the working directory and
# count how many fall into each category.
all_articles = []
category_counts = {}
# Sorted so the order of all_articles does not depend on the file system.
for file_name in sorted(os.listdir('.')):
    if 'py' in file_name or 'this' in file_name:
        continue
    # os.listdir also returns directories (e.g. the "scatterplots" output
    # folder written to further down), which are not digests.
    if not os.path.isfile(file_name):
        continue
    name_parts = file_name.split('.')[0].split('-')
    try:
        # Digest files are named <year>-<month>-<day>-<edition>...
        year, month, day, edition = [int(part) for part in name_parts[:4]]
    except ValueError:
        # Raised both for non-numeric parts and for fewer than four parts.
        print('Skipping %r: name is not <year>-<month>-<day>-<edition>' % file_name)
        continue
    articles = parse_file(file_name)
    all_articles += articles
    for article in articles:
        category_counts[article[0]] = category_counts.get(article[0], 0) + 1

In [26]:
# Total number of parsed articles, then the per-category tally, one per line.
print(len(all_articles), category_counts, sep='\n')

1726
{'Expert Opinions & Discussion within the field': 170, 'Analysis & Policy': 181, 'Mini Briefs': 169, 'Advances & Business': 622, 'Concerns & Hype': 452, 'Explainers': 132}


In [11]:
# One row per article; column order mirrors the row layout built by parse_file.
df = pd.DataFrame(
    all_articles,
    columns=[
        'category',
        'title',
        'date',
        'url',
        'excerpt',
        'authors',
        'keywords',
        'summary',
        'text',
    ],
)

In [37]:
# Distinct category labels, in order of first appearance in df.
categories = df.loc[:, 'category'].unique()

In [66]:
df['pca'] = (
   df['text']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
   .pipe(hero.pca)
)
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [67]:
df['pca'] = (
   df['keywords']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
   .pipe(hero.pca)
)
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [68]:
df['pca'] = (
   df['excerpt']
   .pipe(hero.clean)
   .pipe(hero.tfidf)
   .pipe(hero.pca)
)
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [69]:
# 'en' is a shortcut name that spaCy 3 no longer resolves; keep it for
# environments where it still works and otherwise fall back to the packaged
# small English model.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Build a scattertext corpus from the article texts, one category per row,
# then compact it with AssociationCompactor(2500).
corpus = st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=nlp).build().compact(st.AssociationCompactor(2500))

In [70]:
# First 25 index entries of the corpus' scaled F-scores against the background.
print('Top common words:')
pprint(corpus.get_scaled_f_scores_vs_background().index[:25].tolist())

Top common words:
['ai',
 'openai',
 'facebook',
 'clearview',
 'deepmind',
 'twitter',
 'deepfakes',
 'waymo',
 'deepfake',
 'neural',
 'robots',
 'algorithmic',
 'imagenet',
 'youtube',
 'artificial',
 'coronavirus',
 'algorithms',
 'autonomous',
 'robotics',
 'datasets',
 'buzzfeed',
 'dataset',
 'adversarial',
 'facial',
 'lecun']


In [71]:
def make_cat_explorer(cat):
    """Write a scattertext explorer page contrasting ``cat`` with all other categories.

    Uses the module-level ``corpus``. The page is saved as
    ``scatterplots/<cat> Viz.html`` (UTF-8); the directory is created if needed.

    Args:
        cat: category name as it appears in the corpus.
    """
    html = st.produce_scattertext_explorer(corpus,
              category=cat,
              category_name=cat,
              not_category_name='Other',
              width_in_pixels=1000,)
    os.makedirs('scatterplots', exist_ok=True)
    # Context manager so the handle is flushed and closed deterministically.
    with open("scatterplots/%s Viz.html" % cat, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [None]:
# One explorer page per distinct category.
for category_name in categories:
    make_cat_explorer(category_name)

In [None]:
def make_word_cloud(cat=None):
    """Display a word cloud built from article titles.

    Reads the module-level ``all_articles`` rows, where index 0 is the
    category and index 1 is the title.

    Args:
        cat: category whose titles are used; ``None`` (the default) uses the
            titles of every article.

    Raises:
        ValueError: if no title words are available for the selection.
    """
    stopwords = set(STOPWORDS)

    # Lower-cased, whitespace-normalised title of every selected article.
    # `cat is None` selects everything rather than nothing.
    lowered_titles = [
        " ".join(token.lower() for token in article[1].split())
        for article in all_articles
        if cat is None or article[0] == cat
    ]
    comment_words = " ".join(lowered_titles)
    if not comment_words.strip():
        raise ValueError('No article titles found for category %r' % (cat,))

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = stopwords,
                    min_font_size = 10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()