In [10]:
# For use with postgresql data import
# !pip3 install psycopg2
import psycopg2

# For named entity recognition
# !python3 -m spacy download en_core_web_sm
import spacy
# Load spaCy's "en_core_web_sm" English pipeline once and bind it to `nlp`,
# so the same loaded model can be called on each text that needs entity recognition.
nlp = spacy.load("en_core_web_sm")

from bs4 import BeautifulSoup

import pandas as pd
from collections import Counter

# !pip install inflection
import inflection

In [11]:
# Connect to the DB and fetch a small random sample of tagged stories.
#
# Each returned row is a 3-tuple (slug, corpora, tags):
#   * slug    - stories.slug
#   * corpora - title, intro and body joined with '. ' (CONCAT_WS)
#   * tags    - the story's tag names joined with ', ' (STRING_AGG)
#
# NOTE: `ORDER BY random()` means every run of this cell returns a different
# sample, so the outputs of the cells below are not reproducible run-to-run.

STORY_QUERY = (
    "SELECT stories.slug, "
    "CONCAT_WS('. ', title, intro, body) AS corpora, "
    "STRING_AGG(tags.name, ', ') AS tags "
    "FROM stories "
    "INNER JOIN taggings ON stories.id = taggings.story_id "
    "INNER JOIN tags ON taggings.tag_id = tags.id "
    "GROUP BY stories.slug, title, intro, body, publish_timestamp "
    "ORDER BY random() LIMIT 5;"
)

conn = psycopg2.connect("dbname=techday_development user=seanm")
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        cur.execute(STORY_QUERY)
        corpus = cur.fetchall()
finally:
    # Always release the connection, even if the query or fetch raises;
    # previously a failure here left the connection open in the kernel.
    conn.close()

In [12]:
# Inspect the first row of the corpus to sanity-check the query result.
# Rows come from the SELECT above as (slug, corpora, tags).

corpus[0]

('moustache-republic-expands-aussie-presence-with-new-exec',
 'Moustache Republic expands Aussie presence with new exec. The Kiwi digital commerce partner has appointed a Sydney-based director to oversee the expansion of the company’s Australian footprint.. <p>Moustache Republic, a digital commerce partner, has expanded its executive team in Australia with the appointment of Laura Doonin as Partner and Director.</p>\r\n\r\n<p>Based in Sydney, Doonin joins Moustache Republic with 15 years&rsquo; experience working with retail businesses during which time she has developed and implemented ecommerce and omnichannel solutions to meet the fast evolving expectations of B2B and B2C organisations.</p>\r\n\r\n<p>In her role at Moustache Republic, Doonin will be responsible for expanding the company&rsquo;s Australian market footprint and supporting retailers in delivering ecommerce solutions based on great user design and leading edge SaaS technologies.</p>\r\n\r\n<p>Doonin was previously digit

In [13]:
# Entity labels that are kept as candidate tags.
#
# The filter used to be written as
#     label == "ORG" or label == "CARDINAL" and label == "PERSON"
# Because `and` binds tighter than `or`, that parsed as
#     label == "ORG" or (label == "CARDINAL" and label == "PERSON")
# and a single label can never equal both "CARDINAL" and "PERSON", so the
# second clause was dead code and only ORG entities were ever kept.
# The set below makes that behaviour explicit; add labels (e.g. "PERSON")
# here to widen the filter.
ENTITY_LABELS = {"ORG"}

# Set to True to also print every entity the model found, before filtering.
SHOW_RAW_ENTITIES = False

# Characters stripped from the text before it is passed to the model:
# single quotes, double quotes and round brackets.
_STRIP_CHARS = str.maketrans("", "", "'\"()")

for corpora in corpus:
    # Each row is indexed as: [0] slug, [1] story text (still HTML), [2] human tags.

    # Print the website address
    print("https://techday.asia/story/" + corpora[0])

    # Print the word count (whitespace-separated tokens of the raw text)
    print("Corpora word count = ", len(corpora[1].split()), '\n')

    # Remove HTML
    corpora_sans_html = BeautifulSoup(corpora[1], "lxml").text

    # Remove quotes and brackets
    final_corpora = corpora_sans_html.translate(_STRIP_CHARS)

    # Run the NLP pipeline over the cleaned text
    doc = nlp(final_corpora)

    # Every entity the model found, unfiltered (only shown when debugging)
    raw = [ent.text for ent in doc.ents]

    # Collect normalised names for the entities we care about
    names = []
    for ent in doc.ents:
        # Filter out unneeded named entities
        if ent.label_ not in ENTITY_LABELS:
            continue

        # Singularize the word
        word = inflection.singularize(ent.text)

        # Guard against an empty result so the indexing below cannot raise
        # IndexError.
        if not word:
            continue

        # Capitalize the first letter without impacting the rest
        names.append(word[0].capitalize() + word[1:])

    # Identify the most common named entities and their counts
    final_ml = Counter(names).most_common()

    # Print the comparison between model-derived and human-assigned tags
    if SHOW_RAW_ENTITIES:
        print("Raw ML tags =", raw, '\n')
    print("ML tags =", final_ml, '\n')
    print("Human tags =", [corpora[2]], '\n\n')

https://techday.asia/story/moustache-republic-expands-aussie-presence-with-new-exec
Corpora word count =  372 

ML tags = [('Moustache Republic', 4), ('EBay', 4), ('Pitney Bowe', 2), ('Aussie', 1), ('Kiwi', 1), ('Pharmacy 4 Less', 1), ('Fashion Vertical', 1), ('Retail Global Event', 1), ('Bachelor of Science Honour', 1), ('Heriot-Watt University', 1), ('Digital commerce', 1), ('Omni-channel', 1)] 

Human tags = ['Job appointments, e-commerce, Australian expansion, Moustache Republic'] 


https://techday.asia/story/aussie-data-centre-centrify-demand-local-idaas-grows
Corpora word count =  393 

ML tags = [('Centrify', 3), ('Aussie', 1), ('Centrify ANZ', 1), ('The Centrify Identity Service CI', 1), ('Centrify Privilege Service', 1), ('Niall King', 1), ('Gartner', 1), ('Amazon Cloud Service', 1), ('King', 1)] 

Human tags = ['IDaaS, Centrify'] 


https://techday.asia/story/familiar-face-manage-microsofts-aussie-surface-channel
Corpora word count =  298 

ML tags = [('Microsoft', 8), ('Vie