## Implements a keyword or sentence taxonomy of topic classes.

In [1]:
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

model_path = 'C:/Users/sherr/Downloads/'

In [2]:
def load_wordvec_model(modelName, modelFile, flagBin):
    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)
#model_fasttext = load_wordvec_model('FastText', 'fastText_wiki_en.vec', False)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [3]:
topic_taxonomy = {
    "technology development":
    {
        "New Features":  "launch generation new beta land roll out",
        "Improvement":  "updates upgrade additional enhance",
    },
    "business company":
    {
        "Business Partnership":  "partner team collaboration agreement interaction",
        "Business Operation":  "production processing construction"
    },
    "health":
    {
        "Disease widespread": "pandemic transmission exposed infected",
        "Facility": "hospital clinic"
    },
}

## Classifies article titles against the developed taxonomy and groups by the topic classes

In [4]:
def vec_similarity(input1, input2, vectors):
    term_vectors = [np.zeros(300), np.zeros(300)]
    terms = [input1, input2]
        
    for index, term in enumerate(terms):
        for i, t in enumerate(term.split(' ')):
            try:
                term_vectors[index] += vectors[t]
            except:
                term_vectors[index] += 0
        
    result = (1 - spatial.distance.cosine(term_vectors[0], term_vectors[1]))
    if result is 'nan':
        result = 0
        
    return result

In [5]:
def vocab_check(vectors, words):
    
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
            
    return output

In [6]:
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    try:
        output = vectors.n_similarity(s1words, s2words)
    except:
        output = 0
    return output

In [7]:
def classify_topics(input, vectors):
    feed_score = dict()
    for key, value in topic_taxonomy.items():
        max_value_score = dict()
        for label, keywords in value.items():
            max_value_score[label] = 0
            topic = (key + ' ' + keywords).strip()
            max_value_score[label] += float(calc_similarity(input, topic, vectors))
            
        sorted_max_score = sorted(max_value_score.items(), key=operator.itemgetter(1), reverse=True)[0]
        feed_score[sorted_max_score[0]] = sorted_max_score[1]
    return sorted(feed_score.items(), key=operator.itemgetter(1), reverse=True)[:5]

In [8]:
import pandas as pd
apple = pd.read_json("/Users/sherr/Downloads/webhose_apple.json",lines = True)
titles = apple['title']

In [9]:
topic_score=[]
for i in titles:
    output= classify_topics(i,model_word2vec)
    topic_score.append(output)

In [11]:
topics=[]
for i in topic_score:
    topic=i[0][0]
    topics.append(topic)

In [12]:
scores=[]
for i in topic_score:
    score=i[0][1]
    scores.append(score)

In [14]:
apple['topic']=topics
apple['score']=scores
df_topic = apple[['title','topic','score']]
df_topic.sort_values(by=['topic','score'],ascending=False)

Unnamed: 0,title,topic,score
1470,Supply-chain backs iPhone 12 launch in October; early production for one model,New Features,0.665107
10348,Apple launches three innovative studies today in the new Research app - Apple,New Features,0.640826
1441,"iPhone 12 launch in october, says new supplychain report",New Features,0.640620
5917,OnePlus confirms its going to launch a more affordable smartphone soon,New Features,0.630278
7330,"India, 24 -- Apple is expected to launch its new AirPo",New Features,0.628129
307,"iPhone 12 launch in october, says new supply-chain report",New Features,0.627391
6099,OPPO A11x & OPPO A11 Android 10 (ColorOS 7) beta update early adopters recruitment begins,New Features,0.621966
8498,"Verizon rolls out 5G uploads, virtual lab for app development",New Features,0.621268
5363,OnePlus hints at launching new ‘affordable’ smartphone in India soon | Technology,New Features,0.608065
5331,"Switzerland is the first country to launch a large scale pilot for a COVID-19 contact tracing app, SwissCovid, using Apple's and Google's APIs (Christine Fisher/Engadget)",New Features,0.605213


In [16]:
df_unique=df_topic['topic'].unique()

## Top 10 matching titles for topics

In [13]:
pd.set_option('display.max_colwidth',-1)

In [17]:
for i in df_unique:
    print(i)
    top_10_titles = df_topic[df_topic['topic']==i].iloc[0:10,0]
    print(top_10_titles.to_string())
    print('----------------------------------------')

New Features
0     New iPad Air may come with USB-C not Lightning Port                                                    
1     iOS 14 Will Reportedly Support All iPhone Models Running iOS 13                                        
2     iPhone Looters Being Tracked – Apple Warns Phone Thiefs                                                
4     French govt's StopCovid tracing app debuts on Google Play store                                        
6     iOS 13.5.1 vs iOS 13.5.5 beta 1 speed test (Video)                                                     
8     Apple TV Users Can Now Enjoy YouTube Kids                                                              
9     Tech giants condemn racial discrimination, George Floyd death                                          
10    Looters find that iPhones stolen from Apple Stores are bricked - news                                  
12    Why are Apple Pay, Starbucks' app, and Samsung Pay so much more successful than other wallet provider