In [1]:
import numpy as np
import pandas as pd
import glob
import json

from collections import Counter
from tqdm import tqdm

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from gensim.models import KeyedVectors
#from gensim.models import fasttext

In [2]:
import nltk

# Fetch every NLTK resource this notebook relies on, so it also runs on a
# machine with an empty nltk_data directory:
#   - 'stopwords' backs stopwords.words('english')
#   - 'wordnet'   backs WordNetLemmatizer
# 'wordnet' stays last so the cell's displayed return value is unchanged.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sidthakur08/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Which numbered sub-folder of ./articles_data to read, and the maximum
# number of article files to keep from it.
file = 1
MAX_ARTICLES = 1000

# glob returns matches in arbitrary, OS-dependent order; sorting first makes
# the selected subset (and therefore every downstream output) reproducible.
paths = sorted(glob.glob(f'./articles_data/{file}/*.json'))[:MAX_ARTICLES]

In [4]:
# Parse every selected JSON file into a Python object, in the order of `paths`.
article_data = []
for path in paths:
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding, which differs between operating systems.
    with open(path, encoding='utf-8') as f:
        article_data.append(json.load(f))

In [5]:
# Show the top-level keys of the first loaded article record.
article_data[0].keys()

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])

In [6]:
# Number of article records that were loaded.
len(article_data)

1000

In [7]:
# Collect the section title of every loaded article. Iterating over
# article_data itself (instead of a fixed range(1000)) avoids an IndexError
# when fewer than 1000 files were loaded.
sections = [article['thread']['section_title'] for article in article_data]
# Counter(sections) would give the number of articles per section title.

In [8]:
# Display the first article record in full to inspect its nested structure.
article_data[0]

{'organizations': [],
 'uuid': 'ea5c49d229ded1a94679a075702e79d3a0a8848e',
 'thread': {'social': {'gplus': {'shares': 0},
   'pinterest': {'shares': 0},
   'vk': {'shares': 0},
   'linkedin': {'shares': 0},
   'facebook': {'likes': 0, 'shares': 0, 'comments': 0},
   'stumbledupon': {'shares': 0}},
  'site_full': 'www.businesswire.com',
  'main_image': '',
  'site_section': 'http://feed.businesswire.com/rss/home/?rss=G1QFDERJXkJeGVtQXg==',
  'section_title': 'Business Wire Technology: Mobile/Wireless News',
  'url': 'http://www.businesswire.com/news/home/20150921006305/en/MobileSmith-4.0-Leap-Enterprise-App-Development-REST',
  'country': 'US',
  'title': 'MobileSmith 4.0 a Leap Forward in Enterprise App Development with REST Services, OAuth',
  'performance_score': 0,
  'site': 'businesswire.com',
  'participants_count': 0,
  'title_full': 'MobileSmith 4.0 a Leap Forward in Enterprise App Development with REST Services, OAuth',
  'spam_score': 0.0,
  'site_type': 'news',
  'published':

In [9]:
stopwords_eng = stopwords.words('english')
# Set views used inside process_text for O(1) membership tests; the original
# list `stopwords_eng` is kept unchanged for any other code that reads it.
_stopwords_set = frozenset(stopwords_eng)
_punctuation_chars = frozenset(string.punctuation)
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Split ``text`` into whitespace-delimited tokens, filter and lemmatize them.

    Steps, in order:
      1. apostrophes and backticks are deleted (so "Turnbull's" becomes
         "Turnbulls");
      2. the text is split on runs of whitespace (newlines included);
      3. tokens that exactly match an entry of ``stopwords_eng``
         (case-sensitive) or that consist solely of punctuation characters
         are dropped;
      4. every remaining token is passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # Delete (not replace) the quote characters. No separate "\n"/"\r"
    # replacement is needed: the whitespace split below already covers them.
    quote_table = str.maketrans("", "", "'`")
    text = text.translate(quote_table)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    tokens = regexp_tokenize(text, pattern=r'\s+', gaps=True)

    cleaned_tokens = []
    for token in tokens:
        if token in _stopwords_set:
            continue
        # `token not in string.punctuation` was a substring test, so tokens
        # such as "--" or "..." were kept. Check every character instead.
        if all(ch in _punctuation_chars for ch in token):
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))

    return cleaned_tokens

In [10]:
# Micro-benchmark process_text on a single headline-style string.
%timeit process_text("Technology News | afr.com Business backs Malcolm Turnbull's 'fresh' look at reform | afr.com")

143 µs ± 24.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Build one record per article: the combined "section title + full title"
# string, the thread URL, and the processed tokens of that combined string.
data = []
for article in tqdm(article_data):
    thread = article['thread']
    # Build the combined title once and reuse it for both fields.
    full_title = thread['section_title'] + ' ' + thread['title_full']
    data.append({
        'full_title': full_title,
        'url': thread['url'],
        'title_tokens': process_text(full_title),
    })

100%|██████████| 1000/1000 [00:00<00:00, 5423.11it/s]


In [12]:
# Load pretrained word vectors from the binary word2vec-format file;
# `limit` caps how many vectors are read from it.
model = KeyedVectors.load_word2vec_format(
    "./GoogleNews-vectors-negative300.bin",
    binary=True,
    limit=10**5,
)

In [13]:
def get_vec(word, kv=None):
    """Return the embedding of ``word``, or a zero vector if it is unknown.

    Parameters
    ----------
    word : str
        Token to look up.
    kv : optional
        Vector store to query. It must support ``kv[word]`` (raising
        ``KeyError`` for unknown words) and expose ``vector_size``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        ``kv[word]`` when the word is present, otherwise
        ``np.zeros(kv.vector_size)``.
    """
    if kv is None:
        kv = model
    try:
        return kv[word]
    except KeyError:
        # Catch only the out-of-vocabulary case. A bare `except:` would also
        # hide real problems (e.g. `model` not loaded) behind zero vectors.
        return np.zeros(kv.vector_size)

In [14]:
# Represent each title by the element-wise sum of its token vectors, keyed by
# the title string (records with an identical full_title overwrite each other).
sent_vector = dict()
for row in tqdm(data):
    token_vecs = [get_vec(tok) for tok in row['title_tokens']]
    # sum([]) is the int 0, not a vector: give titles with no surviving
    # tokens an explicit zero vector so every value has the same shape.
    if token_vecs:
        sent_vector[row['full_title']] = sum(token_vecs)
    else:
        sent_vector[row['full_title']] = np.zeros(model.vector_size)

100%|██████████| 1000/1000 [00:00<00:00, 12739.16it/s]


In [15]:
# Inspect the summed vector stored for the first record's title.
sent_vector[data[0]['full_title']]

array([-0.12487793, -0.79882812, -0.34216309, -0.84890747,  0.34338379,
       -0.20715332,  0.68847656,  1.23150635,  0.33236694, -0.35009766,
       -0.18615723,  0.32080078,  0.39294434,  0.05737305, -0.52734375,
       -0.52929688, -0.18725586, -0.16235352, -0.5357666 , -0.2387085 ,
       -0.25506592,  0.92993164,  0.25476074,  0.3475647 ,  0.68164062,
        1.28857422, -0.10058594,  1.26928711,  1.08102417, -0.39434814,
       -0.41674805, -0.35876465,  0.51049805, -1.35986328,  1.24523926,
       -0.90014648,  0.06381226, -0.64672852, -1.69421387,  1.26416016,
       -0.22802734, -0.59667969, -0.68383789,  0.59912109, -0.02148438,
       -1.42077637, -0.66162109, -1.03808594,  0.7175293 ,  1.26855469,
       -1.38739014,  0.41412354,  0.08166504,  1.1343689 ,  0.11315918,
        0.85717773, -1.20666504,  0.04534912, -0.48803711, -0.54858398,
       -0.0390625 ,  0.04650879, -1.5859375 ,  0.10693359, -0.99963379,
        0.25912476, -0.29711914,  0.75634766,  0.29107666,  0.66