In [1]:
import numpy as np
import pandas as pd
import glob
import json

from collections import Counter
from tqdm import tqdm

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from gensim.models import KeyedVectors
#from gensim.models import fasttext

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sidthakur08/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
file = 1
paths = glob.glob(f'./articles_data/{file}/*.json')
paths = paths[:1000]

In [4]:
article_data = []
for path in paths:
    with open(path) as f:
        article_data.append(json.load(f))

In [5]:
article_data[0].keys()

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])

In [6]:
len(article_data)

1000

In [7]:
sections = []
for i in range(1000):
    sections.append(article_data[i]['thread']['section_title'])
#Counter(sections)

In [8]:
article_data[0]

{'organizations': [],
 'uuid': 'ea5c49d229ded1a94679a075702e79d3a0a8848e',
 'thread': {'social': {'gplus': {'shares': 0},
   'pinterest': {'shares': 0},
   'vk': {'shares': 0},
   'linkedin': {'shares': 0},
   'facebook': {'likes': 0, 'shares': 0, 'comments': 0},
   'stumbledupon': {'shares': 0}},
  'site_full': 'www.businesswire.com',
  'main_image': '',
  'site_section': 'http://feed.businesswire.com/rss/home/?rss=G1QFDERJXkJeGVtQXg==',
  'section_title': 'Business Wire Technology: Mobile/Wireless News',
  'url': 'http://www.businesswire.com/news/home/20150921006305/en/MobileSmith-4.0-Leap-Enterprise-App-Development-REST',
  'country': 'US',
  'title': 'MobileSmith 4.0 a Leap Forward in Enterprise App Development with REST Services, OAuth',
  'performance_score': 0,
  'site': 'businesswire.com',
  'participants_count': 0,
  'title_full': 'MobileSmith 4.0 a Leap Forward in Enterprise App Development with REST Services, OAuth',
  'spam_score': 0.0,
  'site_type': 'news',
  'published':

In [9]:
stopwords_eng = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def process_text(text):
    text = text.replace("\n"," ").replace("\r"," ")
    
    punc_list = '!"#$%()*+,-./:;<=>?@^_{|}~'
    t = str.maketrans(dict.fromkeys(punc_list," "))
    text = text.translate(t)
    
    t = str.maketrans(dict.fromkeys("'`",""))
    text = text.translate(t)
    
    tokens = regexp_tokenize(text,pattern='\s+',gaps=True)
    cleaned_tokens = []
    
    for t in tokens:
        if t not in stopwords_eng:
            l = lemmatizer.lemmatize(t)
            cleaned_tokens.append(l)
    
    return cleaned_tokens

In [10]:
%timeit process_text("Technology News | afr.com Business backs Malcolm Turnbull's 'fresh' look at reform | afr.com")

The slowest run took 4.24 times longer than the fastest. This could mean that an intermediate result is being cached.
325 µs ± 197 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
model = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin",binary=True,limit=10**5)

In [12]:
def get_vec(word):
    try:
        return model[word]
    except:
        return np.zeros(300)

In [13]:
final_data = []
for i in tqdm(range(len(article_data))):
    full_title = article_data[i]['thread']['section_title']+' '+article_data[i]['thread']['title_full']
    url = article_data[i]['thread']['url'],
    tokens = process_text(full_title)
    vector = sum([get_vec(t) for t in tokens]).tolist()
    final_data.append({
        'full_title': full_title,
        'url': url,
        'title_tokens': tokens,
        'sentence_vector': vector
    }
    )

100%|██████████| 1000/1000 [00:00<00:00, 2406.70it/s]


In [14]:
final_data[0]

{'full_title': 'Business Wire Technology: Mobile/Wireless News MobileSmith 4.0 a Leap Forward in Enterprise App Development with REST Services, OAuth',
 'url': ('http://www.businesswire.com/news/home/20150921006305/en/MobileSmith-4.0-Leap-Enterprise-App-Development-REST',),
 'title_tokens': ['Business',
  'Wire',
  'Technology',
  'Mobile',
  'Wireless',
  'News',
  'MobileSmith',
  '4',
  '0',
  'Leap',
  'Forward',
  'Enterprise',
  'App',
  'Development',
  'REST',
  'Services',
  'OAuth'],
 'sentence_vector': [-0.1695556640625,
  -2.09326171875,
  -0.3275146484375,
  -1.363800048828125,
  -0.3109130859375,
  0.042724609375,
  1.2889423370361328,
  1.23516845703125,
  0.711273193359375,
  -0.2864990234375,
  -0.23089599609375,
  0.987762451171875,
  0.3082275390625,
  -0.448486328125,
  -1.177734375,
  -1.1090087890625,
  0.580810546875,
  -0.09619140625,
  -0.97698974609375,
  0.28033447265625,
  -0.08795166015625,
  1.004425048828125,
  0.289337158203125,
  0.253570556640625,
  0.