In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

import string
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import KeyedVectors

In [2]:
# NLP helpers shared by the text-processing code in later cells.
lemmatizer = WordNetLemmatizer()
stopwords_eng = stopwords.words('english')

# Read the dataset from disk and preview its first rows.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,url,headline,summary
0,https://www.independent.co.uk/news/world/europ...,Non-stunned halal and kosher meat not saleable...,Animals must be stunned prior to being killed ...
1,https://techxplore.com/news/2018-07-bacteria-p...,Solar cells powered by hybrid E. coli convert ...,Researchers in Canada have developed an innova...
2,http://www.themalaymailonline.com/malaysia/art...,Southeast Asia unprepared for ISIS attacks say...,Southeast Asia is unprepared for the rapidly-r...
3,http://www.scotsman.com/business/companies/ret...,Speedy Hire rejects calls for merger with HSS,In addition to calls from Toscafund to oust it...
4,http://www.wsj.com/articles/transferwise-plans...,TransferWise to launch China services\n,TransferWise has announced plans to launch a c...


In [3]:
# Number of rows read from the CSV.
len(data)

40000

In [4]:
# Keep only the first 1000 rows for the rest of the notebook.
data = data.iloc[:1000]

In [5]:
# (rows, columns) of the truncated frame.
len(data.index), len(data.columns)

(1000, 3)

In [6]:
# Characters that are replaced by a single space before tokenizing.
_PUNCTUATION = '!"#$%()*+,-./:;<=>?@^_{|}~'

# One translation table, built once: line breaks, non-breaking spaces and
# punctuation become a space; quote characters are deleted outright so that
# "Japan's" collapses to "Japans" instead of splitting into two tokens.
_TEXT_CLEANUP_TABLE = str.maketrans({
    **dict.fromkeys("\n\r\xa0" + _PUNCTUATION, " "),
    **dict.fromkeys("'`", ""),
})


def process_text(text):
    """Convert a raw string into a list of lemmatized, stopword-free tokens.

    Steps:
      1. Replace line breaks, non-breaking spaces and punctuation with spaces
         and delete apostrophes/backticks.
      2. Split on runs of whitespace.
      3. Drop tokens that are English stopwords (see below).
      4. Lemmatize the remaining tokens with the notebook-level ``lemmatizer``.

    Stopword matching:
      * The stopword list is compared with its apostrophes removed, because
        step 1 removes them from the text ("don't" arrives here as "dont").
      * Capitalised tokens ("The", "In", "A") are compared in lower case so
        that sentence-initial stopwords are filtered too. Tokens in any other
        casing (e.g. "US", "IT") are compared verbatim so acronyms that spell
        a stopword are kept.

    The original casing of surviving tokens is preserved.

    Args:
        text: The string to process.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    cleaned = text.translate(_TEXT_CLEANUP_TABLE)

    # Normalise the stopwords the same way the text was normalised; a set also
    # makes each membership test constant-time instead of a list scan.
    stopword_set = {word.replace("'", "") for word in stopwords_eng}

    cleaned_tokens = []
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    for token in regexp_tokenize(cleaned, pattern=r'\s+', gaps=True):
        candidate = token.lower() if token.istitle() else token
        if candidate not in stopword_set:
            cleaned_tokens.append(lemmatizer.lemmatize(token))

    return cleaned_tokens

def get_vec(word):
    """Return the embedding of ``word`` from the notebook-level ``model``.

    Args:
        word: Token to look up.

    Returns:
        numpy.ndarray: The model's vector for ``word``, or an all-zero vector
        of the model's dimensionality when the word is not in the vocabulary.

    Only the out-of-vocabulary ``KeyError`` is handled. Any other failure
    (for example ``model`` not having been loaded yet) propagates instead of
    silently producing zero vectors.
    """
    try:
        return model[word]
    except KeyError:
        # Out-of-vocabulary word: contribute nothing to a summed sentence vector.
        return np.zeros(model.vector_size)

In [7]:
# Load the binary word2vec file, reading at most the first 100000 vectors.
model = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=100000,
)

In [8]:
# Count missing values per column.
data.isnull().sum()

url         0
headline    1
summary     2
dtype: int64

In [9]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [10]:
# Preview the first five remaining rows.
data.iloc[:5]

Unnamed: 0,url,headline,summary
0,https://www.independent.co.uk/news/world/europ...,Non-stunned halal and kosher meat not saleable...,Animals must be stunned prior to being killed ...
1,https://techxplore.com/news/2018-07-bacteria-p...,Solar cells powered by hybrid E. coli convert ...,Researchers in Canada have developed an innova...
2,http://www.themalaymailonline.com/malaysia/art...,Southeast Asia unprepared for ISIS attacks say...,Southeast Asia is unprepared for the rapidly-r...
3,http://www.scotsman.com/business/companies/ret...,Speedy Hire rejects calls for merger with HSS,In addition to calls from Toscafund to oust it...
4,http://www.wsj.com/articles/transferwise-plans...,TransferWise to launch China services\n,TransferWise has announced plans to launch a c...


In [11]:
# Full (untruncated) URL of the first row.
data['url'].iloc[0]

'https://www.independent.co.uk/news/world/europe/halal-kosher-meat-organic-stunning-eu-court-ruling-a8797761.html'

In [12]:
# Build one record per headline: url, raw headline, cleaned tokens and the
# sentence vector (sum of the token embeddings).
# Note: despite its name, data_dict is a list of dicts.
data_dict = []
for url, headline in tqdm(zip(data['url'], data['headline']), total=data.shape[0]):
    # Tokenize once and reuse the result for both fields.
    tokens = process_text(headline)

    # An explicit zero-vector start keeps the result an array even when every
    # token was filtered out; the builtin sum of an empty list is the int 0,
    # which has no .tolist(). float32 is used so the start value does not
    # widen the dtype of the accumulated embeddings.
    sentence_vector = sum(
        (get_vec(token) for token in tokens),
        np.zeros(model.vector_size, dtype=np.float32),
    )

    data_dict.append({
        'url': url,
        'headline': headline,
        'tokens': tokens,
        'sentence_vector': sentence_vector.tolist()
    })

100%|██████████| 997/997 [00:12<00:00, 81.53it/s] 


In [13]:
# Inspect the first record (kept as a one-element list).
print(data_dict[0:1])

[{'url': 'https://www.independent.co.uk/news/world/europe/halal-kosher-meat-organic-stunning-eu-court-ruling-a8797761.html', 'headline': 'Non-stunned halal and kosher meat not saleable as organic: ECJ', 'tokens': ['Non', 'stunned', 'halal', 'kosher', 'meat', 'saleable', 'organic', 'ECJ'], 'sentence_vector': [-0.8790283203125, -0.692626953125, 1.00634765625, 0.5364990234375, -1.270263671875, 0.47216796875, -0.0860595703125, -1.35009765625, -0.19671630859375, 0.77069091796875, 0.7327880859375, -1.1787109375, -0.749725341796875, 0.36083984375, -1.71771240234375, 0.27362060546875, 0.669921875, 0.5650634765625, 0.34503173828125, 0.05712890625, 1.5106201171875, 0.375, 0.765625, 0.29473876953125, -0.5670166015625, -0.27925872802734375, -0.9154052734375, -0.0208740234375, 1.0550537109375, 0.08502197265625, -0.267333984375, -0.8868408203125, 0.73095703125, 0.0072021484375, -0.50311279296875, -0.3544921875, 0.9013214111328125, 0.34814453125, 0.536865234375, -0.2607421875, 0.5683135986328125, -0.

In [14]:
# Tabular view of the first ten records.
pd.DataFrame(data_dict).iloc[:10]

Unnamed: 0,headline,sentence_vector,tokens,url
0,Non-stunned halal and kosher meat not saleable...,"[-0.8790283203125, -0.692626953125, 1.00634765...","[Non, stunned, halal, kosher, meat, saleable, ...",https://www.independent.co.uk/news/world/europ...
1,Solar cells powered by hybrid E. coli convert ...,"[-0.28662109375, 0.886474609375, 0.73211669921...","[Solar, cell, powered, hybrid, E, coli, conver...",https://techxplore.com/news/2018-07-bacteria-p...
2,Southeast Asia unprepared for ISIS attacks say...,"[0.013671875, 0.665771484375, 0.24090576171875...","[Southeast, Asia, unprepared, ISIS, attack, sa...",http://www.themalaymailonline.com/malaysia/art...
3,Speedy Hire rejects calls for merger with HSS,"[-0.567596435546875, 0.98150634765625, -0.0351...","[Speedy, Hire, reject, call, merger, HSS]",http://www.scotsman.com/business/companies/ret...
4,TransferWise to launch China services\n,"[-0.07177734375, 0.197509765625, -0.044921875,...","[TransferWise, launch, China, service]",http://www.wsj.com/articles/transferwise-plans...
5,Pollution and other environmental factor linke...,"[0.11328125, 0.444091796875, 0.038375854492187...","[Pollution, environmental, factor, linked, dem...",http://www.standard.co.uk/news/uk/londoners-at...
6,Soft Silhouette is used to treat sagging skin,"[0.243408203125, 0.9223136901855469, 0.1621093...","[Soft, Silhouette, used, treat, sagging, skin]",http://www.vanitatis.elconfidencial.com/estilo...
7,Japan's new law bans bias against disabilities,"[-0.12664794921875, -0.4520263671875, 0.567382...","[Japans, new, law, ban, bias, disability]",http://www.japantimes.co.jp/news/2016/05/02/re...
8,San Francisco restaurants serve up more co-wor...,"[-0.68701171875, -0.43695068359375, 0.73187255...","[San, Francisco, restaurant, serve, co, workin...",https://www.msn.com/en-ph/foodanddrink/foodnew...
9,UK councils tripled overseas hires of social ...,"[-0.234375, -0.016143798828125, -0.27863311767...","[UK, council, tripled, overseas, hire, social,...",http://www.communitycare.co.uk/2017/07/20/coun...
