In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

import string
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import KeyedVectors

In [2]:
data = pd.read_csv('data/data.csv')
stopwords_eng = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
data.head()

Unnamed: 0,url,headline,summary
0,https://www.independent.co.uk/news/world/europ...,Non-stunned halal and kosher meat not saleable...,Animals must be stunned prior to being killed ...
1,https://techxplore.com/news/2018-07-bacteria-p...,Solar cells powered by hybrid E. coli convert ...,Researchers in Canada have developed an innova...
2,http://www.themalaymailonline.com/malaysia/art...,Southeast Asia unprepared for ISIS attacks say...,Southeast Asia is unprepared for the rapidly-r...
3,http://www.scotsman.com/business/companies/ret...,Speedy Hire rejects calls for merger with HSS,In addition to calls from Toscafund to oust it...
4,http://www.wsj.com/articles/transferwise-plans...,TransferWise to launch China services\n,TransferWise has announced plans to launch a c...


In [3]:
data.shape[0]

40000

In [4]:
data = data.head(1000)

In [5]:
data.shape

(1000, 3)

In [6]:
def process_text(text):
    text = text.replace("\n"," ").replace("\r"," ")
    text = text.replace("\xa0"," ")

    punc_list = '!"#$%()*+,-./:;<=>?@^_{|}~'
    t = str.maketrans(dict.fromkeys(punc_list," "))
    text = text.translate(t)

    t = str.maketrans(dict.fromkeys("'`",""))
    text = text.translate(t)

    tokens = regexp_tokenize(text,pattern='\s+',gaps=True)
    cleaned_tokens = []

    for t in tokens:
        if t not in stopwords_eng:
            l = lemmatizer.lemmatize(t)
            cleaned_tokens.append(l)

    return cleaned_tokens

def get_vec(word):
    try:
        return model[word]
    except:
        return np.zeros(300)

In [7]:
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True,limit=10**5)

In [8]:
data.isna().sum()

url         0
headline    1
summary     2
dtype: int64

In [9]:
data = data.dropna()

In [10]:
data.head(5)

Unnamed: 0,url,headline,summary
0,https://www.independent.co.uk/news/world/europ...,Non-stunned halal and kosher meat not saleable...,Animals must be stunned prior to being killed ...
1,https://techxplore.com/news/2018-07-bacteria-p...,Solar cells powered by hybrid E. coli convert ...,Researchers in Canada have developed an innova...
2,http://www.themalaymailonline.com/malaysia/art...,Southeast Asia unprepared for ISIS attacks say...,Southeast Asia is unprepared for the rapidly-r...
3,http://www.scotsman.com/business/companies/ret...,Speedy Hire rejects calls for merger with HSS,In addition to calls from Toscafund to oust it...
4,http://www.wsj.com/articles/transferwise-plans...,TransferWise to launch China services\n,TransferWise has announced plans to launch a c...


In [11]:
data.iloc[0]['url']

'https://www.independent.co.uk/news/world/europe/halal-kosher-meat-organic-stunning-eu-court-ruling-a8797761.html'

In [12]:
data_dict = []
for i in tqdm(range(data.shape[0])):
    data_dict.append({
        'url':data.iloc[i]['url'],
        'headline':data.iloc[i]['headline'],
        'tokens':process_text(data.iloc[i]['headline']),
        'sentence_vector':sum([get_vec(t) for t in process_text(data.iloc[i]['headline'])]).tolist()
    })

100%|██████████| 997/997 [00:03<00:00, 286.71it/s]


In [13]:
df = pd.DataFrame(data_dict)

In [14]:
for i in tqdm(range(df.shape[0])):
    try:
        if (df.iloc[i]['sentence_vector'] == np.zeros(300)).all():
            df = df.drop([i],axis=0)
    except Exception as e:
        print(e)

100%|██████████| 997/997 [00:00<00:00, 3938.91it/s]

single positional indexer is out-of-bounds





In [15]:
df.iloc[252]

headline           WeWork veterans open wellness-minded co-workin...
sentence_vector    [-0.299072265625, 0.4761962890625, 0.149353027...
tokens             [WeWork, veteran, open, wellness, minded, co, ...
url                http://www.harpersbazaar.com/beauty/health/a16...
Name: 253, dtype: object

In [16]:
pd.DataFrame(data_dict).head(5)

Unnamed: 0,headline,sentence_vector,tokens,url
0,Non-stunned halal and kosher meat not saleable...,"[-0.8790283203125, -0.692626953125, 1.00634765...","[Non, stunned, halal, kosher, meat, saleable, ...",https://www.independent.co.uk/news/world/europ...
1,Solar cells powered by hybrid E. coli convert ...,"[-0.28662109375, 0.886474609375, 0.73211669921...","[Solar, cell, powered, hybrid, E, coli, conver...",https://techxplore.com/news/2018-07-bacteria-p...
2,Southeast Asia unprepared for ISIS attacks say...,"[0.013671875, 0.665771484375, 0.24090576171875...","[Southeast, Asia, unprepared, ISIS, attack, sa...",http://www.themalaymailonline.com/malaysia/art...
3,Speedy Hire rejects calls for merger with HSS,"[-0.567596435546875, 0.98150634765625, -0.0351...","[Speedy, Hire, reject, call, merger, HSS]",http://www.scotsman.com/business/companies/ret...
4,TransferWise to launch China services\n,"[-0.07177734375, 0.197509765625, -0.044921875,...","[TransferWise, launch, China, service]",http://www.wsj.com/articles/transferwise-plans...
