In [1]:
import os
import json
import numpy as np
import pandas as pd
import sqlite3
from nltk.corpus import stopwords

import faiss
from elasticsearch import helpers, Elasticsearch

In [2]:
# Build one de-duplicated stop-word list covering German and English.
stopwords_de = stopwords.words('german')
stopwords_en = stopwords.words('english')

# Going through a set drops words that appear in both languages.
stopwords_all = list(set(stopwords_de + stopwords_en))

In [3]:
def get_label_name(label):
    """Translate a numeric class label into its readable name.

    Returns 'technology' for 1, 'military' for 2, and None for any other value.
    """
    for code, name in ((1, 'technology'), (2, 'military')):
        if label == code:
            return name
    return None
    
def get_modified_vectors(vec_data, dim=512):
    """Stack per-document embeddings into a single 2-D array.

    Args:
        vec_data: Iterable of embeddings (lists or arrays). The elements are
            combined with ``np.array`` and then reshaped, so any nesting such
            as ``[[...]]`` per document is flattened away.
        dim: Number of columns in the result. Defaults to 512, the value that
            was previously hard-coded.

    Returns:
        ``np.ndarray`` of shape ``(total_values // dim, dim)``.

    Raises:
        ValueError: If the total number of values is not a multiple of ``dim``.
    """
    # list() materialises any iterable (including an object-dtype array of
    # lists) in one step; the manual append loop did nothing more than this.
    return np.array(list(vec_data)).reshape(-1, dim)

def doc_actions(data, index_name=None, pipeline_name=None):
    """Yield one Elasticsearch bulk action per document.

    Args:
        data: Iterable of dict-like rows. Each row must contain an 'id' key,
            which is used as the document '_id'; the whole row becomes
            '_source'.
        index_name: Target index. When None, the notebook-level ``index``
            variable is used (the original behaviour).
        pipeline_name: Ingest pipeline. When None, the notebook-level
            ``pipeline`` variable is used (the original behaviour).

    Yields:
        dict: An action with the keys '_index', 'pipeline', '_source', '_id'.
    """
    # Resolve the targets once, when iteration starts. Passing them explicitly
    # avoids depending on the `index` global, which a later cell rebinds to a
    # FAISS index object.
    target_index = index if index_name is None else index_name
    target_pipeline = pipeline if pipeline_name is None else pipeline_name

    for row in data:
        yield {
                '_index': target_index,
                'pipeline': target_pipeline,
                '_source': row,
                '_id': row['id']
            }

In [4]:
# Load the three pickled frames from ../dataframes (relative to the working directory).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# tech_df also carries a 'milt_label' column, which is dropped straight away.
tech_df = pd.read_pickle(f"{os.getcwd()}/../dataframes/tech_df_final.pkl").drop(columns='milt_label')

milt_df = pd.read_pickle(f"{os.getcwd()}/../dataframes/milt_df_final.pkl")

rssitem_df = pd.read_pickle(f"{os.getcwd()}/../dataframes/rss_cache_df.pkl")

In [5]:
# Number of rows in the RSS cache frame.
rssitem_df.shape[0]

510469

In [121]:
# Give both frames the same 'label' column name so they can be concatenated.
tech_df = tech_df.rename({'tech_label': 'label'}, axis='columns')
milt_df = milt_df.rename({'milt_label': 'label'}, axis='columns')

In [122]:
# Stack the two labelled frames row-wise; the original row labels are kept.
df = pd.concat((tech_df, milt_df), axis=0)
df.head(n=2)

Unnamed: 0,id,text,text_len,lang,text_tokens,nc_vec,label
1,210705_news_328064.txt,"PRESS STATEMENT \nMICHAEL R. POMPEO, SECRETARY...",320,en,"[PRESS STATEMENT \nMICHAEL R. POMPEO, SECRETAR...","[[-0.004804640542715788, 1.7863454559119418e-0...",1
28,210705_news_476111.txt,Thanks to a two-year grant commitment from Blo...,483,en,"[a two-year grant commitment, Bloomberg, our s...","[[-0.00989371258765459, -0.0012395973317325115...",1


In [123]:
# Class balance of the combined frame.
df['label'].value_counts()

1    9574
2    1939
Name: label, dtype: int64

In [124]:
# Readable class name per row. An element-wise map over the single column
# replaces the slower row-wise DataFrame.apply(axis=1).
df['label_name'] = df['label'].map(get_label_name)

# Remove only a trailing '.txt' from the id. Unlike a blind [:-4] slice, this
# leaves ids without the suffix intact, so the cell can be re-run safely.
df['id'] = df['id'].str.replace(r'\.txt$', '', regex=True)

In [125]:
# Left join on 'id': every labelled document is kept, with RSS columns attached where available.
df = pd.merge(df, rssitem_df, on='id', how='left')

In [126]:
# 'pubDate' is parsed as seconds since the Unix epoch.
df['pubDate'] = pd.to_datetime(df.pubDate, unit='s')
df.sample(n=3)

Unnamed: 0,id,text,text_len,lang,text_tokens,nc_vec,label,label_name,title,pubDate,url
742,210705_news_13798,Roll up! Roll up! The world’s biggest climate ...,1133,en,"[the world’s big climate polluter, the world’s...","[[-0.0036311428993940353, -0.02412171103060245...",1,technology,Saudi Aramco IPO: the ultimate marriage betwee...,2019-11-03 12:49:41,https://www.theguardian.com/business/2019/nov/...
4337,210705_news_227416,"A large, multinational technology company got ...",516,en,"[a large, multinational technology company, a ...","[[-0.01047287043184042, -0.00840772595256567, ...",1,technology,Chinese bank requires foreign firm to install ...,2020-06-26 12:00:49,https://arstechnica.com/?p=1687527
4446,210705_news_198526,Questions have been raised by Australian infec...,1310,en,"[question, australian infectious disease resea...","[[-0.010538739152252674, 0.0062672751955688, -...",1,technology,Questions raised over hydroxychloroquine study...,2020-05-28 02:27:34,https://www.theguardian.com/science/2020/may/2...


In [25]:
# Cache of the merged frame. Reuse the pickle when it exists; otherwise write
# the `df` built above, so a fresh checkout can run top to bottom without
# toggling a commented-out line. Delete the file to force a rebuild.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
final_df_path = os.getcwd()+'/../dataframes/final_dataframe.pkl'
if os.path.exists(final_df_path):
    df = pd.read_pickle(final_df_path)
else:
    df.to_pickle(final_df_path)

In [35]:
# Duplicate the document text under a second column name; it is exported to
# Elasticsearch as its own 'autocomplete' field further down.
df['autocomplete'] = df.text

## Elasticsearch index creation and ingestion

In [27]:
# Names of the target index and ingest pipeline; doc_actions() reads these globals.
index = 'xxx_scraped_docs'
pipeline = 'multilang_pipe'

# Connection settings come from the environment, so no secret is stored in the
# notebook. The non-secret defaults match the previous hard-coded values.
username = os.environ.get('ES_USERNAME', 'elastic')
password = os.environ.get('ES_PASSWORD')
if not password:
    raise RuntimeError('Set the ES_PASSWORD environment variable before running this cell')

hostname = os.environ.get('ES_HOSTNAME', 'localhost')
port = os.environ.get('ES_PORT', '9200')

# Credentials are passed via basic_auth (elasticsearch-py 8.x) rather than
# embedded in the URL, where special characters would need percent-encoding
# and the secret could end up in logs and tracebacks.
es = Elasticsearch(f"http://{hostname}:{port}", basic_auth=(username, password))

In [36]:
# Index mapping/settings definition used when the index is created.
with open('config/lpf_index_mappings.json') as mapping_file:
    index_mapping_lpf = json.load(mapping_file)

# Ingest pipeline definition registered under the pipeline name.
with open('config/lpf_pipeline.json') as pipeline_file:
    pipeline_lpf = json.load(pipeline_file)

In [37]:
# Recreate the index from scratch: delete any existing index of that name
# (a 404 for a missing index is ignored), then create it from the loaded JSON.
# WARNING: this removes every document currently stored in `index`.
# NOTE(review): the truncated warning text in the recorded output may be a
# deprecation notice for the `ignore=` / `body=` arguments — confirm against
# the installed elasticsearch-py version.
es.indices.delete(index=index, ignore=404)
es.indices.create(index=index, body=index_mapping_lpf)

# Same delete-then-create sequence for the ingest pipeline.
es.ingest.delete_pipeline(id=pipeline, ignore=404)
es.ingest.put_pipeline(id=pipeline, body=pipeline_lpf)

  """Entry point for launching an IPython kernel.
  
  after removing the cwd from sys.path.
  """


ObjectApiResponse({'acknowledged': True})

In [38]:
# Keep only the columns that are indexed, renamed to the Elasticsearch field
# names, then convert to one dict per document.
df_es = (
    df.loc[:, ['id', 'text', 'label_name', 'title', 'pubDate', 'url', 'autocomplete']]
      .rename(columns={
          'text': 'contents',
          'label_name': 'label',
          'pubDate': 'published_date',
          'url': 'page_url',
      })
)
doc_dict = df_es.to_dict(orient='records')

In [39]:
# Stream all documents to Elasticsearch in batches of 1000. The call returns a
# (success_count, errors) tuple, which is shown as the cell output.
helpers.bulk(
    es,
    doc_actions(doc_dict),
    chunk_size=1000,
    request_timeout=600,
    refresh='wait_for',
)

  """Entry point for launching an IPython kernel.


(11513, [])

In [34]:
# Explicitly refresh the index so the documents just ingested are searchable.
# NOTE(review): this must run while `index` still holds the index name, i.e.
# before the FAISS section rebinds `index`.
es.indices.refresh(index=index)

ObjectApiResponse({'_shards': {'total': 1, 'successful': 1, 'failed': 0}})

## FAISS index creation and ingestion

In [14]:
# Stack the per-document vectors into an (n_docs, 512) matrix and cast it to
# float32 before it is handed to FAISS.
doc_embeddings = get_modified_vectors(df['nc_vec'].values).astype(np.float32)
doc_embeddings.shape

(11513, 512)

In [15]:
# Flat L2 index whose dimensionality is the embedding width.
# NOTE(review): this rebinds `index`, which previously held the Elasticsearch
# index name read by doc_actions() and es.indices.refresh(). Re-running the
# ingestion cells after this one would pass a FAISS object as the index name;
# consider renaming this variable (e.g. faiss_index) in all FAISS cells.
index = faiss.IndexFlatL2(doc_embeddings.shape[1])

In [16]:
# Add every document vector to the index. The later df.iloc lookup relies on
# the ids returned by search matching row positions in doc_embeddings / df.
index.add(doc_embeddings)

In [17]:
# Persist the populated index under ../../models (relative to the working directory).
faiss.write_index(index, f"{os.getcwd()}/../../models/vector.index")

In [18]:
# Sanity check: query with the first document's own vector, as a (1, dim)
# slice, and fetch its 3 nearest neighbours. The result is a
# (distances, ids) pair; the document itself should come back at distance 0.
result = index.search(doc_embeddings[:1], 3)
result

(array([[0.        , 0.0164094 , 0.01658845]], dtype=float32),
 array([[   0, 2386, 9180]], dtype=int64))

In [19]:
# Second element of the search result holds the neighbour ids; row 0 belongs
# to the single query vector.
result[1][0]

array([   0, 2386, 9180], dtype=int64)

In [20]:
# Class balance of the frame the embeddings were built from.
df['label'].value_counts()

1    9574
2    1939
Name: label, dtype: int64

In [21]:
# Show the documents behind the neighbour ids. The lookup is positional, so it
# assumes df rows are in the same order as the vectors added to the index.
df.take(result[1][0])

Unnamed: 0,id,text,text_len,lang,text_tokens,nc_vec,label,label_name,title,pubDate,url
0,210705_news_328064,"PRESS STATEMENT \nMICHAEL R. POMPEO, SECRETARY...",320,en,"[PRESS STATEMENT \nMICHAEL R. POMPEO, SECRETAR...","[[-0.004804640542715788, 1.7863454559119418e-0...",1,technology,United States Charges Russian Military Intelli...,2020-10-20 07:53:39,https://ru.usembassy.gov/united-states-charges...
2386,210705_news_327019,The alleged cyberattackers hacked into softwar...,884,en,"[the allege cyberattacker, software, destructi...","[[-0.007118138950318098, 0.0011028514709323645...",1,technology,6 Russian military officers charged with a wor...,2020-10-19 18:08:00,https://www.cnn.com/2020/10/19/politics/russia...
9180,210705_news_327510,WASHINGTON - U.S. prosecutors on Monday anno...,942,en,"[WASHINGTON - U.S. prosecutor, Monday, charge...","[[-0.0029839028138667345, 0.005981412716209888...",1,technology,US Charges Six Russian Military Officers in Gl...,2020-10-19 23:26:15,https://www.voanews.com/usa/us-charges-six-rus...
