In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
import string
import re
import io
import ssl
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.connection import create_ssl_context
import urllib3
urllib3.disable_warnings()

open_distro_ssl_context = create_ssl_context()
# next two lines are if you're running localhost with a self-signed cert (aka docker)
open_distro_ssl_context.check_hostname = False
open_distro_ssl_context.verify_mode = ssl.CERT_NONE

es = Elasticsearch(
    scheme="https",
    hosts=[ { 'port': 9200, 'host': 'localhost' } ],
    ssl_context=open_distro_ssl_context,
    http_auth=("admin", "admin"), 
    timeout=30, 
    max_retries=10, 
    retry_on_timeout=True
)

# repalce the path with your json data file
df = pd.read_json("./dblp-extrait/dblp-extrait.json", lines=True)
# df.head()

In [2]:
# this var should be the col you want to use for autocompletion
col_to_keep = ["abstract", "title"]

for c in df.columns:
    if c not in col_to_keep:
        print(c, " removed")
        df = df.drop(columns=c)
df.dropna(inplace=True)
# df.head()

authors  removed
n_citation  removed
references  removed
venue  removed
year  removed
id  removed


In [3]:
df.size

50662

In [4]:
'''
import inflect

# used for semantic preprocessing
p = inflect.engine()

# kind of long maybe find another solution ?
def singularize(word):
    a = p.singular_noun(word)
    return a if a is not False else word
'''

# faster but less semantic
'''
def singularize(word):
    return word[:-1] if len(word) > 0 and word[-1] == 's' and (len(word) > 2 and word[-2] != 'i') else word
'''

"\ndef singularize(word):\n    return word[:-1] if len(word) > 0 and word[-1] == 's' and (len(word) > 2 and word[-2] != 'i') else word\n"

In [5]:
# TODO use a list of common word by language (like the, for ...) and remove them from the string in this function
def clean_text(text):
    '''
    take a str and return a preprocessed str
    current preprocessing are :
    lowercase
    punctuation removal
    singularization -> not anymore too long and dont give good results
    '''
    words = text.split()
    stripped =  [re.sub(r"[,.;@#?!&$-]+\ *", " ", w).lower() for w in words]
    res = " ".join(stripped)
    return res

In [6]:

corpus = []
pbar = tqdm(total=df.size // 2)
for item in df.iterrows():
    item = item[1]
    corpus.append((clean_text(item[1]), clean_text(item[0])))

    pbar.update()
pbar.close()

100%|██████████| 25331/25331 [00:10<00:00, 2303.13it/s]


In [7]:
print(corpus[0])

('preliminary design of a network protocol learning tool based on the comprehension of high school students: design by an empirical study using a simple mind map', 'the purpose of this study is to develop a learning tool for high school students studying the scientific aspects of information and communication net  works  more specifically  we focus on the basic principles of network proto  cols as the aim to develop our learning tool  our tool gives students hands on experience to help understand the basic principles of network protocols ')


In [8]:
print(corpus[256])

('loose particle classification using a new wavelet fisher discriminant method', 'loose particles left inside aerospace components or equipment can cause catastrophic failure in aerospace industry  it is vital to identify the material type of these loose particles and eliminate them  this is a classification problem  and autoregressive (ar) model and learning vector quantization (lvq) networks have been used to classify loose particles inside components  more recently  the test objects have been changed from components to aerospace equipments  to improve classification accuracy  more data samples often have to be dealt with  the difficulty is that these data samples contain redundant information  and the aforementioned two conventional methods are unable to process redundant information  thus the classification accuracy is deteriorated  in this paper  the wavelet fisher discriminant is investigated for loose particle classifications  first  the fisher model is formulated as a least squ

In [9]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id
utils = ["embedd", "id2word", "word2id"]
ln_supported = ["en", "de", "es", "fr"]

lang_utils = { ln: {utils[i]: d for i, d in enumerate(load_vec(f'wiki.multi.{ln}.vec'))} for ln in tqdm(ln_supported)}

100%|██████████| 4/4 [00:14<00:00,  3.66s/it]


In [10]:
import nltk
nltk.download('stopwords')
from langdetect import detect
from nltk.corpus import stopwords 

stop_words = {"en": set(stopwords.words('english')), "es": set(stopwords.words('spanish')),"de": set(stopwords.words('german')),"fr": set(stopwords.words('french')),} 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/qfeuilla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import fasttext

PRETRAINED_MODEL_PATH = 'lid.176.bin'
model = fasttext.load_model(PRETRAINED_MODEL_PATH)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def dist(a, b):
    return cosine_similarity(a, b)

In [14]:
'''
def vectorize_v1(stop_words, ln_supported, lang_model, lang_utils, sent, perc_to_remove=0.1):
    embedds = []
    lang = lang_model.predict([sent])[0][0][0][-2:]
    if lang not in ln_supported:
        return np.zeros((600))
    word2id = lang_utils[lang]["word2id"]
    embedd = lang_utils[lang]["embedd"]
    words = sent.split()
    while len(words) and words[0] not in word2id:
        words = words[1:]
    if len(words) == 0:
        return np.zeros((600))
    ew = embedd[word2id[words[0]]]
    ew2 = None
    for i in range(1, len(words)):
        if words[i] not in stop_words[lang]:
            try:
                ew2 = embedd[word2id[words[i]]]
                embedds.append(np.concatenate([ew, ew2]))
                embedds.append(np.concatenate([ew2, ew]))
                ew = ew2
            except:
                None
    
    ew = embedd[word2id[words[0]]]
    if len(embedds) == 0:
        embedds.append(np.concatenate([ew, ew]))
    centroid = np.sum(embedds, axis=0) / len(embedds)
    return centroid
'''

'\ndef vectorize_v1(stop_words, ln_supported, lang_model, lang_utils, sent, perc_to_remove=0.1):\n    embedds = []\n    lang = lang_model.predict([sent])[0][0][0][-2:]\n    if lang not in ln_supported:\n        return np.zeros((600))\n    word2id = lang_utils[lang]["word2id"]\n    embedd = lang_utils[lang]["embedd"]\n    words = sent.split()\n    while len(words) and words[0] not in word2id:\n        words = words[1:]\n    if len(words) == 0:\n        return np.zeros((600))\n    ew = embedd[word2id[words[0]]]\n    ew2 = None\n    for i in range(1, len(words)):\n        if words[i] not in stop_words[lang]:\n            try:\n                ew2 = embedd[word2id[words[i]]]\n                embedds.append(np.concatenate([ew, ew2]))\n                embedds.append(np.concatenate([ew2, ew]))\n                ew = ew2\n            except:\n                None\n    \n    ew = embedd[word2id[words[0]]]\n    if len(embedds) == 0:\n        embedds.append(np.concatenate([ew, ew]))\n    centroid 

In [15]:
'''
# add outlier removal
def vectorize_v2(stop_words, ln_supported, lang_model, lang_utils, sent, perc_to_remove=0.1):
    embedds = []
    lang = lang_model.predict([sent])[0][0][0][-2:]
    if lang not in ln_supported:
        return np.zeros((600))
    word2id = lang_utils[lang]["word2id"]
    embedd = lang_utils[lang]["embedd"]
    words = sent.split()
    while len(words) and words[0] not in word2id:
        words = words[1:]
    if len(words) == 0:
        return np.zeros((600))
    ew = embedd[word2id[words[0]]]
    ew2 = None
    for i in range(1, len(words)):
        if words[i] not in stop_words[lang]:
            try:
                ew2 = embedd[word2id[words[i]]]
                embedds.append(np.concatenate([ew, ew2]))
                embedds.append(np.concatenate([ew2, ew]))
                ew = ew2
            except:
                None
    
    ew = embedd[word2id[words[0]]]
    if len(embedds) == 0:
        embedds.append(np.concatenate([ew, ew]))
    centroid = np.sum(embedds, axis=0) / len(embedds) 
    if len(embedds) * perc_to_remove >= 1:
        embedds = np.array(embedds)
        embedds = sorted(embedds.tolist(), key=lambda x: -np.array(dist([x], [centroid]))[0][0])
        centroid = np.sum(embedds[:-int(len(embedds) * perc_to_remove)], axis=0) / len(embedds) 
    return centroid
'''

'\n# add outlier removal\ndef vectorize_v2(stop_words, ln_supported, lang_model, lang_utils, sent, perc_to_remove=0.1):\n    embedds = []\n    lang = lang_model.predict([sent])[0][0][0][-2:]\n    if lang not in ln_supported:\n        return np.zeros((600))\n    word2id = lang_utils[lang]["word2id"]\n    embedd = lang_utils[lang]["embedd"]\n    words = sent.split()\n    while len(words) and words[0] not in word2id:\n        words = words[1:]\n    if len(words) == 0:\n        return np.zeros((600))\n    ew = embedd[word2id[words[0]]]\n    ew2 = None\n    for i in range(1, len(words)):\n        if words[i] not in stop_words[lang]:\n            try:\n                ew2 = embedd[word2id[words[i]]]\n                embedds.append(np.concatenate([ew, ew2]))\n                embedds.append(np.concatenate([ew2, ew]))\n                ew = ew2\n            except:\n                None\n    \n    ew = embedd[word2id[words[0]]]\n    if len(embedds) == 0:\n        embedds.append(np.concatenate([e

In [16]:
'''
a = vectorize_v2(stop_words, ln_supported, model, lang_utils, "loose particles left inside aerospace components or equipment can cause catastrophic failure in aerospace industry  it is vital to identify the material type of these loose particles and eliminate them  this is a classification problem  and autoregressive (ar) model and learning vector quantization (lvq) networks have been used to classify loose particles inside components  more recently  the test objects have been changed from components to aerospace equipments  to improve classification accuracy  more data samples often have to be dealt with  the difficulty is that these data samples contain redundant information  and the aforementioned two conventional methods are unable to process redundant information  thus the classification accuracy is deteriorated  in this paper  the wavelet fisher discriminant is investigated for loose particle classifications  first  the fisher model is formulated as a least squares problem with linear in the parameters structure  then  the previously proposed two stage subset selection method is used to build a sparse wavelet fisher model in order to reduce redundant information  experimental results show the wavelet fisher classification method can perform better than ar model and lvq networks")
'''

'\na = vectorize_v2(stop_words, ln_supported, model, lang_utils, "loose particles left inside aerospace components or equipment can cause catastrophic failure in aerospace industry  it is vital to identify the material type of these loose particles and eliminate them  this is a classification problem  and autoregressive (ar) model and learning vector quantization (lvq) networks have been used to classify loose particles inside components  more recently  the test objects have been changed from components to aerospace equipments  to improve classification accuracy  more data samples often have to be dealt with  the difficulty is that these data samples contain redundant information  and the aforementioned two conventional methods are unable to process redundant information  thus the classification accuracy is deteriorated  in this paper  the wavelet fisher discriminant is investigated for loose particle classifications  first  the fisher model is formulated as a least squares problem wit

In [17]:
#accelerate v2
def vectorize_v3(stop_words, ln_supported, lang_model, lang_utils, sent, perc_to_remove=0.1):
    embedds = []
    lang = lang_model.predict([sent])[0][0][0][-2:]
    if lang not in ln_supported:
        return np.zeros((600))
    word2id = lang_utils[lang]["word2id"]
    embedd = lang_utils[lang]["embedd"]
    words = sent.split()
    while len(words) and words[0] not in word2id:
        words = words[1:]
    if len(words) == 0:
        return np.zeros((600))
    ew = embedd[word2id[words[0]]]
    ew2 = None
    for i in range(1, len(words)):
        if words[i] not in stop_words[lang]:
            try:
                ew2 = embedd[word2id[words[i]]]
                embedds.append(np.concatenate([ew, ew2]))
                embedds.append(np.concatenate([ew2, ew]))
                ew = ew2
            except:
                None
    
    ew = embedd[word2id[words[0]]]
    if len(embedds) == 0:
        embedds.append(np.concatenate([ew, ew]))
    centroid = np.sum(embedds, axis=0) / len(embedds) 
    if len(embedds) * perc_to_remove >= 1:
        embedds = np.array(embedds)
        sims = np.squeeze(dist(embedds, [centroid]))
        zipped_lists = np.array(sorted(zip(embedds, sims), key=lambda x:x[1]), dtype=np.object)
        embedds = zipped_lists[:, 0]
        centroid = np.sum(embedds[:-int(len(embedds) * perc_to_remove)], axis=0) / len(embedds) 
    return centroid

In [18]:
# a = vectorize_v3(stop_words, ln_supported, model, lang_utils, "loose particles left inside aerospace components or equipment can cause catastrophic failure in aerospace industry  it is vital to identify the material type of these loose particles and eliminate them  this is a classification problem  and autoregressive (ar) model and learning vector quantization (lvq) networks have been used to classify loose particles inside components  more recently  the test objects have been changed from components to aerospace equipments  to improve classification accuracy  more data samples often have to be dealt with  the difficulty is that these data samples contain redundant information  and the aforementioned two conventional methods are unable to process redundant information  thus the classification accuracy is deteriorated  in this paper  the wavelet fisher discriminant is investigated for loose particle classifications  first  the fisher model is formulated as a least squares problem with linear in the parameters structure  then  the previously proposed two stage subset selection method is used to build a sparse wavelet fisher model in order to reduce redundant information  experimental results show the wavelet fisher classification method can perform better than ar model and lvq networks")

In [19]:
def similarity(stop_words, ln_supported, lang_model, lang_utils, s1, s2, prt=True):
    sim = dist([vectorize_v3(stop_words, ln_supported, lang_model, lang_utils, s1)], [vectorize_v3(stop_words, ln_supported, lang_model, lang_utils, s2)])[0]
    if prt:
        print(f"similarity between \"{s1}\" and \"{s2}\" : {sim}")
    return sim

In [20]:
s2 = similarity(stop_words, ln_supported, model, lang_utils, "the purpose of this study is to develop a learning tool for high school student studying the scientific aspect of information and communication network", "scientific")
s3 = similarity(stop_words, ln_supported, model, lang_utils, "introduction public health surveillance system need to be refined", "scientific")

similarity between "the purpose of this study is to develop a learning tool for high school student studying the scientific aspect of information and communication network" and "scientific" : [0.53081813]
similarity between "introduction public health surveillance system need to be refined" and "scientific" : [0.4004291]


In [21]:
def doc_generator(corpus):
    for line in tqdm(corpus):
        yield {
            "_index" : 'toy_data_docs_embb',
            "_source" : {
                "title": line[0],
                "data" : line[1],
                "title_embeddings": vectorize_v3(stop_words, ln_supported, model, lang_utils, line[0]),
                "data_embeddings": vectorize_v3(stop_words, ln_supported, model, lang_utils, line[1])
            },
        }

before executing next cell execute in kibana :
```

DELETE /toy_data_docs_embb

PUT /toy_data_docs_embb
{
  "settings": {
    "index": {
      "knn": true,
      "knn.space_type": "cosinesimil"
    }
  },
  "mappings": {
    "properties": {
      "title": { 
        "type" : "text"
      },
      "data": { 
        "type" : "text"
      },
      "title_embeddings": {
        "type": "knn_vector", 
        "dimension": 600
      },
      "data_embeddings": {
        "type": "knn_vector", 
        "dimension": 600
      }
    }
  }
}
```


In [22]:
out = helpers.bulk(es, doc_generator(corpus))

100%|██████████| 25331/25331 [04:05<00:00, 103.35it/s]


In [85]:
# full vector search
size = 15

query = clean_text("trump")
query_vec = vectorize_v3(stop_words, ln_supported, model, lang_utils, query)

query_body = {
  "size": size,
  "query": {
    "script_score": {
      "query": {
        "match_all": { }
      },
      "script": {
        "source": "cosineSimilarity(params.query_value, doc[params.field1]) + cosineSimilarity(params.query_value, doc[params.field2])",
        "params": {
          "field1": "title_embeddings",
          "field2": "data_embeddings",
          "query_value": query_vec
        }
      }
    }
  }
}

res = es.search(index="toy_data_docs_embb", body=query_body)
print('\n\n'.join(['; '.join([res['hits']['hits'][i]['_source']['title'] + '', res['hits']['hits'][i]['_source']['data']]) for i in range(len(res['hits']['hits']))]))

understanding election candidate approval ratings using social media data; the last few years has seen an exponential increase in the amount of social media data generated daily  thus  researchers have started exploring the use of social media data in building recommendation systems  prediction models  improving disaster management  discovery trending topics etc  an interesting application of social media is for the prediction of election results  the recently conducted 2012 us presidential election was the "most tweeted" election in history and provides a rich source of social media posts  previous work on predicting election outcomes from social media has been largely been based on sentiment about candidates  total volumes of tweets expressing electoral polarity and the like  in this paper we use a collection of tweets to predict the daily approval ratings of the two us presidential candidates and also identify topics that were causal to the approval ratings 

the significance of bid

In [23]:
print(res['took'])

2965


In [31]:
# hybrid v1
size = 15

query = clean_text("lion")
query_vec = vectorize_v3(stop_words, ln_supported, model, lang_utils, query)

query_body = {
  "size": size,
  "query": {
    "function_score": {
      "query": {
        "multi_match": { 
          "query": query,
          "fields": ["data", "title"]
        }
      },
      "script_score": {
        "script": {
          "source": "cosineSimilarity(params.query_value, doc[params.field1]) + cosineSimilarity(params.query_value, doc[params.field2])",
          "params": {
            "field1": "title_embeddings",
            "field2": "data_embeddings",
            "query_value": query_vec
          }
        }
      }
    }
  }
}

res = es.search(index="toy_data_docs_embb", body=query_body)
print('\n\n'.join(['; '.join([res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['data'], f"        score {res['hits']['hits'][i]['_score']}"]) for i in range(len(res['hits']['hits']))]))

phishing attacks detection using genetic programming; phishing is a real threat on the internet nowadays  according to a re  port released by an american security firm  rsa  there have been approximately 33 000 phishing attacks globally each month in 2012  leading to a loss of  687 mil  lion  therefore  fighting against phishing attacks is of great importance  one popu  lar and widely deployed solution with browsers is to integrate a blacklist sites into them  however  this solution  which is unable to detect new attacks if the database is out of date  appears to be not effective when there are a lager number of phish  ing attacks created very day  in this paper  we propose a solution to this problem by applying genetic programming to phishing detection problem  we conducted the experiments on a data set including both phishing and legitimate sites collected from the internet  we compared the performance of genetic programming with a number of other machine learning techniques and the 

In [86]:
# hybrid v2
size = 15

query = clean_text("trump")
query_vec = vectorize_v3(stop_words, ln_supported, model, lang_utils, query)

query_body = {
  "size": size,
  "query": {
    "function_score": {
      "query": {
        "match_all": { }
      },
      "functions": [
      {
        "filter" : {
          "multi_match": { 
            "query": query,
            "fields": ["data", "title"]
          }
        },
        "weight": 2,
      },
      {
        "script_score" : {
          "script" : {
            "source": "cosineSimilarity(params.query_value, doc[params.field1]) + cosineSimilarity(params.query_value, doc[params.field2])",
            "params": {
              "field1": "title_embeddings",
              "field2": "data_embeddings",
              "query_value": query_vec
            }
          }
        },
        "weight": 1
      }
      ],
      "score_mode": "sum",
      "boost_mode": "sum"
    }
  }
}

res = es.search(index="toy_data_docs_embb", body=query_body)
print('\n\n'.join(['; '.join([res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['data'], f"        score {res['hits']['hits'][i]['_score']}"]) for i in range(len(res['hits']['hits']))]))

a knowledge framework for natural language analysis; recent research in language analysis and language generation has highlighted the role of knowledge representation in both processes  certain knowledge representation foundations  such as structured inheritance networks and feature based linguistic representations  have proved useful in a variety of language processing tasks  augmentations to this common framework  however  are required to handle particular issues  such as the role relationship problem: the task of determining how roles  or slots  of a given frame  are filled based on knowledge about other roles  three knowledge structures are discussed that address this problem  the semantic interpreter of an analyzer called trump (transportable understanding mechanism package) uses these structures to determine the fillers of roles effectively without requiring excessive specialized information about each frame ;         score 3.5421977

understanding election candidate approval rat

In [38]:
# hybrid v3
size = 500

query = clean_text("cosmologique")
query_vec = vectorize_v3(stop_words, ln_supported, model, lang_utils, query)

query_body = {
  "size": size,
  "query": {
    "function_score": {
      "query": {
        "bool": { 
          "should" : [
            {
              "multi_match" : { 
                "query": query,
                "fields": ["data", "title"]
              }
            },
            {
              "match_all": { }
            }
          ],
          "minimum_should_match" : 0
        }
      },
      "functions": [
      {
        "script_score" : {
          "script" : {
            "source": "cosineSimilarity(params.query_value, doc[params.field1]) + cosineSimilarity(params.query_value, doc[params.field2])",
            "params": {
              "field1": "title_embeddings",
              "field2": "data_embeddings",
              "query_value": query_vec
            }
          }
        },
        "weight": 5
      }
      ],
      "score_mode": "sum",
      "boost_mode": "sum"
    }
  }
}

res = es.search(index="toy_data_docs_embb", body=query_body)
print('\n\n'.join(['; '.join([res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['data'], f"        score {res['hits']['hits'][i]['_score']}"]) for i in range(len(res['hits']['hits']))]))

sets ;         score 6.2425756

structure preserving runge kutta methods for stochastic hamiltonian equations with additive noise; there has been considerable recent work on the development of energy conserving one step methods that are not symplectic  here we extend these ideas to stochastic hamiltonian problems with additive noise and show that there are classes of runge kutta methods that are very effective in preserving the expectation of the hamiltonian  but care has to be taken in how the wiener increments are sampled at each timestep  some numerical simulations illustrate the performance of these methods ;         score 6.240659

local and global intrinsic dimensionality estimation for better chemical space representation; in this paper  local and global intrinsic dimensionality estimation methods are reviewed  the aim of this paper is to illustrate the capacity of these methods in generating a lower dimensional chemical space with minimum information error  we experimented with

In [39]:
print(res['took'])

3523


In [105]:
# classic match to compare
query = "3d model"
query_vec = vectorize_v3(stop_words, ln_supported, model, lang_utils, clean_text(query))

query_body = {
    "size": 15,
    "query": {
      "multi_match": { 
            "query": query,
            "fields": ["data", "title"]
        }
    }
}
res = es.search(index="toy_data_docs_embb", body=query_body)
print('\n\n'.join(['; '.join([res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['data'], f"        score {res['hits']['hits'][i]['_score']}"]) for i in range(len(res['hits']['hits']))]))

3d model based segmentation of 3d biomedical images; a central task in biomedical image analysis is the segmen  tation and quantification of 3d image structures  a large variety of seg  mentation approaches have been proposed including approaches based on different types of deformable models  a main advantage of deformable models is that they allow incorporating a priori information about the considered image structures  in this contribution we give a brief overview of often used deformable models such as active contour models  statisti  cal shape models  and analytic parametric models  moreover  we present in more detail 3d analytic parametric intensity models  which enable accurate and robust segmentation and quantification of 3d image struc  tures  such parametric models have been successfully used in different biomedical applications  for example  for the localization of 3d anatom  ical point landmarks in 3d mr and ct images  for the quantification of vessels in 3d mra and cta imag