In [2]:
import os
import re
import string
import numpy as np

import msgpack
import msgpack_numpy as m
m.patch()

import redis
import pandas as pd
from nltk.corpus import stopwords

import cleantext

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

In [3]:
# Load the pickled article dataframe and keep only the columns listed below.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_xlm = pd.read_pickle('final_dataframe.pkl').loc[
    :, ['id', 'text', 'lang', 'title', 'pubDate', 'label', 'text_len']
]
# Show two randomly drawn rows as a quick sanity check.
df_xlm.sample(n=2)

Unnamed: 0,id,text,lang,title,pubDate,label,text_len
6616,210705_news_415060,Consensus is Harder Than It Looks And it looks...,en,Consensus Is Harder Than It Looks (2020),2021-02-09 10:03:48,1,1277
806,210705_news_258006,BT has reported a near-70% surge in customers ...,en,Working from home fuels surge in BT sign-ups f...,2020-07-31 12:59:25,1,337


In [41]:
# Number of rows in the dataframe.
df_xlm.shape[0]

26954

In [4]:
# Load the ranking dataframe; the path is resolved relative to the current
# working directory, so the notebook must be started from its own folder.
final_df = pd.read_pickle(f"{os.getcwd()}/../dataframes/quantum_rank_df.pkl")
# Preview the first two rows.
final_df.head(n=2)

Unnamed: 0,id,query,ranking_label,text,text_len,lang,text_tokens,nc_vec,label,label_name,title,pubDate,url
0,210705_news_53540,Quantentechnologie,3,Nach Cyberangriffen auf den Bundestag und das ...,445,de,"[Cyberangriffe, der Bundestag, der Datennetzwe...","[[-0.01808716543018818, -0.0072739748284220695...",1,technology,Netze deutscher Ministerien: Quantenkommunikat...,2019-12-16 10:26:15,https://www.spiegel.de/netzwelt/netzpolitik/qu...
1,210705_news_509066,Quantentechnologie,2,"The following is the May 24, 2021 Congressiona...",506,en,"[the following, the May 24, 2021 Congressional...","[[-0.009140371344983578, -0.002850867807865143...",1,technology,Report on Military Applications for Quantum Co...,2021-05-27 14:12:00,https://news.usni.org/2021/05/27/report-on-mil...


In [6]:
# Load the TF-Hub model stored under ../../models/USE_model (relative to the
# current working directory). Every embedding helper below reads this global.
tf_model = hub.load(f"{os.getcwd()}/../../models/USE_model")

In [6]:
# custom_pipeline = [preprocessing.fillna,
#                    preprocessing.remove_html_tags,
#                    preprocessing.remove_urls,
#                    preprocessing.remove_digits,
#                    preprocessing.remove_whitespace,
#                   ]

def get_clean_text(text):
    """Strip URLs and HTML-like tags from ``text`` and normalise whitespace.

    The URL and tag removal run *before* the whitespace normalisation so that
    the gaps they leave behind (double or trailing spaces) are cleaned up by
    that final pass instead of surviving into the result.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Text without ``http...`` tokens, without ``<...>`` spans, and with
        extra whitespace removed by ``cleantext.clean``.
    """
    # Remove anything that starts with "http" up to the next whitespace.
    without_urls = re.sub(r'http\S+', '', text)

    # Non-greedy match of "<...>"; DOTALL so a tag that spans a line break is
    # removed as well.
    without_tags = re.sub(r'<.*?>', '', without_urls, flags=re.DOTALL)

    return cleantext.clean(without_tags,
            clean_all= False, # Do not run every cleaning operation
            extra_spaces=True ,  # Remove extra white spaces
            )

# Clean every article body and store the result next to the raw text.
final_df['text_clean'] = final_df['text'].apply(get_clean_text)

In [7]:
# Compare raw and cleaned text side by side.
final_df.loc[:, ['text', 'text_clean']]

Unnamed: 0,text,text_clean
0,Nach Cyberangriffen auf den Bundestag und das ...,Nach Cyberangriffen auf den Bundestag und das ...
1,"The following is the May 24, 2021 Congressiona...","The following is the May 24, 2021 Congressiona..."
2,\n No previous knowledge of quantum phy...,No previous knowledge of quantum physics is re...
3,Grundsätzlich haben Quantencomputer das Potenz...,Grundsätzlich haben Quantencomputer das Potenz...
4,Amazon Braket is a fully managed quantum compu...,Amazon Braket is a fully managed quantum compu...
5,"Quantencomputer versprechen, bestimmte Klassen...","Quantencomputer versprechen, bestimmte Klassen..."
6,Die ETH Zürich und das Paul Scherrer Institut ...,Die ETH Zürich und das Paul Scherrer Institut ...
7,In seiner mehr als 100-jährigen Geschichte hat...,In seiner mehr als 100-jährigen Geschichte hat...
8,Der rheinland-pfälzische Pharmakonzern Boehrin...,Der rheinland-pfälzische Pharmakonzern Boehrin...
9,Intel stellt Qubits für den Quantencomputer de...,Intel stellt Qubits für den Quantencomputer de...


In [12]:
# NLTK stop-word lists for the two languages handled in this notebook.
stopwords_de = stopwords.words('german')
stopwords_en = stopwords.words('english')

# Combined list, lower-cased so lookups can be case-insensitive.
stopwords_full = [word.lower() for word in stopwords_de + stopwords_en]

# Final filter set: stop words plus every single punctuation character.
stop_all = set(stopwords_full).union(string.punctuation)

In [55]:
def get_modified_vectors(vec_data, vector_dim=512):
    """Stack an iterable of vectors into one 2-D array.

    Parameters
    ----------
    vec_data : iterable
        Vectors (or arrays of vectors) whose total element count is a
        multiple of ``vector_dim``.
    vector_dim : int, optional
        Number of columns of the result. Defaults to 512, the value that was
        previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(-1, vector_dim)``.

    Raises
    ------
    ValueError
        Raised by NumPy when the data cannot be reshaped to ``vector_dim``
        columns.
    """
    # list() materialises any iterable (list, array, Series) exactly like the
    # former element-by-element copy loop did.
    return np.array(list(vec_data)).reshape(-1, vector_dim)

def get_pool_vec(doc_vec_list, pool):
    """Pool a collection of vectors into a single vector.

    Parameters
    ----------
    doc_vec_list : iterable
        Vectors to pool; they are first stacked with ``get_modified_vectors``.
    pool : str
        ``'mean'`` for a NaN-ignoring column-wise mean, ``'max'`` for a
        NaN-ignoring column-wise maximum.

    Returns
    -------
    numpy.ndarray
        1-D pooled vector (one value per column of the stacked matrix).

    Raises
    ------
    ValueError
        If ``pool`` is neither ``'mean'`` nor ``'max'``. Previously this case
        fell through and silently returned ``None``.
    """
    doc_matrix = get_modified_vectors(doc_vec_list)

    if pool == 'mean':
        return np.nanmean(doc_matrix, axis=0)
    if pool == 'max':
        return np.nanmax(doc_matrix, axis=0)

    raise ValueError(f"Unsupported pool {pool!r}; expected 'mean' or 'max'")

def get_document_vec(text):
    """Embed ``text`` with the global ``tf_model``.

    Takes the first row of the model's ``'outputs'`` entry and returns it as
    a 2-D array with a single row.
    """
    model_outputs = tf_model(text)['outputs']
    first_row = model_outputs.numpy()[0]
    return first_row.reshape(1, -1)

def get_representation_vector(document, title, chunk_size=500, single_pass_limit=550):
    """Build one representation vector for an article from its title and body.

    The title is embedded once. The body is embedded in one call when it has
    fewer than ``single_pass_limit`` whitespace-separated tokens; otherwise it
    is split into consecutive chunks of ``chunk_size`` tokens and every chunk
    is embedded separately. All vectors are then mean-pooled.

    The chunk count used to be ``int(doc_len / 500)``, which rounds down and
    therefore dropped the trailing partial chunk (a 900-token body was
    represented by its first 500 tokens only). Chunks now cover every token.

    Parameters
    ----------
    document : str
        Article body.
    title : str
        Article title.
    chunk_size : int, optional
        Tokens per chunk for long documents (must be positive). Default 500.
    single_pass_limit : int, optional
        Bodies with fewer tokens than this are embedded in one call.
        Default 550.

    Returns
    -------
    numpy.ndarray
        Mean-pooled vector as returned by ``get_pool_vec(..., pool='mean')``.
    """
    # The title always contributes exactly one vector to the pool.
    doc_vecs = [get_document_vec(title)]

    document_tokens = document.split()
    doc_len = len(document_tokens)

    if doc_len < single_pass_limit:
        doc_vecs.append(get_document_vec(document))
    else:
        # Walk the token list in chunk_size strides. Slicing past the end is
        # safe, so the final (possibly shorter) chunk is included as well.
        for start in range(0, doc_len, chunk_size):
            doc_temp = ' '.join(document_tokens[start:start + chunk_size])
            doc_vecs.append(get_document_vec(doc_temp))

    # get_pool_vec stacks the vectors itself, so no extra reshaping is needed.
    return get_pool_vec(doc_vecs, pool='mean')

def get_shorter_text(phrase_1, phrase_2):
    """Return whichever phrase is strictly shorter; on a tie return ``phrase_2``."""
    return phrase_1 if len(phrase_1) < len(phrase_2) else phrase_2
    
def remove_stopwords(noun_chunks):
    """Return the chunks whose lower-cased form is not in the global ``stop_all`` set.

    Input order is preserved and the original casing of kept chunks is
    left untouched.
    """
    return [chunk for chunk in noun_chunks if chunk.lower() not in stop_all]

def get_filtered_nc(noun_chunks):
    """De-duplicate noun chunks, drop stop words and prune near-duplicates.

    Every pair of remaining chunks is compared with ``fuzz.ratio``; when the
    score is above 80 the shorter chunk of the pair (``get_shorter_text``) is
    discarded.

    De-duplication used to go through ``set``, whose iteration order for
    strings depends on hash randomisation. That made both the tie-break for
    equal-length pairs and the order of the returned list vary between kernel
    runs. First-seen order is now preserved, so results are reproducible.

    Parameters
    ----------
    noun_chunks : iterable of str
        Candidate phrases (must be hashable).

    Returns
    -------
    list of str
        Surviving phrases in first-seen order.
    """
    # dict.fromkeys removes duplicates while keeping insertion order.
    unique_chunks = remove_stopwords(list(dict.fromkeys(noun_chunks)))

    remove_phrases = set()
    for idx_1, phrase_1 in enumerate(unique_chunks):
        # Compare each phrase only with the ones after it (every pair once).
        for phrase_2 in unique_chunks[idx_1 + 1:]:
            if fuzz.ratio(phrase_1, phrase_2) > 80:
                remove_phrases.add(get_shorter_text(phrase_1, phrase_2))

    return [phrase for phrase in unique_chunks if phrase not in remove_phrases]

def get_sent_transformers_keywords(repr_vec, noun_chunks, max_keyword_cnt = 30):
    """Rank noun chunks by cosine similarity to a document vector.

    The chunks are filtered with ``get_filtered_nc``, each one is embedded
    with the global ``tf_model``, and the ``max_keyword_cnt`` chunks most
    similar to ``repr_vec`` are kept. As a side effect the msgpack-encoded
    embeddings of the kept chunks are written to the global Redis client
    ``rdb`` (keyed by the chunk text) in a single ``mset`` call.

    NOTE: ``rdb`` must already exist when this function is called; in this
    notebook it is created in a later cell.

    Parameters
    ----------
    repr_vec : array-like
        1-D document representation vector.
    noun_chunks : iterable of str
        Candidate phrases.
    max_keyword_cnt : int, optional
        Maximum number of keywords to return. Default 30.

    Returns
    -------
    dict
        ``{phrase: similarity}`` for the selected phrases, inserted in
        ascending order of similarity. Empty when there are no candidates or
        ``max_keyword_cnt`` is not positive.
    """
    noun_chunks = get_filtered_nc(noun_chunks)

    # Bail out early in two cases:
    #  * no candidates -- there is nothing to score or to store;
    #  * non-positive count -- with 0 the slice ``[-0:]`` below would select
    #    *every* candidate instead of none.
    if not noun_chunks or max_keyword_cnt <= 0:
        return dict()

    candidate_embeddings = [tf_model(nc)['outputs'].numpy()[0] for nc in noun_chunks]

    # Despite the name these are similarities (higher means closer).
    kw_distances = cosine_similarity([repr_vec], candidate_embeddings)

    data_insert_dict = dict()
    keywords_dict = dict()
    # argsort is ascending, so the last ``max_keyword_cnt`` indices are the
    # most similar candidates.
    for index in kw_distances.argsort()[0][-max_keyword_cnt:]:
        phrase = noun_chunks[index]
        data_insert_dict[phrase] = m.packb(candidate_embeddings[index])
        keywords_dict[phrase] = kw_distances[0][index]

    rdb.mset(data_insert_dict)

    return keywords_dict

In [15]:
# One pooled representation vector per article (title + cleaned body).
final_df['doc_repr_vec'] = final_df.apply(
    lambda row: get_representation_vector(row['text_clean'], row['title']),
    axis=1,
)

In [56]:
# Keyword extraction per article. This also writes keyword embeddings to
# Redis, so the `rdb` client must have been created before running this cell.
final_df['keywords'] = final_df.apply(
    lambda row: get_sent_transformers_keywords(
        row['doc_repr_vec'],
        row['text_tokens'],
        max_keyword_cnt = 30,
    ),
    axis=1,
)

In [62]:
# Fetch the stored embedding for the keyword 'QuNet'. rdb.get returns None
# when the key does not exist, and m.unpackb cannot decode None, so only
# unpack when something was actually found (the cell shows nothing otherwise).
packed_qunet_vec = rdb.get('QuNet')
m.unpackb(packed_qunet_vec) if packed_qunet_vec is not None else None

ConnectionError: Error 10061 connecting to host.docker.internal:6379. Es konnte keine Verbindung hergestellt werden, da der Zielcomputer die Verbindung verweigerte.

In [37]:
# Persist every document vector in Redis under its article id, echoing each
# id as it is written.
for doc_id, doc_vec in zip(final_df['id'], final_df['doc_repr_vec']):

    print(doc_id)
    rdb.set(doc_id, m.packb(doc_vec))

210705_news_53540
210705_news_509066
210705_news_339270
210705_news_285448
210705_news_40840
210705_news_269414
210705_news_488902
210705_news_304057
210705_news_387878
210705_news_158828
210705_news_91745
210705_news_376449
210705_news_280755
210705_news_274384
210705_news_494243
210705_news_357644
210705_news_509162
210705_news_464796
210705_news_449895
210705_news_392072
210705_news_160641
210705_news_471982
210705_news_289800
210705_news_139039


In [59]:
# Redis client used by the keyword and vector storage cells. Those cells
# need this one to be run first.
# Host and port can be overridden through the REDIS_HOST / REDIS_PORT
# environment variables; the defaults are the values previously hard-coded.
# Credentials must never be written into the notebook, not even commented
# out -- pass them through the environment if authentication is required.
rdb = redis.StrictRedis(
    host=os.environ.get('REDIS_HOST', 'host.docker.internal'),
    port=int(os.environ.get('REDIS_PORT', '6379')),
)

# Smoke test: write a throw-away key to confirm the server is reachable.
rdb.set('France', 'Paris')

True

In [60]:
# Read the smoke-test key back; the value is returned as bytes.
rdb.get('France')

b'Paris'