In [499]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import spacy

import fasttext
import fasttext.util

from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
import tensorflow_text

import warnings; warnings.simplefilter('ignore')

In [498]:
# Show the installed TensorFlow version.
print(tf.__version__)

2.8.0


In [504]:
# Options for reading the SavedModel. The object only has an effect when it is
# handed to load_model() through `options=`, so it is bound to a name here.
load_options = tf.saved_model.LoadOptions(
    allow_partial_checkpoint=False,
    experimental_io_device='/job:localhost',
    experimental_skip_checkpoint=False
)

# Load the model stored in ./models/USE_model/ below the current working directory.
tf_model = tf.keras.models.load_model(
    os.getcwd() + '/models/USE_model/',
    options=load_options
)



In [709]:
# Topic labels that documents are compared against.
topic_list = [
    'Wirtschaft und Finanzen',
    'Bildung',
    'Politik',
    'Tierreich',
    'Rechtswissenschaften und Rechtsprechung',
    'Gesundheit',
    'Automobilbranche',
    'Unterhaltung',
    'Sport',
    'Werbung',
    'Innovation',
    'Innovation und Technologie',
    'Technologie',
    'Smartphone',
    'iPhone',
    'Bitcoin',
]

# Embed every label once with tf_model and store the first output row,
# reshaped to a single-row 2-D array, keyed by the label text.
topic_embeddings_dict = {}
for topic in topic_list:
    topic_vector = tf_model(topic)['outputs'].numpy()[0]
    topic_embeddings_dict[topic] = topic_vector.reshape(1, -1)

In [530]:
def get_use_topic_score(doc_vec, topic):
    """Return the cosine similarity between a document vector and a topic embedding.

    Args:
        doc_vec: 2-D array holding the document embedding as its first row.
        topic: key into the module-level ``topic_embeddings_dict``.

    Returns:
        The ``[0][0]`` entry of the similarity matrix computed by
        ``cosine_similarity``.

    Raises:
        KeyError: if ``topic`` has no entry in ``topic_embeddings_dict``.
    """
    topic_vec = topic_embeddings_dict[topic]
    similarity_matrix = cosine_similarity(doc_vec, topic_vec)
    return similarity_matrix[0][0]

In [519]:
# NOTE(review): `embeddings` is not defined in any cell visible in this notebook;
# presumably it is the result of an earlier tf_model(...) call whose cell was
# removed — confirm, otherwise this fails on a fresh kernel.
vec_list = embeddings['outputs'].numpy()

In [521]:
# Cosine similarity between the first two vectors in vec_list (each reshaped to one row).
cosine_similarity(vec_list[0].reshape(1, -1), vec_list[1].reshape(1, -1))[0][0]

0.83537215

In [468]:
# Load the fastText model file ./models/cc.de.300.bin (path relative to the working directory).
model_path = os.getcwd() + '/models/cc.de.300.bin'
fasttext_model = fasttext.load_model(model_path)



In [484]:
# Compare the base verb (w1) with three prefixed variants using fastText word vectors.
w1, w2, w3, w4 = 'bauen', 'einbauen', 'aufbauen', 'verbauen'

# One single-row 2-D vector per word, as cosine_similarity expects 2-D input.
vec_1, vec_2, vec_3, vec_4 = (
    fasttext_model.get_word_vector(word).reshape(1, -1)
    for word in (w1, w2, w3, w4)
)

for other_word, other_vec in ((w2, vec_2), (w3, vec_3), (w4, vec_4)):
    print(f'{w1} -- {other_word} {cosine_similarity(vec_1, other_vec)[0][0]}')

bauen -- einbauen 0.632020115852356
bauen -- aufbauen 0.7113096714019775
bauen -- verbauen 0.6564719676971436


In [154]:
def dateien_lesen(pfad):
    """Read every entry of directory ``pfad`` as UTF-8 text.

    Args:
        pfad: path of the directory whose entries are read.

    Returns:
        A list with the full text of each entry, in the order returned by
        ``os.listdir`` (which is not guaranteed to be sorted).

    Raises:
        OSError: if the directory or one of its entries cannot be opened.
        UnicodeDecodeError: if an entry is not valid UTF-8.
    """
    text_list = []
    for file_name in os.listdir(pfad):
        # The context manager closes the handle even when read() raises.
        with open(os.path.join(pfad, file_name), 'r', encoding="utf8") as f:
            text_list.append(f.read())
    return text_list

In [156]:
# Corpus directories, all resolved against the current working directory.
pfad_inno = os.getcwd() + "/Innovation"
pfad_neg = os.getcwd() + "/negative"

pfad_test = os.getcwd() + "/test"

In [4]:
# Read the raw text of every file in the three corpus directories.
text_list_inno = dateien_lesen(pfad_inno)
text_list_neg = dateien_lesen(pfad_neg)

text_list_test = dateien_lesen(pfad_test)

In [5]:
# Distinct whitespace-separated tokens of the Innovation documents whose first
# character is not uppercase.
inno_dict_as_set = {
    token
    for docm in text_list_inno
    for token in docm.split()
    if not token[0].isupper()
}

In [6]:
# Distinct whitespace-separated tokens of the negative documents whose first
# character is not uppercase.
neg_dict_as_set = {
    token
    for docm in text_list_neg
    for token in docm.split()
    if not token[0].isupper()
}

In [10]:
# spaCy pipeline "de_core_news_sm"; get_verb_adj_data reads pos_ and lemma_ from its tokens.
nlp = spacy.load("de_core_news_sm")

In [173]:
def get_verb_adj_data(doc_list):
    """Count verb and adjective lemmas for each document.

    Each document is run through the module-level ``nlp`` pipeline; tokens whose
    ``pos_`` is "VERB" or "ADJ" contribute their ``lemma_`` to a Counter.

    Args:
        doc_list: iterable of document strings.

    Returns:
        A list with one ``(Counter of lemmas, whitespace token count)`` tuple
        per document, in input order.
    """
    wanted_pos = ("VERB", "ADJ")
    return [
        (
            Counter(tok.lemma_ for tok in nlp(text) if tok.pos_ in wanted_pos),
            len(text.split()),
        )
        for text in doc_list
    ]

def get_verb_adj_score(doc_list, va_list, reference_list, label):
    """Score documents by how often reference terms occur among their lemmas.

    Args:
        doc_list: document texts, index-aligned with ``va_list``.
        va_list: one ``(lemma counts mapping, token count)`` pair per document,
            as produced by ``get_verb_adj_data``.
        reference_list: terms whose counts are summed. A term listed more than
            once is counted once per listing.
        label: value stored unchanged in every result tuple.

    Returns:
        A list of ``(document_text, label, points, points / token_count)``
        tuples. The last entry is 0.0 for a document with a token count of 0,
        so an empty document no longer raises ZeroDivisionError.
    """
    score_sets = []

    for i, (term_counts, token_total) in enumerate(va_list):
        points = sum(
            term_counts[ref_term]
            for ref_term in reference_list
            if ref_term in term_counts
        )
        # Empty documents have nothing to normalise by; score them as 0.0.
        weighted = points / token_total if token_total else 0.0
        score_sets.append((doc_list[i], label, points, weighted))

    return score_sets

In [11]:
# Verb/adjective lemma counts for the Innovation (1), negative (2) and test (3) documents.
va_list1 = get_verb_adj_data(text_list_inno)
va_list2 = get_verb_adj_data(text_list_neg)
va_list3 = get_verb_adj_data(text_list_test)

In [526]:
# va_list1

In [334]:
def is_doc_unrelated(score, threshold=0.4):
    """Binarise a score: 1 if it is strictly greater than ``threshold``, else 0."""
    return 1 if score > threshold else 0

def get_f1score_threshold(preds, gt, threshold):
    """Binarise scores at ``threshold`` and evaluate them against the labels.

    Args:
        preds: iterable of scores.
        gt: ground-truth labels aligned with ``preds``.
        threshold: cut-off forwarded to ``is_doc_unrelated``.

    Returns:
        ``(total_correct, f1)`` where ``total_correct`` is the number of
        positions at which both the label and the prediction equal 1, and
        ``f1`` is entry ``[2][1]`` of ``precision_recall_fscore_support``.
    """
    relevancy_list = [is_doc_unrelated(score, threshold) for score in preds]

    total_correct = sum(
        1 for pred, cor in zip(relevancy_list, gt) if cor == 1 and pred == 1
    )

    f1_per_class = precision_recall_fscore_support(gt, relevancy_list)[2]
    return total_correct, f1_per_class[1]

def get_best_f1_score(preds, gt, threshold_vals=None):
    """Grid-search the threshold that maximises entry ``[2][1]`` of
    ``precision_recall_fscore_support``.

    Candidates are used exactly as given. Rounding them to two decimals would
    collapse the default 0.0001-step grid to 0.0, 0.01 and 0.02, and the
    returned threshold would no longer be the one that was evaluated.

    Args:
        preds: iterable of scores.
        gt: ground-truth labels aligned with ``preds``.
        threshold_vals: optional indexable sequence of candidate thresholds;
            defaults to ``np.arange(0.0, 0.02, 0.0001)``.

    Returns:
        ``(best_threshold, best_f1)``; on ties the first candidate wins.

    Raises:
        ValueError: if ``threshold_vals`` is empty.
    """
    if threshold_vals is None:
        threshold_vals = np.arange(0.0, 0.02, 0.0001)

    f1_scores_list = []
    for val in threshold_vals:
        relevancy_list = [is_doc_unrelated(score, threshold=val) for score in preds]
        f1_scores_list.append(precision_recall_fscore_support(gt, relevancy_list)[2][1])

    # argmax returns the first maximum, matching list.index(max(...)).
    best_idx = int(np.argmax(f1_scores_list))
    return threshold_vals[best_idx], f1_scores_list[best_idx]

In [693]:
reference_list = [
    #"installieren",
    #"kompilieren",
    "künstlich",
    "automatisch",
    "automatisieren",
    "hochautomatisiert",
    "konfigurierbar"
    "experimentell",
    #"fahrerlos",
    "intelligent",
    #"smart",
    "entwickeln",
    #"synthetisieren",
    "lösen",
    "energiesparend",
    "energiefressend",
    "mathematisch",
    #"mechanisch",
    #"neuronal",
    "umbauen",
    #"digital",
    "digitalisieren",
    "zukünftig",
    "innovativ",
    "technisch",
    "autonom",
    "detektieren",
    "energetisch",
    "evaluieren",
    "erkennen",
    "experimentieren",
    "verbessern",
    "portabel",
#     "portieren",
    "programmieren",
    "technologisch",
    "softwaredefiniert",
    "softwaregestützt",
    #"spektral",
    #"spezifisch",
    "trainiert",
    "serviceorientiert",
    #"adaptiv"
]

In [457]:
# Second list of reference terms. Each term appears once: get_verb_adj_score
# adds a term's count once per listing, so a repeated entry would be
# double-counted if this list were passed to it directly.
reference_list_2 = [
    'analysieren',
    'erforschen',
    'identifizieren',
    'beobachten',
    'aufspüren',
    'hervorheben',
    'betonen',
    'bewerten',
    'beeinflussen',
    'feststellen',
    'beweisen',
    'rechtfertigen',
    'nachprüfen',
    'fortschreiten',
    'entwickeln',
    'weiterentwickeln',
    'übertreffen',
    'steigern',
    'maximieren',
    'transformieren',
    'umwandeln',
    'approximieren',
    'schätzen',
    'begründen',
    'illustrieren',
    'vorhersagen',
    'validieren',
    'reproduzieren',
    'erkennen',
    'klassifizieren',
    'reklamieren',
    'auflösen',
    'formulieren',
    'forschen',
    'untersuchen',
    'integrieren',
    'interpolieren',
    'empfehlen',
    'anzeigen',
    'interpretieren',
    'aufzeigen',
    'ableiten',
    'erklären',
    'gestalten',
    'erfinden',
    'entdecken',
    'modernisieren',
    'revolutionieren',
    'entscheidend',
    'bedeutend',
    'aufbauen',
    'ermitteln',
    'erreichen',
    'berichten',
    'optimieren'
]
# reference_list = list(set(reference_list+reference_list_2))

In [694]:
# Score each corpus against reference_list; Innovation documents get label 1,
# negative documents label 0.
innovation_score_sets = get_verb_adj_score(text_list_inno, va_list1, reference_list, 1)
negative_score_sets = get_verb_adj_score(text_list_neg, va_list2, reference_list, 0)

# NOTE(review): every test document is labelled 1 here — confirm that the test
# directory really contains only positive examples.
test_score_sets = get_verb_adj_score(text_list_test, va_list3, reference_list, 1)

In [695]:
# Positive score tuples first, followed by the negative ones.
train_documents = innovation_score_sets + negative_score_sets

In [696]:
# One row per document; column names follow the tuple layout returned by get_verb_adj_score.
doc_df = pd.DataFrame(train_documents, columns=["document_text", "label", "tf_score", "tf_score_weighted"])
# Show two random rows as a sanity check (no seed, so the sample differs between runs).
doc_df.sample(2)

Unnamed: 0,document_text,label,tf_score,tf_score_weighted
381,\n\t\t\t\t\t Microsofts Betriebssystem Windows...,0,2,0.001854
962,Trotz noblem Tagungsort direkt an der Potsdame...,0,1,0.001214


In [461]:
# Same column layout as doc_df, built from the test documents.
doc_df_test = pd.DataFrame(test_score_sets, columns=["document_text", "label", "tf_score", "tf_score_weighted"])

In [697]:
# Split doc_df by label and order each part by weighted term score, highest first.
positive_mask = doc_df['label'] == 1
negative_mask = doc_df['label'] == 0
doc_df_pos = doc_df[positive_mask].sort_values(by=['tf_score_weighted'], ascending=False)
doc_df_neg = doc_df[negative_mask].sort_values(by=['tf_score_weighted'], ascending=False)
# doc_df_test = doc_df_test.sort_values(by=['tf_score_weighted'], ascending=False)

# Write the sorted weighted scores, one value per line, to ./output/.
score_exports = (
    (os.getcwd()+"/output/positive_tf_scores.txt", doc_df_pos),
    (os.getcwd()+"/output/negative_tf_scores.txt", doc_df_neg),
)
for export_path, export_frame in score_exports:
    np.savetxt(export_path, export_frame.tf_score_weighted.values, delimiter=",", fmt='%f')
# np.savetxt(os.getcwd()+"/output/test_tf_scores.txt", doc_df_test.tf_score_weighted.values, delimiter=",", fmt='%f')

In [463]:
# The grid search below is disabled; the threshold is set by hand instead.
# threshold, f1_score = get_best_f1_score(doc_df['tf_score_weighted'].values, doc_df['label'].values)
threshold = 0.0021

In [464]:
# (count of label-1 documents predicted 1, F1 entry [2][1]) on doc_df at the fixed threshold.
get_f1score_threshold(doc_df['tf_score_weighted'].values, doc_df['label'].values, threshold)

(29, 0.058232931726907626)

In [465]:
# Same evaluation on the test documents.
get_f1score_threshold(doc_df_test['tf_score_weighted'].values, doc_df_test['label'].values, threshold)

(18, 0.6206896551724138)

In [466]:
# Verb Score approach
# threshold: 0.002107
# neg: 260 out of 1426
# pos: 24 out of 30

# USE Score approach
# threshold: 0.046012
# neg: 180 out of 1426
# pos: 21 out of 30

# threshold: 0.0051
# f1-score: 0.327

# test 40 docs
# 12 out of 40

In [527]:
# Embed each document text with tf_model and keep the first output row as a single-row 2-D array.
# Runs the model once per row, so this cell is slow on large frames.
doc_df['doc_vec'] = doc_df.apply(lambda x:tf_model(x['document_text'])['outputs'].numpy()[0].reshape(1, -1), axis=1)

In [710]:
# Add one "<topic>_sim" column per topic: lower-cased label with spaces replaced
# by underscores, holding the cosine similarity between doc_vec and the topic embedding.
for topic in topic_list:
    topic_col_name = topic.lower().replace(' ', '_') + '_sim'
    doc_df[topic_col_name] = doc_df.apply(lambda x:get_use_topic_score(x['doc_vec'], topic), axis=1)
    
# Display the resulting column names.
doc_df.columns

Index(['document_text', 'label', 'tf_score', 'tf_score_weighted', 'doc_vec',
       'wirtschaft_und_finanzen_sim', 'bildung_sim', 'politik_sim',
       'tierreich_sim', 'rechtswissenschaften_und_rechtsprechung_sim',
       'gesundheit_sim', 'automobilbranche_sim', 'unterhaltung_sim',
       'sport_sim', 'werbung_sim', 'innovation_sim',
       'innovation_und_technologie_sim', 'technologie_sim', 'smartphone_sim',
       'hybrid_score', 'iphone_sim', 'bitcoin_sim'],
      dtype='object')

In [721]:
def get_hybrid_score(a, b, c, d, l, e, f):
    """Combine several similarity values into one score using hand-tuned rules.

    As called in this notebook the arguments are: ``a`` innovation_sim,
    ``b`` technologie_sim, ``c`` werbung_sim, ``d`` smartphone_sim, ``l`` the
    label (only printed), ``e`` wirtschaft_und_finanzen_sim + bitcoin_sim and
    ``f`` iphone_sim.

    Returns:
        0 when any rejection rule fires (``c >= min(a, b)``, both ``a`` and
        ``b`` negative, ``d > b``, ``e > 0.2`` or ``f > 0.1``). Otherwise ``b``
        if ``c > 0``, else ``a + b - c``.

    Side effects:
        Prints the rounded inputs of every call that is not rejected.
    """
    rejected = (
        c >= min(a, b)
        or (a < 0 and b < 0)
        or d > b
        or e > 0.2
        or f > 0.1
    )
    if rejected:
        return 0

    print(f'l: {l}, a: {round(a, 2)}, b: {round(b, 2)}, c: {round(c, 2)}, d: {round(d, 2)}, e: {round(e, 2)}, f: {round(f, 2)}')

    return b if c > 0 else a + b - c

# Earlier variant (disabled): compared against the maximum over many topic similarities.
# doc_df['hybrid_score'] = doc_df.apply(lambda x:get_hybrid_score(x['innovation_sim'], x['technologie_sim'], max(x['wirtschaft_und_finanzen_sim'], x['bildung_sim'], x['politik_sim'],x['tierreich_sim'], x['rechtswissenschaften_und_rechtsprechung_sim'],x['gesundheit_sim'], x['automobilbranche_sim'], x['unterhaltung_sim'],x['sport_sim'], x['werbung_sim'], x['anzeige_sim'])), axis=1)
# Current variant: argument order is (innovation, technologie, werbung, smartphone, label,
# wirtschaft_und_finanzen + bitcoin, iphone). The label is passed only so get_hybrid_score can print it.
doc_df['hybrid_score'] = doc_df.apply(lambda x:get_hybrid_score(x['innovation_sim'], x['technologie_sim'], x['werbung_sim'], x['smartphone_sim'], x['label'], x['wirtschaft_und_finanzen_sim'] + x['bitcoin_sim'], x['iphone_sim']), axis=1)
# Correlation between label and hybrid score (Series.corr with its default method).
doc_df['label'].corr(doc_df['hybrid_score'])

l: 1, a: 0.03, b: 0.08, c: 0.02, d: 0.04, e: 0.06, f: -0.09
l: 1, a: 0.09, b: 0.05, c: -0.0, d: 0.02, e: -0.03, f: -0.06
l: 1, a: -0.0, b: 0.0, c: -0.04, d: -0.05, e: -0.02, f: -0.06
l: 1, a: 0.08, b: 0.04, c: -0.02, d: 0.01, e: -0.01, f: -0.03
l: 1, a: -0.03, b: 0.07, c: -0.04, d: -0.02, e: 0.14, f: -0.02
l: 1, a: 0.02, b: 0.01, c: 0.0, d: -0.03, e: -0.08, f: -0.1
l: 1, a: 0.05, b: 0.1, c: -0.08, d: -0.04, e: 0.02, f: -0.06
l: 1, a: 0.04, b: 0.09, c: -0.03, d: 0.02, e: 0.04, f: -0.02
l: 1, a: 0.02, b: 0.05, c: -0.03, d: 0.03, e: -0.05, f: -0.02
l: 1, a: 0.08, b: 0.03, c: -0.0, d: -0.02, e: 0.18, f: -0.03
l: 1, a: 0.09, b: 0.02, c: -0.04, d: -0.02, e: -0.03, f: -0.06
l: 1, a: -0.02, b: 0.1, c: -0.05, d: -0.03, e: 0.06, f: -0.04
l: 1, a: 0.03, b: 0.12, c: -0.04, d: 0.03, e: 0.06, f: -0.02
l: 1, a: 0.01, b: 0.02, c: -0.02, d: 0.02, e: 0.01, f: -0.07
l: 1, a: -0.01, b: 0.06, c: -0.1, d: -0.03, e: -0.1, f: -0.06
l: 1, a: 0.13, b: 0.11, c: 0.04, d: 0.07, e: 0.01, f: 0.02
l: 1, a: 0.03, b: 0

0.2823849253285573

In [722]:
# Split doc_df by label and order each part by hybrid score, highest first.
positive_mask = doc_df['label'] == 1
negative_mask = doc_df['label'] == 0
doc_df_pos = doc_df[positive_mask].sort_values(by=['hybrid_score'], ascending=False)
doc_df_neg = doc_df[negative_mask].sort_values(by=['hybrid_score'], ascending=False)
# doc_df_test = doc_df_test.sort_values(by=['tf_score_weighted'], ascending=False)

# Write the sorted hybrid scores, one value per line, to ./output/.
score_exports = (
    (os.getcwd()+"/output/positive_innovation_sim_scores.txt", doc_df_pos),
    (os.getcwd()+"/output/negative_innovation_sim_scores.txt", doc_df_neg),
)
for export_path, export_frame in score_exports:
    np.savetxt(export_path, export_frame.hybrid_score.values, delimiter=",", fmt='%f')
# np.savetxt(os.getcwd()+"/output/test_tf_scores.txt", doc_df_test.tf_score_weighted.values, delimiter=",", fmt='%f')

In [600]:
# Correlation between label and weighted term score, for comparison with the hybrid score.
doc_df['label'].corr(doc_df['tf_score_weighted'])

0.1982997247242723

In [713]:
# Print every negative document whose hybrid score lies strictly between 0.1 and 0.2.
for idx, row in doc_df_neg.iterrows():
    hybrid_value = row['hybrid_score']
    if 0.1 < hybrid_value < 0.2:
        print(row)
        print('\n\n')

document_text                                  Micron hat einen Käufer für das Halbleiterwerk...
label                                                                                          0
tf_score                                                                                       0
tf_score_weighted                                                                            0.0
doc_vec                                        [[0.07143805, -0.0164167, 0.027678376, 0.01744...
wirtschaft_und_finanzen_sim                                                             0.024937
bildung_sim                                                                            -0.032584
politik_sim                                                                            -0.061248
tierreich_sim                                                                          -0.016644
rechtswissenschaften_und_rechtsprechung_sim                                             0.004191
gesundheit_sim                

In [528]:
# Persist doc_df as a pickle under ./notebooks/dataframes/ (relative to the working directory).
doc_df.to_pickle(os.getcwd() + f'/notebooks/dataframes/verb_based_classifier.pkl')

In [700]:
# Reload the pickled frame, replacing the in-memory doc_df.
# NOTE(review): unpickling executes arbitrary code — only load files this notebook wrote itself.
doc_df = pd.read_pickle(os.getcwd() + f'/notebooks/dataframes/verb_based_classifier.pkl')