In [288]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict
def get_sports(random_state=None):
    """Load every sports-article CSV, merge them and return the rows shuffled.

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` keeps the unseeded shuffle; pass an int to get
            the same row order on every run.

    Returns:
        A DataFrame holding the rows of all article files in shuffled order,
        re-indexed from 0.
    """
    article_files = [
        'articles/sports_articles.csv',
        'articles/sports_articles_tv2.csv',
        'articles/sports_articles_2019.csv',
        'articles/sports_articles_2020.csv',
        'articles/sports_articles_2022.csv',
    ]
    frames = [pd.read_csv(path, encoding="ISO-8859-1") for path in article_files]
    df = pd.concat(frames)
    # frac=1 samples every row, i.e. a full shuffle; the old index is dropped.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [289]:
def vocab_2_pdset(columns, df):
    """Collect the distinct cell values found in selected columns of ``df``.

    Args:
        columns: Positional column indices, as accepted by ``DataFrame.iloc``.
        df: DataFrame to read the values from.

    Returns:
        A set containing every value of the selected columns.
    """
    selected = df.iloc[:, columns]
    return set(selected.values.ravel())

def vocab_2_dict(sets):
    """Merge vocabulary sets into one ``OrderedDict`` keyed by word.

    Args:
        sets: Iterable of sets (any number of them) holding vocabulary entries.

    Returns:
        An ``OrderedDict`` whose keys are the union of all entries, in sorted
        order, each mapped to ``None``.
    """
    word_set = set().union(*sets)
    # Sorting on str(value) gives a deterministic order even if the vocabulary
    # mixes strings with non-string entries, which plain sorting would reject.
    return OrderedDict.fromkeys(sorted(word_set, key=str))

def get_nationalities_list():
    """Read ``nat2.csv`` and return all of its non-empty cells as a flat list.

    The file is read without a header row; missing cells are turned into
    empty strings and then dropped from the result.
    """
    table = pd.read_csv('nat2.csv', encoding="ISO-8859-1", header=None)
    cells = table.fillna('').iloc[:, :].values.ravel().tolist()
    return list(filter(lambda cell: cell != '', cells))


def get_vocab_dict():
    """Build the word-lookup dictionary from the four vocabulary files.

    Each file is read as a header-less table; the word columns of every table
    are collected into a set, the sets are merged with ``vocab_2_dict`` and
    all string keys are lower-cased.

    Returns:
        A plain dict mapping each (lower-cased) vocabulary entry to the value
        produced by ``vocab_2_dict``.
    """
    def load(path):
        return pd.read_table(path, header=None)

    ods_table = load('ods_fullforms_2020-08-26.csv')
    ddo_table = load('ddo_fullforms_2020-08-26.csv')
    cor_table = load('cor1.02.tsv')
    lingo_table = load('sport_lingo.csv')

    # Column positions that are read from each table.
    word_sets = [
        vocab_2_pdset([1, 3], cor_table),
        vocab_2_pdset([0, 1], ods_table),
        vocab_2_pdset([0, 1], ddo_table),
        vocab_2_pdset([0], lingo_table),
    ]
    merged = vocab_2_dict(word_sets)

    # Non-string keys are kept unchanged; strings are lower-cased.
    lowered = {}
    for key, value in merged.items():
        lowered[key.lower() if isinstance(key, str) else key] = value
    return lowered

# Notebook-wide state used by the cells below:
#   ordered_dict  - vocabulary lookup with lower-cased keys (a plain dict,
#                   despite the name)
#   df_sport      - all articles, shuffled
#   nationalities - flat list of entries read from nat2.csv
ordered_dict = get_vocab_dict()
df_sport = get_sports()
nationalities = get_nationalities_list()



In [290]:

def combine_articles_to_csv():
    """Write the merged article table, without its ``Link`` column, to CSV.

    Reads the notebook-level ``df_sport`` (left unmodified) and writes to
    ``articles_temp/combined.csv``.
    """
    without_links = df_sport.drop('Link', axis=1)
    without_links.to_csv('articles_temp/combined.csv')

combine_articles_to_csv()


In [291]:
# Boolean mask marking rows that repeat an earlier row in every column;
# printing the masked frame lists them (an empty frame means no duplicates).
duplicate_rows = df_sport.duplicated()
print("Duplicates in data points: ")
print(df_sport[duplicate_rows])

Duplicates in data points: 
Empty DataFrame
Columns: [Category, Headline, SubHeading, Link, isResult, isMaybe]
Index: []


In [292]:
import time
isin_dict = False
def test_lookup_performance():
    word_to_check = "Dansk"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

test_lookup_performance()

# isin_dict


False


In [341]:
import re

def split_specials(word):
    """Split a token into alphanumeric runs and single special characters.

    A run consists of ASCII letters, the Danish letters æ/ø/å (either case)
    and digits; every other non-whitespace character becomes its own item.

    Returns:
        A new list with the pieces in their original order.
    """
    return list(re.findall(r"[A-ZÆØÅa-zæøå0-9]+|\S", word))

def contains_non_alphanumeric(word):
    """Tell whether ``word`` has any character outside a-z, æøå (either case) and 0-9."""
    match = re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word)
    return match is not None

def remove_numeric(words):
    """Return the words that contain no digit character at all."""
    def has_digit(token):
        for char in token:
            if char.isdigit():
                return True
        return False

    return [token for token in words if not has_digit(token)]
    
# Alphanumeric runs (incl. the Danish letters æ/ø/å) or any single
# non-whitespace character.
_SENTENCE_TOKEN_RE = re.compile(r"[A-ZÆØÅa-zæøå0-9]+|\S")


def split_sentences(sentences):
    """Tokenise sentences into a flat list of lower-cased words and symbols.

    Each sentence is split on whitespace; every resulting chunk is further
    split into alphanumeric runs and individual special characters.

    The tokenising pattern is held locally rather than calling
    ``split_specials``: that name is rebound to a TensorFlow-based function
    by a later cell, so this function stays usable whatever the cell
    execution order.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list with all tokens of all sentences, in order, lower-cased.
    """
    words_arr = []
    for sentence in sentences:
        for chunk in sentence.split():
            words_arr.extend(piece.lower() for piece in _SENTENCE_TOKEN_RE.findall(chunk))
    return words_arr


def remove_duplicates(words):
    """Return the distinct words as a list, keeping first-seen order.

    Going through a dict instead of a set keeps the result order stable
    between runs, so everything derived from it is reproducible.
    """
    return list(dict.fromkeys(words))

def remove_nationalities(words, nationalities):
    """Drop every word that starts with one of the nationality terms.

    Args:
        words: Iterable of (already lower-cased) words.
        nationalities: Iterable of nationality strings; compared lower-cased.
            Empty strings are ignored, since an empty prefix would match and
            thereby remove every word.

    Returns:
        A list of the distinct remaining words in first-seen order.
    """
    # Lower-case the prefixes once instead of once per word; str.startswith
    # accepts a tuple and checks all prefixes in a single call.
    prefixes = tuple(
        lowered for lowered in (item.lower() for item in nationalities) if lowered
    )
    kept = (word for word in words if not word.startswith(prefixes))
    return list(dict.fromkeys(kept))

def remove_non_dict_words(words, dict):
    """Partition ``words`` by membership in the lookup ``dict``.

    Args:
        words: Iterable of words to classify.
        dict: Container supporting ``in`` (the vocabulary lookup).

    Returns:
        A tuple ``(found, missing)`` of two lists, each in input order.
    """
    known, unknown = [], []
    for token in words:
        bucket = known if token in dict else unknown
        bucket.append(token)
    return known, unknown

def add_non_alpha_numeric(words, non_alpha):
    """Return a new list holding ``words`` followed by ``non_alpha``.

    Neither argument is modified, so calling this again with the same inputs
    (e.g. when a cell is re-run) always gives the same result.
    """
    return list(words) + list(non_alpha)


In [342]:
# One text per article: the first three columns joined with ' . ' (per the
# column listing printed earlier these are Category, Headline and SubHeading),
# with non-breaking spaces removed.
train_text = df_sport.iloc[:, [0,1,2]].apply(' . '.join, axis=1).replace('\xa0', '', regex=True).to_numpy()

# Tokenise, then narrow the word list step by step.
words_arr = split_sentences(train_text)
words_arr = remove_duplicates(words_arr)
words_arr = remove_nationalities(words_arr, nationalities)
words_arr = remove_numeric(words_arr)

# Words found in the dictionaries form the training vocabulary; the rest are
# collected separately as "sport lingo".
words_train_vocab, words_sport_lingo = remove_non_dict_words(words_arr, ordered_dict)

# Punctuation tokens that are added to the vocabulary.
# NOTE(review): ":" is listed twice — looks unintentional, confirm.
non_alpha = [":" , "/", ":", ",", "'", ".", "?", "-", "!", "(", ")", '"']

words_train_vocab = add_non_alpha_numeric(words_train_vocab, non_alpha)


# # TODO: use the TensorFlow Tokenizer to turn words into tokens
# # TODO: search all lexicons, both with and without hyphen
# # TODO: handle numbers that are not in the dictionaries, e.g. (x-x or x-årig)
# # TODO: lemmatizer: leave out inflections of the same noun, e.g. verdensmester/verdensmesteren
# # TODO: possibly group words that often occur together using nltk BigramFinder, e.g. "vandt over"
# TODO: possibly also remove all names (first names and surnames)

# words_arr is counted after nationality-prefixed and digit-containing words
# were removed.
print("total unique words:", len(words_arr) )
print("total sports lingo words:", len(words_sport_lingo) )
print("total vocab:", len(words_train_vocab))
print("total articles:", len(df_sport) )

total unique words: 5771
total sports lingo words: 1281
total vocab: 4502
total articles: 1110


In [338]:
# Persist both word lists, one word per line. UTF-8 is requested explicitly
# so the Danish characters are written the same way on every platform, and
# the context managers close the files even if a write fails.
with open('words_sport_lingo.txt', 'w', encoding='utf-8') as lingo_file:
    lingo_file.writelines(item + "\n" for item in words_sport_lingo)

with open('words_train_vocab.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(item + "\n" for item in sorted(words_train_vocab))

In [312]:
def format_2_bool(x):
    """Convert a label cell to a Python ``bool``.

    Args:
        x: Either a ``bool`` (returned unchanged) or a string spelling
            "true"/"false" in any letter case, optionally padded with
            whitespace.

    Raises:
        AssertionError: If ``x`` is neither a bool nor a str, or if the
            string is not one of the two accepted spellings.
    """
    if type(x) is bool:
        return x
    assert type(x) is str
    normalized = x.strip().lower()
    assert normalized in ("true", "false")
    return normalized == "true"

# Normalise the 'isResult' column to real booleans.
df_sport_labels = df_sport['isResult'].apply(lambda x: format_2_bool(x))

results_true = df_sport_labels.loc[df_sport_labels== True]
results_false = df_sport_labels.loc[df_sport_labels == False]

# Every row must be exactly one of True / False.
assert(len(results_true) + len(results_false) == len(df_sport_labels))

# Integer labels for training: 1 = True, 0 = False.
labels = df_sport_labels.to_numpy().astype(int)

print("Labels True: " , len(results_true))
print("Labels False: ", len(results_false))


Labels True:  537
Labels False:  573


In [327]:
# Eyeball check: print every text whose label is 0, followed by the label.
assert (len(labels) == len(train_text))
print("Data:")
for t in range(len(train_text)):
    if labels[t] == 0:
        print(train_text[t])
        print(labels[t], "\n")

Data:
SUPERLIGA  . Fodboldfans råber op efter bøder for kritiske bannere:'Hvor går grænsen?'  . Talsperson i FC Københavns fanklub Casper Fischer Raavig og Christian Rothmann, der er formand for Danske Fodbold Fans, er bekymrede over udviklingen for tildeling af bøder 
0 

SPORT  . Fuld fart frem uden bremser:Leon vil vinde VM i speedway  . DR Sporten har mødt Leon Madsen til en snak om forventninger, ni hjernerystelser og verdensmesterskabet i speedway 
0 

CYKLING  . Cykelstjernen Roglic skifter til tysk hold  . Primoz Roglic kommer derfor ikke længere til at være holdkammerat med Jonas Vingegaard 
0 

Fodbold  . Derfor gik spansk stjerne på banen med hørebøffer på . Psykiatrifonden anbefaler personer med autisme at benytte hørebøffer til at afskærme støj 
0 

CHAMPIONS LEAGUE  . Efter frygtelig nyhed har Tranborg fundet roen:'Ved at jeg kommer ud på den anden side'  . Karrierens anden korsbåndsskade koster Mette Tranborg et VM på hjemmebane, men Esbjerg-profilen har prøvet det før o

In [375]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

# TODO: possibly mark which nouns start with a capital letter (proper nouns); possibly do a lookup to determine the part of speech of the first word in the sentence

def to_lower(word):
    """Lower-case ``word`` via ``tf.strings.lower``, treating it as UTF-8."""
    return tf.strings.lower(word, encoding='utf-8')

def split_specials(input_data):
    """Surround every punctuation symbol in ``input_data`` with spaces.

    Works on TensorFlow strings: for each symbol in the notebook-level
    ``non_alpha`` list, every occurrence is replaced by the symbol padded
    with one space on each side.

    NOTE: this definition reuses the name of the regex-based helper defined
    in an earlier cell and replaces it once this cell has run.
    """
    spaced = input_data
    for symbol in non_alpha:
        # The symbol is backslash-escaped so it is matched literally.
        spaced = tf.strings.regex_replace(
            spaced, pattern="\\" + symbol, rewrite=" " + symbol + " "
        )
    return spaced

def replace_digits(word):
    """Replace every run of digits in ``word`` with the placeholder ``xx``."""
    return tf.strings.regex_replace(word, pattern=r'\d+', rewrite=r'xx')

def custom_standardization(input_data):
    """Standardise raw text for the vectorization layer.

    Applies, in this order: lower-casing, space-padding of punctuation
    symbols, and replacement of digit runs.
    """
    return replace_digits(split_specials(to_lower(input_data)))

def vect_layer_2_text(vect_l):
    """Map a tensor of token ids back to their vocabulary words.

    The tensor is converted to NumPy and squeezed, leading/trailing zeros are
    trimmed, and each remaining id is looked up in the notebook-level
    ``vect_vocab`` list.

    Returns:
        A NumPy array of words.
    """
    token_ids = np.trim_zeros(np.squeeze(vect_l.numpy()))
    return np.array([vect_vocab[token_id] for token_id in token_ids])

# Model constants.
# max_features caps the vectorizer's vocabulary size and is also used as the
# embedding input dimension by the model builders; sequence_length is the
# fixed number of token ids produced per text.
max_features = 4600
sequence_length = 100

# Text -> integer-id layer that applies custom_standardization to its input.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
def prepare_vocab(words):
    """Return a copy of ``words`` with the number placeholder ``xx`` appended.

    ``xx`` is the token that stands for all numbers, so it has to be part of
    the allowed vocabulary. The input list is left untouched.
    """
    extended = words.copy()
    extended.append("xx")
    return extended


# Fit the vectorizer's vocabulary on the prepared word list.
# NOTE(review): adapt() is called for its side effect; its return value is
# presumably None, so `text_ds` likely holds nothing useful — confirm.
text_ds = vectorize_layer.adapt(prepare_vocab(words_train_vocab))
# Index -> word list, used to turn token ids back into text.
vect_vocab = vectorize_layer.get_vocabulary()

print("Total vocab/max_features : ",  len(vect_vocab))


Total vocab/max_features :  4504


In [376]:
# Sanity check: show the first 50 texts next to their round trip through the
# vectorizer (words outside the vocabulary come back as '[UNK]').
for t in train_text[0:50]:
    print("Original \n:", t)
    print("vect_2_text: \n", vect_layer_2_text(vectorize_layer([t])))
    print("\n")


Original 
: KORT SPORT  . Brøndby vender skidt stime mod Lyngby  . Efter to nederlag i træk kom Brøndby søndag tilbage på vinderkurs iSuperligaenmed en 3-0-sejr hjemme over Lyngby 
vect_2_text: 
 ['kort' 'sport' '.' 'brøndby' 'vender' 'skidt' 'stime' 'mod' '[UNK]' '.'
 'efter' 'to' 'nederlag' 'i' 'træk' 'kom' 'brøndby' 'søndag' 'tilbage'
 'på' 'vinderkurs' '[UNK]' 'en' 'xx' '-' 'xx' '-' 'sejr' 'hjemme' 'over'
 '[UNK]']


Original 
: SUPERLIGA  . Fodboldfans råber op efter bøder for kritiske bannere:'Hvor går grænsen?'  . Talsperson i FC Københavns fanklub Casper Fischer Raavig og Christian Rothmann, der er formand for Danske Fodbold Fans, er bekymrede over udviklingen for tildeling af bøder 
vect_2_text: 
 ['superliga' '.' 'fodboldfans' 'råber' 'op' 'efter' 'bøder' 'for'
 'kritiske' 'bannere' ':' "'" 'hvor' 'går' 'grænsen' '?' "'" '.'
 'talsperson' 'i' '[UNK]' '[UNK]' 'fanklub' '[UNK]' '[UNK]' '[UNK]' 'og'
 'christian' '[UNK]' ',' 'der' 'er' 'formand' 'for' '[UNK]' 'fodbold'
 'fans' ',

In [306]:
def split_data(data, labels, percentage):
    """Split data and labels into a leading part and a trailing hold-out part.

    Args:
        data: Sliceable sequence of samples.
        labels: Sliceable sequence of labels, aligned with ``data``.
        percentage: Share (0-100) of the samples that goes into the hold-out
            part; the resulting count is truncated to an integer.

    Returns:
        ``(train_data, holdout_data, train_labels, holdout_labels)``.
    """
    total = len(data)
    holdout = int((percentage/100) * total)
    cut = total - holdout
    train_part, val_part = data[:cut], data[cut:]
    return (train_part, val_part, labels[:cut], labels[cut:])


# Vectorize all texts, then hold out the last 8 % of the rows for validation.
train_data, val_data, train_labels, val_labels = split_data(vectorize_layer(train_text), labels, 8)

print("Total data: ", len(train_text))
print("Train data length: ", len(train_data))
print("Validation data length: ", len(val_data))


Total data:  1110
Train data length:  1022
Validation data length:  88


In [301]:
class TransformerBlock(keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Size of the token embeddings; also used as ``key_dim`` of
            the attention layer and as the output size of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward net.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can reproduce the constructor call.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        ``training`` is forwarded to the dropout layers; it defaults to
        ``None`` so the layer can also be called without the argument.
        """
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position embedding can represent.
        vocab_size: Number of rows of the token embedding.
        embed_dim: Embedding size used by both embeddings.
        **kwargs: Standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can reproduce the constructor call.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the position embeddings."""
        # Positions 0..L-1, where L is the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config



In [302]:
from tensorflow.keras import layers
import random as python_random


def get_transformer_model():
    """Build and compile the Transformer-based binary classifier.

    Architecture: integer token ids -> token + position embedding -> one
    Transformer block -> global max pooling -> single sigmoid unit.
    Uses the notebook-level ``sequence_length`` and ``max_features``.

    Returns:
        A compiled ``tf.keras.Model`` (binary cross-entropy, Adam, accuracy).
    """
    embed_dim = 128  # Embedding size for each token
    num_heads = 2  # Number of attention heads
    ff_dim = 128  # Hidden layer size in feed forward network inside transformer

    # Fixed-length sequences of integer vocabulary indices.
    token_ids = tf.keras.Input(shape=(sequence_length,), dtype="int64")

    # Map the indices into the embedding space and run the Transformer block.
    embedded = TokenAndPositionEmbedding(sequence_length, max_features, embed_dim)(token_ids)
    encoded = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)

    # Collapse the sequence axis, then squash to a single probability.
    pooled = layers.GlobalMaxPooling1D()(encoded)
    probability = layers.Dense(1, activation="sigmoid", name="predictions")(pooled)

    model = tf.keras.Model(token_ids, probability)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


In [303]:
from tensorflow.keras import layers
import random as python_random

def get_cnn_model():
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: integer token ids -> embedding -> dropout -> two strided
    Conv1D layers -> global max pooling -> dense hidden layer -> single
    sigmoid unit. Uses the notebook-level ``max_features``.

    Returns:
        A compiled ``tf.keras.Model`` (binary cross-entropy, Adam, accuracy).
    """
    embedding_dim = 96

    # Variable-length sequences of integer vocabulary indices.
    token_ids = tf.keras.Input(shape=(None,), dtype="int64")

    # Map the indices into the embedding space, then apply dropout.
    features = layers.Embedding(max_features, embedding_dim)(token_ids)
    features = layers.Dropout(0.5)(features)

    # Two identical strided convolutions followed by global max pooling.
    for _ in range(2):
        features = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(features)
    features = layers.GlobalMaxPooling1D()(features)

    # A vanilla hidden layer.
    features = layers.Dense(128, activation="relu")(features)

    # Project onto a single output unit and squash it with a sigmoid.
    probability = layers.Dense(1, activation="sigmoid", name="predictions")(features)

    model = tf.keras.Model(token_ids, probability)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model



In [304]:

def prepare_model(name):
    """Build a freshly initialised, compiled model by name.

    Args:
        name: ``"cnn"`` or ``"transformer"``.

    Returns:
        The model returned by the matching builder function.

    Raises:
        ValueError: If ``name`` is not a known model name. Failing here gives
            a clear message instead of returning ``None`` and failing later
            when the caller tries to use the result.
    """
    if name == "cnn":
        return get_cnn_model()
    if name == "transformer":
        return get_transformer_model()
    raise ValueError(
        "Unknown model name %r; expected 'cnn' or 'transformer'" % (name,)
    )
  

def filter_max_accuracy(history, threshold = 0.95):
    """Select the validation accuracies of epochs with high training accuracy.

    Args:
        history: Object whose ``history`` attribute is a dict with the
            per-epoch lists ``"accuracy"`` and ``"val_accuracy"``.
        threshold: An epoch is kept only if its training accuracy is strictly
            greater than this value.

    Returns:
        A NumPy array with the validation accuracy of every kept epoch
        (empty if no epoch qualifies).
    """
    train_acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    selected = [
        val_acc[epoch]
        for epoch, value in enumerate(train_acc)
        if value > threshold
    ]
    return np.array(selected)

models = ["cnn", "transformer"]


def mean_model_accuracy(mode_names, iterations, epochs = 20):
    """Train each named model several times and average its best validation accuracy.

    For every run, the best validation accuracy is taken among the epochs
    selected by ``filter_max_accuracy``. A run in which no epoch is selected
    is reported and left out of the mean instead of aborting the experiment.

    Uses the notebook-level ``train_data``, ``train_labels``, ``val_data``
    and ``val_labels``.

    Args:
        mode_names: Sequence of model names understood by ``prepare_model``.
        iterations: Number of training runs per model.
        epochs: Maximum number of epochs per run (early stopping on
            ``val_loss`` with patience 4 may end a run sooner).

    Returns:
        A list with one dict per model: ``{"name": ..., "results": mean}``.
        The mean is NaN when no run of a model contributed a value.
    """
    callback_3_loss = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=4)

    results = []

    for model_name in mode_names:
        val_accuracies = []

        for _ in range(iterations):
            model = prepare_model(model_name)

            # Fit on the training split, validating on the held-out split.
            history = model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

            qualifying = filter_max_accuracy(history)
            if len(qualifying) == 0:
                # max() of an empty sequence would raise and abort all runs.
                print("No epoch passed the training-accuracy threshold; run skipped")
                continue

            best_val_acc = max(qualifying)
            val_accuracies.append(best_val_acc)
            print(best_val_acc)
            print(val_accuracies)

        if val_accuracies:
            mean_acc = np.mean(np.squeeze(np.array(val_accuracies)))
        else:
            mean_acc = float("nan")
        results.append(dict(name = model_name, results = mean_acc))

    return results


In [305]:
mean_results = mean_model_accuracy(models, 10)
mean_results

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.9090909361839294
[0.9090909361839294]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8977272510528564
[0.9090909361839294, 0.8977272510528564]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8863636255264282
[0.9090909361839294, 0.8977272510528564, 0.8863636255264282]
Epoch 1/20
Epoch 2/20

KeyboardInterrupt: 

In [None]:
def result_format_round(result):
    """Round ``result`` with the built-in ``round`` (no ``ndigits`` given)."""
    return round(result)

def result_format_none(result):
    """Identity formatter: return ``result`` unchanged."""
    return result

def print_model_score(model):
    """Evaluate ``model`` on the notebook-level validation split and print the scores.

    The first two entries returned by ``model.evaluate`` are printed as loss
    and accuracy.
    """
    metrics = model.evaluate(val_data, val_labels, verbose=0)
    loss_value, accuracy_value = metrics[0], metrics[1]
    print("Validation loss:", loss_value)
    print("Validations accuracy:", accuracy_value)

def print_validation_results(predictions, val_data, labels, formatter):
    """Print every validation sample with its label and prediction.

    Args:
        predictions: Indexable of predictions; element ``[i][0]`` is the
            predicted value for sample ``i``.
        val_data: Vectorized validation samples.
        labels: Labels aligned with ``val_data``.
        formatter: Callable used to format label and prediction for display.

    A sample counts as correct when label and prediction agree after
    ``result_format_round``; the number of correct samples is printed last.
    """
    print("Number of predictions", len(predictions))
    n_correct = 0
    for idx in range(len(val_data)):
        print("VALIDATION SAMPLE: \n" ,vect_layer_2_text(val_data[idx]))
        label = labels[idx]
        print("LABEL --:" , formatter(label), " ---- float: ", label)
        predicted = predictions[idx][0]
        print("PREDICTION --:" , formatter(predicted), " ---- float: ", predicted)
        print("\n")
        # Correctness always uses rounding, independent of `formatter`.
        if result_format_round(label) == result_format_round(predicted):
            n_correct += 1

    print("Number correct: ", n_correct)

In [None]:

epochs= 7
transformer_model = get_transformer_model()

# Fit the model on the training split, validating on the held-out split.
transformer_history = transformer_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:

epochs= 5
cnn_model = get_cnn_model()

# Fit the CNN on the training split, validating on the held-out split.
# The history gets its own name so that `transformer_history` keeps the
# history of the Transformer run.
cnn_history = cnn_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def print_results(model):
    """Print validation loss/accuracy and a per-sample report for ``model``.

    Uses the notebook-level ``val_data`` and ``val_labels``. Also changes
    NumPy's global print options (5 decimals, no scientific notation).
    """
    np.set_printoptions(precision = 5, suppress = True)
    predictions = model.predict(val_data)
    print_model_score(model)
    print("\n")
    print_validation_results(predictions, val_data, val_labels, result_format_round)
  

In [None]:
print("--- TRANSFORMER ---")
print_results(transformer_model)

--- TRANSFORMER ---
Validation loss: 0.27667349576950073
Validations accuracy: 0.8863636255264282


Number of predictions 88
VALIDATION SAMPLE: 
 ['cykling' '.' 'tidligere' 'verdensmester' 'savner' 'afklaring' 'om'
 '[UNK]' 'med' '[UNK]' 'hold' '.' '[UNK]' '[UNK]' 'er' 'ikke' 'synderligt'
 'begejstret' 'over' ',' 'at' 'hans' 'hold' 'potentielt' 'skal' 'slås'
 'sammen' 'med' '[UNK]' '[UNK]' 'hold']
LABEL --: 0  ---- float:  0
PREDICTION --: 0  ---- float:  0.0018245322


VALIDATION SAMPLE: 
 ['[UNK]' 'X' '.' 'guldfavoritten' 'skuffede' 'og' 'følte' 'sig' 'som' 'en'
 'dårlig' 'joke' ':' 'på' 'ol' "'" 's' 'sidste' 'dag' 'fik' 'hun'
 'endelig' 'et' '[UNK]' '.' 'heller' 'ikke' 'fjerde' 'gang' 'blev'
 'lykkens' 'gang' 'for' 'den' 'alpine' 'skiløber' '[UNK]' '[UNK]']
LABEL --: 1  ---- float:  1
PREDICTION --: 1  ---- float:  0.7941697


VALIDATION SAMPLE: 
 ['badminton' '.' '-' 'det' 'er' 'noget' 'af' 'det' 'vildeste' ',' 'der'
 'er' 'set' 'i' 'løbet' 'af' 'turneringen' '.' '[UNK]' '[UNK]' 'b

In [None]:

print("--- CNN ---")
print_results(cnn_model)

--- CNN ---
Validation loss: 0.7952285408973694
Validations accuracy: 0.7840909361839294


Number of predictions 88
VALIDATION SAMPLE: 
 ['cykling' '.' 'tidligere' 'verdensmester' 'savner' 'afklaring' 'om'
 '[UNK]' 'med' '[UNK]' 'hold' '.' '[UNK]' '[UNK]' 'er' 'ikke' 'synderligt'
 'begejstret' 'over' ',' 'at' 'hans' 'hold' 'potentielt' 'skal' 'slås'
 'sammen' 'med' '[UNK]' '[UNK]' 'hold']
LABEL --: 0  ---- float:  0
PREDICTION --: 0  ---- float:  0.00011350681


VALIDATION SAMPLE: 
 ['[UNK]' 'X' '.' 'guldfavoritten' 'skuffede' 'og' 'følte' 'sig' 'som' 'en'
 'dårlig' 'joke' ':' 'på' 'ol' "'" 's' 'sidste' 'dag' 'fik' 'hun'
 'endelig' 'et' '[UNK]' '.' 'heller' 'ikke' 'fjerde' 'gang' 'blev'
 'lykkens' 'gang' 'for' 'den' 'alpine' 'skiløber' '[UNK]' '[UNK]']
LABEL --: 1  ---- float:  1
PREDICTION --: 0  ---- float:  0.06761078


VALIDATION SAMPLE: 
 ['badminton' '.' '-' 'det' 'er' 'noget' 'af' 'det' 'vildeste' ',' 'der'
 'er' 'set' 'i' 'løbet' 'af' 'turneringen' '.' '[UNK]' '[UNK]' 'bidrog'


In [None]:
import os

# Pull the first weight array out of the model's second layer.
# NOTE(review): layers[1] is presumably the TokenAndPositionEmbedding layer
# and its first weight array the token-embedding matrix (the printed shape
# matches max_features x embed_dim) — confirm.

ll = transformer_model.layers[1]
ll_weights = ll.get_weights()[0]

print(ll_weights.shape)
ll_weights


(4600, 128)


array([[ 0.0205 , -0.00607,  0.02066, ..., -0.00527,  0.01789,  0.02542],
       [-0.05354,  0.03176,  0.01388, ...,  0.0503 ,  0.02445, -0.02263],
       [-0.01538, -0.0594 ,  0.03463, ...,  0.03583, -0.0409 , -0.0322 ],
       ...,
       [ 0.03061,  0.03873, -0.01556, ..., -0.04649, -0.02569, -0.01839],
       [ 0.01878, -0.03674, -0.01462, ..., -0.03202,  0.01285, -0.04135],
       [-0.005  , -0.03103,  0.01423, ...,  0.04772,  0.01165,  0.00259]],
      dtype=float32)

In [None]:
##import I/O module in python
import io

# Export the embeddings as two aligned TSV files: one embedding vector per
# line in vectors.tsv and the matching word per line in meta.tsv. The
# context manager closes both streams even if a write fails.
with io.open('vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open('meta.tsv', 'w', encoding='utf-8') as meta:
    # Starts at 1, so vocabulary index 0 is left out.
    # NOTE(review): index 0 is presumably the padding token — confirm.
    for index in range(1, len(vect_vocab)):
        word = vect_vocab[index]
        embeddings = ll_weights[index]
        meta.write(word + "\n")
        vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# from nltk import collocations
# bigram_measures = collocations.BigramAssocMeasures()
# finder = collocations.BigramCollocationFinder.from_words(["New", "York", "is", "big", "New", "York", "is", "dirty"])
# finder.ngram_fd.items()



In [None]:
# import lemmy
# # Create an instance of the standalone lemmatizer.
# lemmatizer = lemmy.load("da")

# # Find lemma for the word 'akvariernes'. First argument is an empty POS tag.
# lemmatizer.lemmatize("NOUN", "storsejr")



In [None]:
# import nltk as nltk
# # from string import punctuation
# # from nltk.corpus import stopwords
# # nltk.download('stopwords')

# # da_stopwords = stopwords.words("danish")


In [None]:
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = transformer_model(indices)

# End-to-end model: raw text in, probability out. It is compiled with the
# same binary cross-entropy loss as the wrapped classifier, so evaluating or
# fitting it uses a loss Keras can resolve.
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [None]:


# Hand-written examples fed as raw text through the end-to-end model.
# First group: texts intended to be classified as results (label 1).
print("\nResults:")

print(end_to_end_model.predict(
    [
      "Fodbold Fjerritslev vinder over Vordingborg. Træner kommenterer på historisk kamp",
      "Skisport Buller overrasker alle og gør det umulige. Dermed endnu en medalje til Norge",
     ]))

# Second group: similarly worded texts intended to be classified as
# non-results (label 0).
print("\n NON-Results:") 
print(end_to_end_model.predict(
    [
      "Fodbold Træner for Fjerritslev ser frem til sejr over Vordingborg. 'Det bliver en historisk kamp'",
      "Fodbold Flere forventer at Fjerritslev vinder over Vordingborg. 'Fjerritslev vinder!', udtaler hjemmeholdets træner",
      "Skisport Sverige drømmer om flere medaljer og sejre til næste års OL. Træner forventer flere rekorder",
      "Skisport Buller vil overraske alle og gøre det umulige. Vil have medalje til Norge",
     ]))


Results:
[[0.98809]
 [0.96642]]

 NON-Results:
[[0.50655]
 [0.43807]
 [0.00503]
 [0.50664]]
