In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict
def get_sports():
    df_sport_latest = pd.read_csv('articles/sports_articles.csv', encoding = "ISO-8859-1")
    df_sport_latest_tv2 = pd.read_csv('articles/sports_articles_tv2.csv', encoding = "ISO-8859-1")
    df_sport_2019 = pd.read_csv('articles/sports_articles_2019.csv', encoding = "ISO-8859-1")
    df_sport_2020 = pd.read_csv('articles/sports_articles_2020.csv', encoding = "ISO-8859-1")
    df_sport_2022 = pd.read_csv('articles/sports_articles_2022.csv', encoding = "ISO-8859-1")
    df = pd.concat([df_sport_latest, df_sport_latest_tv2, df_sport_2019, df_sport_2020, df_sport_2022])
    df = df.sample(frac=1).reset_index(drop=True)
    return df


In [161]:
def vocab_2_pdset(columns, df):
    df_vocab_select_columns = df.iloc[:, columns]
    vocab_all_values = df_vocab_select_columns.values.ravel()
    return set(vocab_all_values)

def vocab_2_dict(sets):
    assert(len(sets) == 4)
    word_set = sets[0].union(sets[1],sets[2], sets[3])
    df = pd.DataFrame(list(word_set), columns=["Words"])
    df.sort_values(by="Words", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return OrderedDict.fromkeys(word_set)

def get_nationalities_list():
    df_nationalities = pd.read_csv('nat2.csv', encoding = "ISO-8859-1", header=None)
    nationalities = df_nationalities.fillna('').iloc[:,:].values.ravel().tolist()
    return [x for x in nationalities if x!= '']


def get_vocab_dict():
    df_ods_vocab = pd.read_table('ods_fullforms_2020-08-26.csv', header=None)
    df_ddo_vocab = pd.read_table('ddo_fullforms_2020-08-26.csv', header=None)
    df_vocab = pd.read_table('cor1.02.tsv', header=None)
    df_sport_lingo = pd.read_table('sport_lingo.csv', header=None)

    vocab_set = vocab_2_pdset([1,3], df_vocab)
    ods_vocab_set = vocab_2_pdset([0,1], df_ods_vocab)
    ddo_vocab_set = vocab_2_pdset([0,1], df_ddo_vocab)
    sport_lingo_set = vocab_2_pdset([0], df_sport_lingo)

    d = vocab_2_dict([vocab_set, ods_vocab_set, ddo_vocab_set, sport_lingo_set])
    d = {key.lower() if isinstance(key, str) else key: value for key, value in d.items()}


    # del d['dansker']
    # del d['danskerne']
    # del d['danske']
    # del d['dansk']
    # del d['danmark']
    # del d['danmarks']
    return d

ordered_dict = get_vocab_dict()

nationalities = get_nationalities_list()



In [105]:
df_sport = get_sports()

df_sport_combined = df_sport.copy().drop('Link', axis=1)
df_sport_combined.to_csv('articles_temp/combined.csv')


duplicate_rows = df_sport.duplicated()

print(df_sport[duplicate_rows])




Empty DataFrame
Columns: [Category, Headline, SubHeading, Link, isResult, isMaybe]
Index: []


In [155]:
# nationalities = pd.read_csv('nat2.csv', encoding = "ISO-8859-1", header=None)

# c = [0,5]

# nationalities = nationalities.fillna('').iloc[:,:].values.ravel().tolist()
# nationalities = [x for x in nationalities if x!= '']

# for n in nationalities:
#     print(n)



Afghanistan
afghansk
pashto og dari
afghaner
Albanien
albansk
albansk
albaner
Algeriet
algerisk
arabisk og fransk
algerier
Andorra
andorransk
catalansk
andorraner
Angola
angolansk
portugisisk
angolaner
Antigua og Barbuda
engelsk
Argentina
argentinsk
spansk
argentiner
Armenien
armensk
armensk
armenier
Aserbajdsjan
aserbajdsjansk
aserbajdsjansk
aserbajdsjaner
Australien
australsk
engelsk
australier
Bahamas
bahamansk
engelsk
bahamaner
Bahrain
bahrainsk
arabisk
bahrainer
Bangladesh
Indtil 1971: Østpakistan
bangladeshisk
bengali
bangladesher
Barbados
barbadisk
engelsk
barbadier
Belgien
belgisk
flamsk, fransk og tysk
belgier
Belize
belizisk
engelsk
belizer
Benin
beninsk
fransk
beniner
Bermuda
engelsk
Bhutan
bhutanske
dzongkha
bhutaner
Bolivia
boliviansk
spansk og quechua
bolivianer
Bosnien og Hercegovina
bosnisk
bosnisk, kroatisk og serbokroatisk
bosnier
Botswana
botswansk
botswaner
Brasilien
brasiliansk
portugisisk
brasilianer
Brunei
bruneisk
bruneier
Bulgarien
bulgarsk
bulgarer
Burkina Fas

In [19]:
import time
isin_dict = False
def test_lookup_performance():
    word_to_check = "Dansk"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

test_lookup_performance()

# isin_dict


False


In [240]:
import re

def split_specials(word):
    words_new = []
    parts = re.findall(r"[A-ZÆØÅa-zæøå0-9]+|\S", word)
    words_new.extend([x for x in parts])
    return words_new

def contains_non_alphanumeric(word):
    return bool(re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word))
    
def split_sentences(sentences):
    words_arr = []
    for ind, sentence in enumerate(sentences):
        sentence_trimmed = sentence.strip()
        words = sentence_trimmed.split()
        for word in words:
            w = split_specials(word)
            words_arr.extend([x.lower() for x in w])
    return words_arr


def remove_duplicates(words):
    return list(set(words))

def remove_nationalities(words, nationalities):
    words_minus_nationalities = []
    for w in words:
        for n in nationalities:
            if w.startswith(n.lower()) == False:
                words_minus_nationalities.append(w)
            else:
                print(w)
    
    print(len(words))
    return set(words_minus_nationalities)

    
# def filter_nationalities(nationalities_list, dict):
#     to_remove = []
#     for n in nationalities_list:
#         for itemA in dict:
#             if itemA.startswith(n):
#                 to_remove.append(itemA)
#     return to_remove





In [241]:
train_text = df_sport.iloc[:, [0,1,2]].apply(' . '.join, axis=1).replace('\xa0', '', regex=True).to_numpy()

words_arr = split_sentences(train_text)
words_arr = remove_duplicates(words_arr)


words_arr_2 = remove_nationalities(words_arr, nationalities)
# 
s = list(words_arr_2)
s.sort()
print(len(s))


# # words_sport_unique_list = list(words_sport_unique)
# words_sport_lingo = []
# words_train_vocab = []

# # TODO : brug tensorflow Tokenezier til at omdanne ord til tokens
# # TODO : søg i alle leksikoner, søg med og uden bindestreg
# # TODO : håndter tal ikke i ordbøger eks ( x-x eller x-årig)
# # TODO : lemmatizer : udelad bøjninger af samme navneord. eks : verdensmester/verdensmesteren
# # TODO : evt. grupper ord der ofte hænger sammen med nltk BigramFinder. eks vandt over

# for w in range(len(words_arr)):
#     word = words_arr[w]
#     if any(char.isdigit() for char in word):
#         words_train_vocab.append(word)
#     if contains_non_alphanumeric(word):
#         words_train_vocab.append(word)
#     else: 
#         isin_dict = word in ordered_dict
#         if (isin_dict == False):
#             words_sport_lingo.append(word)
#         else:
#             words_train_vocab.append(word)


# print("total unique words:", len(words_arr) )
# print("total sports lingo words:", len(words_sport_lingo) )
# print("total vocab:", len(words_train_vocab))
# print("total articles:", len(df_sport) )

canadisk
islandske
islandske
tyskere
tyskere
tyskere
tyskere
tyskere
tjekker
kineserne
israeler
israeler
færing
fransk
fransk
fransk
fransk
fransk
serbisk
letteste
tyske
tyske
tyske
tyske
franske
franske
franske
franske
franske
danmarksmesteren
perus
thailand
ukrainsk
argentinske
etiopisk
portugal
qatarske
qatarske
kroatisk
tyrkiet
russere
slovak
russisk
russisk
svenskerne
svenskerne
svenskerne
argentinas
nordkorea
danmarksmesterskab
cypern
hviderussiske
islandsk
islandsk
australier
italienske
italienske
italienske
italienske
ukrainers
ukrainers
finske
makedonske
schweizer
schweizer
polen
marokko
finlands
brasilianer
kenyanerne
kenyanerne
kenyaneren
kenyaneren
argentinsk
japansk
japansk
japansk
argentina
danskers
danskers
danskers
svensker
svensker
svensker
polakken
israels
canadiske
mexicanske
kasakhstan
usain
svenske
svenske
australien
colombianske
colombianske
usada
ungarske
polsk
polsk
canada
israelsk
israelsk
belgier
inder
amerikanske
danske
danske
sverige
holland
danmark
serberen

In [168]:
contains_non_alphanumeric("fdsf,d")

True

In [21]:
# def remove_stopwords(word):
#     if word in da_stopwords:
#         return True
#     else:
#         return False

# words_train_vocab = [x for x in words_train_vocab if not remove_stopwords(x)]

# words_train_vocab


In [22]:
# from collections import Counter
# cnt = Counter()
# for word in words_train_vocab:
#    cnt[word] += 1

# s = sorted(cnt.items(), key=lambda item: item[1])
# s.reverse()

# s


In [23]:
words_sport_lingo
file = open('words_sport_lingo.txt','w')
for item in words_sport_lingo:
	file.write(item+"\n")
file.close()


file = open('words_train_vocab.txt','w')
for item in sorted(words_train_vocab):
	file.write(item+"\n")
file.close()

In [24]:
def format_2_bool(x):
    if type(x) == bool:
        return x
    assert(type(x) == str)
    x_copy = x
    x_copy = x_copy.strip()
    x_copy = x_copy.lower()
    assert(x_copy == "true" or x_copy == "false")
    if x_copy == "true":
        return True
    else:
        return False

In [25]:

df_sport_labels = df_sport['isResult'].apply(lambda x: format_2_bool(x))

results_true = df_sport_labels.loc[df_sport_labels== True]
results_false = df_sport_labels.loc[df_sport_labels == False]

assert(len(results_true) + len(results_false) == len(df_sport_labels))

print(len(results_true))
print(len(results_false))
labels = df_sport_labels.to_numpy().astype(int)

534
562


In [26]:
print(len(labels))
print(len(train_text))
print(labels.shape)
print(train_text.shape)
longest_text = len(max(train_text, key=len))
print("longest text: ", longest_text)
for t in range(len(train_text)):
    if labels[t] == 0:
        print(train_text[t])
        print(labels[t], "\n")

1096
1096
(1096,)
(1096,)
longest text:  360
FORMEL 1  . Derfor vinder Verstappen VM:Magnussen, mekanikeren og chefen om suveræn superstjerne  . Max Verstappen er i Singapore favorit til sin 11. sejr i træk, og allerede næste weekend kan han sikre sig VM-titlen 
0 

E-sport  . Tale fra tårevædet dansk stjerne går viralt . Den danske CS:GO-stjerne Casper "cadiaN" Møller hyldes af ekspert for at sætte sig selv på spil foran sine holdkammerater 
0 

Håndbold  . Træner og ekspert er enige: KIF er et mandskab i krise . Sidste sæsons slutspil virker som lang tid siden for den danske traditionsklub 
0 

BASKETBALL  . 'Borgmesteren af Randers' elsker basketlivet i Kronjylland:'Jeg spiller for byen og ikke for pengene'  . Basketikonet Chris Nielsen nyder at have kultstatus i hjembyen Randers 
0 

HERRELANDSHOLDET  . 'Han har et utroligt topniveau':Dårligdommene har endelig forladt Dolberg  . Kasper Dolberg har fundet fodboldlykken belgiske Anderlecht, hvilket har sendt ham tilbage i landsholdsl

GOLF  . Emily Pedersen vil forsvare de europæiske farver med hjerte og sjæl  . Den danske golfspiller Emily Pedersen er med på et europæiske hold, som fredag tager hul på Solheim Cup-turneringen, hvor Europa møder USA 
0 

BASKETBALL  . Dansk basketstjerne åbner pointkontoen i USA, og det kan starte 'en lavine' af opmærksomhed  . Iffe Lundberg scorede i nat sine første point i NBA. Overgangen for basketballspilleren og hans familie har være god efter en hektisk periode i Rusland 
0 

Fodbold  . Sportsdirektør stopper i Sønderjyske . Sønderjyske skal finde en ny sportsdirektør, når Esben Hansens kontrakt udløber 30. november
0 

SPORT  . Når Mark går ind i kampburet i nat, har han Maria og børnene med i tankerne  . Den tidligere, olympiske bryder og sølvvinder Mark O. Madsen tager hul på sit tredje år som fuldtids mma-kæmper ved et stort event i Las Vegas 
0 

OL PARIS 2024  . Dansk løbestjerne rejste til Tokyo uden at vide, om hun kunne løbe  . Anna Emilie Møller har kæmpet med skader 

In [28]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

# TODO : evt indikere hvilke navneord der starte med stort bogstav(egenavne), evt. lave et opslag for at undersøge ordklasse for det første ord i sætningen 
# TODO : adskil og inkluder punktuering (,.""?) 

def to_lower(word):
    return tf.strings.lower(word, encoding='utf-8')

def split_specials(input_data):
    fd = [":" , "/", ":", ",", "'", ".", "?", "-", "!", "(", ")", '"']

    new_str = input_data
    for sign in fd:
        r = "\\" + sign
        new_str = tf.strings.regex_replace(new_str, pattern=r, rewrite=" " + sign + " ")

    return new_str

def replace_digits(word):
    return tf.strings.regex_replace(word, pattern=r'\d+', rewrite=r'X')


def custom_standardization(input_data):
    lowercase = to_lower(input_data)
    replaced_digits = replace_digits(lowercase)
    return split_specials(replaced_digits)


# Model constants.
max_features = 4700
sequence_length = 100

# Now that we have our custom standardization, we can instantiate our text
# vectorization layer. We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# Note that we're using the default split function,
# and the custom standardization defined above.
# We also set an explicit maximum sequence length, since the CNNs later in our
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)


text_ds = vectorize_layer.adapt(words_train_vocab)
vect_vocab = vectorize_layer.get_vocabulary()

def vect_layer_2_text(vect_l):
    return np.array([vect_vocab[x] for x in np.trim_zeros(np.squeeze(vect_l.numpy()))])

print("Total vocab/max_features : ",  len(vect_vocab))




Total vocab/max_features :  4666


In [29]:
for t in train_text[0:10]:
    print("Original \n:", t)
    print("vect_2_text: \n", vect_layer_2_text(vectorize_layer([t])))
    print("\n")


Original 
: SPORT  . VIDEO Dansk paraatlet sætter verdensrekord i længdespring  . Daniel Wagner Jørgensen sprang 6,51 og forberede verdensrekorden med en centimeter 
vect_2_text: 
 ['sport' '.' 'video' '[UNK]' 'paraatlet' 'sætter' 'verdensrekord' 'i'
 'længdespring' '.' '[UNK]' '[UNK]' '[UNK]' 'sprang' 'X' ',' 'X' 'og'
 'forberede' 'verdensrekorden' 'med' 'en' 'centimeter']


Original 
: VOLLEYBALL  . Pokalguld til Gentofte og Holte i volleyball  . Hos herrerne trak Gentofte Volley sig sejrrigt ud af finalen, mens Holte IF tog titlen hos kvinderne 
vect_2_text: 
 ['volleyball' '.' 'pokalguld' 'til' '[UNK]' 'og' 'holte' 'i' 'volleyball'
 '.' 'hos' 'herrerne' 'trak' '[UNK]' 'volley' 'sig' 'sejrrigt' 'ud' 'af'
 'finalen' ',' 'mens' 'holte' 'if' 'tog' 'titlen' 'hos' 'kvinderne']


Original 
: VOLLEYBALL  . Brøndbys volleykvinder vinder pokalfinale mod rivaler  . For femte år i træk stod Brøndby og Holte overfor hinanden i pokalfinalen i volleyball 
vect_2_text: 
 ['volleyball' '.' '[UNK]' 

In [30]:
def split_data(data, labels, percentage):
    l = len(data)
    p = l - int((percentage/100) * l)
    return (data[0:p], data[p:], labels[0:p], labels[p:])



train_data, val_data, train_labels, val_labels = split_data(vectorize_layer(train_text), labels, 8)

print("Total data points", len(train_text))
print("Train data length", len(train_data))
print("Validation data length", len(val_data))


Total data points 1096
Train data length 1009
Validation data length 87


In [31]:
class TransformerBlock(keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions



In [32]:
from tensorflow.keras import layers
import random as python_random


def get_transformer_model():

    embed_dim = 128  # Embedding size for each token
    num_heads = 2  # Number of attention heads
    ff_dim = 128  # Hidden layer size in feed forward network inside transformer


    # A integer input for vocab indices.
    inputs = tf.keras.Input(shape=(sequence_length,), dtype="int64")

    # Next, we add a layer to map those vocab indices into a space of dimensionality
    # 'embedding_dim'.
    # x = layers.Embedding(max_features, embed_dim)(inputs)

    embedding_layer = TokenAndPositionEmbedding(sequence_length, max_features, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)

    # x = layers.Dropout(0.1)(x)

    # Conv1D + global max pooling
    # x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)
    # x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)


    x = layers.GlobalMaxPooling1D()(x)

    # We add a vanilla hidden layer:
    # x = layers.Dense(128, activation="relu")(x)
    # x = layers.Dropout(0.5)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)


    transformer_model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    transformer_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return transformer_model


In [52]:
from tensorflow.keras import layers
import random as python_random

def get_cnn_model():

    embedding_dim = 96

    # A integer input for vocab indices.
    inputs = tf.keras.Input(shape=(None,), dtype="int64")

    # Next, we add a layer to map those vocab indices into a space of dimensionality
    # 'embedding_dim'.
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.5)(x)

    # Conv1D + global max pooling
    x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)


    x = layers.GlobalMaxPooling1D()(x)

    # We add a vanilla hidden layer:
    x = layers.Dense(128, activation="relu")(x)
    # x = layers.Dropout(0.5)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)
    cnn_model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return cnn_model



In [34]:

def prepare_model(name):
    if (name == "cnn"):
       return get_cnn_model()
    elif (name == "transformer"):
       return get_transformer_model()
  

def filter_max_accuracy(history, threshold = 0.95):
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    list = []
    for x in range(len(acc)):
        if (acc[x] > threshold):
            list.append(val_acc[x])

    return np.array(list)

models = ["cnn", "transformer"]


def mean_model_accuracy(mode_names, iterations, epochs = 20):

    callback_3_loss = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=4)

    results = []

    for name in range(len(mode_names)):
        model_name = mode_names[name]
        val_accuracies = []
        
        for x in range(iterations):
            model = prepare_model(model_name)

            # Fit the model using the train and test datasets.
            history = model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

            max_val_acc = filter_max_accuracy(history)
            val_accuracies.append(max(max_val_acc))
            print(max(max_val_acc))
            print(val_accuracies)
        
        d = dict(name = model_name, results = np.mean(np.squeeze(np.array(val_accuracies))))
        results.append(d)
        
    return results


In [35]:
mean_results = mean_model_accuracy(models, 10)
mean_results

Epoch 1/20


2023-10-13 13:17:51.569392: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-10-13 13:17:51.570625: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3791995000 Hz
2023-10-13 13:17:52.050711: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2023-10-13 13:17:54.144562: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8204
2023-10-13 13:17:56.724600: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-10-13 13:17:59.187329: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2023-10-13 13:17:59.230922: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8850574493408203
[0.8850574493408203]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8850574493408203
[0.8850574493408203, 0.8850574493408203]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8620689511299133
[0.8850574493408203, 0.8850574493408203, 0.8620689511299133]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
0.8965517282485962
[0.8850574493408203, 0.8850574493408203, 0.8620689511299133, 0.8965517282485962]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8620689511299133
[0.8850574493408203, 0.8850574493408203, 0.8620689511299133, 0.8965517282485962, 0.8620689511299133]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8965517282485962
[0.8850574493408203, 0.8850574493408203, 0.8620689511299133, 0.8965517282485962, 0.8620689511299133, 0.8965517282485962]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/2

[{'name': 'cnn', 'results': 0.8793103396892548},
 {'name': 'transformer', 'results': 0.8781609177589417}]

In [36]:
def result_format_round(result):
    return round(result)

def result_format_none(result):
    return result

def print_model_score(model):
    score = model.evaluate(val_data, val_labels, verbose=0)
    print("Validation loss:", score[0])
    print("Validations accuracy:", score[1])

def print_validation_results(predictions, val_data, labels, formatter):
    print("Number of predictions", len(predictions))
    n_correct = 0
    for x in range(len(val_data)):
        print("VALIDATION SAMPLE: \n" ,vect_layer_2_text(val_data[x]))
        print("LABEL --:" , formatter(labels[x]), " ---- float: ", labels[x])
        print("PREDICTION --:" , formatter(predictions[x][0]), " ---- float: ", predictions[x][0])
        print("\n")
        if result_format_round(labels[x]) == result_format_round(predictions[x][0]):
            n_correct += 1
    
    print("Number correct: ", n_correct)

In [54]:

epochs= 7
transformer_model = get_transformer_model()

# Fit the model using the train and test datasets.
transformer_history = transformer_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [53]:

epochs= 5
cnn_model = get_cnn_model()

# # Fit the model using the train and test datasets.
transformer_history = cnn_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
def print_results(model):
    np.set_printoptions(precision = 5, suppress = True)
    predictions = model.predict(val_data)
    print_model_score(model)
    print("\n")
    print_validation_results(predictions, val_data, val_labels, result_format_round)
  

In [40]:
print("--- TRANSFORMER ---")
print_results(transformer_model)

--- TRANSFORMER ---
Validation loss: 0.4014548063278198
Validations accuracy: 0.8620689511299133


Number of predictions 87
VALIDATION SAMPLE: 
 ['[UNK]' 'X' '.' 'selvom' 'videnskaben' 'mangler' 'svar' ',' 'får'
 'mystisk' 'fænomen' 'bobslædekørere' 'til' 'at' "'" 'gå' 'med' 'livrem'
 'og' 'seler' "'" '.' 'hvis' 'slædesportsatleter' 'træner' 'for' 'meget'
 'og' 'glemmer' 'at' 'restituere' ',' 'kan' 'de' 'opleve' 'det' ',' 'der'
 'i' 'sporten' 'har' 'fået' 'navnet' '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 0  ---- float:  0.0047289818


VALIDATION SAMPLE: 
 ['basketball' '.' "'" 'det' 'er' 'et' 'åndssvagt' 'resultat' "'" ':'
 '[UNK]' 'basketlandshold' 'slår' 'stormagt' '.' '[UNK]' 'slog' 'litauen'
 'med' 'X' '-' 'X' 'i' 'kvalifikationen' 'til' 'europamesterskabet']
LABEL --: 1  ---- float:  1
PREDICTION --: 1  ---- float:  0.99352276


VALIDATION SAMPLE: 
 ['champions' 'league' '.' 'hvad' 'foregår' 'der' 'i' '[UNK]' '?' 'de'
 'kom' 'med' 'på' 'et' 'wildcard' ',' 'og' 'nu' 'top

In [41]:

print("--- CNN ---")
print_results(cnn_model)

--- CNN ---


Validation loss: 1.0002483129501343
Validations accuracy: 0.8275862336158752


Number of predictions 87
VALIDATION SAMPLE: 
 ['[UNK]' 'X' '.' 'selvom' 'videnskaben' 'mangler' 'svar' ',' 'får'
 'mystisk' 'fænomen' 'bobslædekørere' 'til' 'at' "'" 'gå' 'med' 'livrem'
 'og' 'seler' "'" '.' 'hvis' 'slædesportsatleter' 'træner' 'for' 'meget'
 'og' 'glemmer' 'at' 'restituere' ',' 'kan' 'de' 'opleve' 'det' ',' 'der'
 'i' 'sporten' 'har' 'fået' 'navnet' '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 0  ---- float:  0.077677235


VALIDATION SAMPLE: 
 ['basketball' '.' "'" 'det' 'er' 'et' 'åndssvagt' 'resultat' "'" ':'
 '[UNK]' 'basketlandshold' 'slår' 'stormagt' '.' '[UNK]' 'slog' 'litauen'
 'med' 'X' '-' 'X' 'i' 'kvalifikationen' 'til' 'europamesterskabet']
LABEL --: 1  ---- float:  1
PREDICTION --: 1  ---- float:  0.9999399


VALIDATION SAMPLE: 
 ['champions' 'league' '.' 'hvad' 'foregår' 'der' 'i' '[UNK]' '?' 'de'
 'kom' 'med' 'på' 'et' 'wildcard' ',' 'og' 'nu' 'topper' 'de' 'hele'
 'bad

In [42]:
import os

# Set up a logs directory, so Tensorboard knows where to look for files.

ll = transformer_model.layers[1]
ll_weights = ll.get_weights()[0]

print(ll_weights.shape)
ll_weights


(4700, 128)


array([[-0.03921, -0.0498 , -0.01824, ..., -0.07028,  0.01972, -0.02321],
       [ 0.0173 ,  0.02489,  0.01297, ..., -0.0639 ,  0.03681,  0.02332],
       [-0.00462,  0.00266, -0.00486, ..., -0.03234, -0.0364 , -0.05027],
       ...,
       [-0.02017,  0.02726,  0.03792, ...,  0.02532,  0.00028,  0.02812],
       [-0.02142,  0.00045,  0.03506, ..., -0.00284, -0.03621,  0.04751],
       [-0.02154,  0.02317,  0.03513, ..., -0.04795,  0.04726,  0.02887]],
      dtype=float32)

In [43]:
##import I/O module in python
import io

##open the text stream for vectors
vectors = io.open('vectors.tsv', 'w', encoding='utf-8')

##open the text stream for metadata
meta = io.open('meta.tsv', 'w', encoding='utf-8')


##write each word and its corresponding embedding
for index in range(1, len(vect_vocab)):
  word = vect_vocab[index]  # flipping the key-value in word_index
  embeddings = ll_weights[index]
  meta.write(word + "\n")
  vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")

##close the stream
vectors.close()
meta.close()

In [44]:
# from nltk import collocations
# bigram_measures = collocations.BigramAssocMeasures()
# finder = collocations.BigramCollocationFinder.from_words(["New", "York", "is", "big", "New", "York", "is", "dirty"])
# finder.ngram_fd.items()



In [45]:
# import lemmy
# # Create an instance of the standalone lemmatizer.
# lemmatizer = lemmy.load("da")

# # Find lemma for the word 'akvariernes'. First argument is an empty POS tag.
# lemmatizer.lemmatize("NOUN", "storsejr")



In [46]:
# import nltk as nltk
# # from string import punctuation
# # from nltk.corpus import stopwords
# # nltk.download('stopwords')

# # da_stopwords = stopwords.words("danish")


In [47]:
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = transformer_model(indices)

# Our end to end model
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="transformer_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [48]:


print("\nResults:")

print(end_to_end_model.predict(
    [
      "Fodbold Fjerritslev vinder over Vordingborg. Træner kommenterer på historisk kamp",
      "Skisport Buller overrasker alle og gør det umulige. Dermed endnu en medalje til Norge",
     ]))

print("\n NON-Results:") 
print(end_to_end_model.predict(
    [
      "Fodbold Træner for Fjerritslev ser frem til sejr over Vordingborg. 'Det bliver en historisk kamp'",
      "Fodbold Flere forventer at Fjerritslev vinder over Vordingborg. 'Fjerritslev vinder!', udtaler hjemmeholdets træner",
      "Skisport Sverige drømmer om flere medaljer og sejre til næste års OL. Træner forventer flere rekorder",
      "Skisport Buller vil overraske alle og gøre det umulige. Vil have medalje til Norge",
     ]))


Results:
[[0.83121]
 [0.85918]]

 NON-Results:
[[0.09982]
 [0.00839]
 [0.00456]
 [0.80245]]
