In [3314]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict

def format_2_bool(x):
    if type(x) == bool:
        return x
    assert(type(x) == str)
    x_copy = x
    x_copy = x_copy.strip()
    x_copy = x_copy.lower()
    assert(x_copy == "true" or x_copy == "false")
    if x_copy == "true":
        return True
    else:
        return False

def get_sports():
    df_sport_latest = pd.read_csv('articles/sports_articles.csv', encoding = "ISO-8859-1")
    df_sport_latest_tv2 = pd.read_csv('articles/sports_articles_tv2.csv', encoding = "ISO-8859-1")
    df_sport_2019 = pd.read_csv('articles/sports_articles_2019.csv', encoding = "ISO-8859-1")
    df_sport_2020 = pd.read_csv('articles/sports_articles_2020.csv', encoding = "ISO-8859-1")
    df_sport_2022 = pd.read_csv('articles/sports_articles_2022.csv', encoding = "ISO-8859-1")
    df_sport_politiken = pd.read_csv('articles/sports_articles_politiken.csv', encoding = "ISO-8859-1")
    df = pd.concat([df_sport_latest, df_sport_latest_tv2, df_sport_2019, df_sport_2020, df_sport_2022, df_sport_politiken])
    df = df.sample(frac=1).reset_index(drop=True)
    df['isResult'] = df['isResult'].apply(lambda x: format_2_bool(x))
    return df


In [3315]:
def vocab_2_pdset(columns, df):
    df_vocab_select_columns = df.iloc[:, columns]
    vocab_all_values = df_vocab_select_columns.values.ravel()
    return set(vocab_all_values)

def vocab_2_dict(sets):
    assert(len(sets) == 2)
    word_set = sets[0].union(sets[1])
    df = pd.DataFrame(list(word_set), columns=["Words"])
    df.sort_values(by="Words", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return OrderedDict.fromkeys(word_set)

def get_nationalities_list(csv_name):
    df_nationalities = pd.read_csv(csv_name, encoding = "ISO-8859-1", header=None)
    nationalities = df_nationalities.fillna('').iloc[:,:].values.ravel().tolist()
    return [x.lower() for x in nationalities if x!= '']


def get_vocab_dict():
    # df_ods_vocab = pd.read_table('ods_fullforms_2020-08-26.csv', header=None)
    df_ddo_vocab = pd.read_table('ddo_fullforms_2020-08-26.csv', header=None)
    # df_vocab = pd.read_table('cor1.02.tsv', header=None)
    df_sport_lingo = pd.read_table('sport_lingo.csv', header=None)

    # vocab_set = vocab_2_pdset([1,3], df_vocab)
    # ods_vocab_set = vocab_2_pdset([0,1], df_ods_vocab)
    ddo_vocab_set = vocab_2_pdset([0,1], df_ddo_vocab)
    sport_lingo_set = vocab_2_pdset([0], df_sport_lingo)

    d = vocab_2_dict([ddo_vocab_set, sport_lingo_set])
    d = {key.lower() if isinstance(key, str) else key: value for key, value in d.items()}

    return d


def extract_data(df):
    train_text = df.iloc[:, [0,1,2]].apply(' . '.join, axis=1).replace('\xa0', '', regex=True).to_numpy()
    labels = df['isResult'].to_numpy().astype(int)

    return train_text, labels

def split_data(data, percentage):
    train, labels = data
    l = len(train)
    p = l - int((percentage/100) * l)
    return (train[0:p], train[p:], labels[0:p], labels[p:])

def get_results_in_data(data, labels):
    results = []
    for n in range(len(data)):
        if labels[n] == 1:
            results.append(data[n])
    
    return results



ordered_dict = get_vocab_dict()
df_sport = get_sports()
nationalities = get_nationalities_list('nat3.csv')

countries = get_nationalities_list('countries.csv')
navne = get_nationalities_list('navne.csv')

# countries.extend(nationalities)

train_data, val_data, train_labels, val_labels = split_data(extract_data(df_sport), 7)

train_data_results = get_results_in_data(train_data, train_labels)

# print("Total data: ", len(train_text))
print("Train data length: ", len(train_data), len(train_labels))
print("Validation data length: ", len(val_data),  len(val_labels))



# TODO : create train, validation and test set here


Train data length:  1342 1342
Validation data length:  101 101


In [3316]:

def combine_articles_to_csv():
    df_sport_combined = df_sport.copy().drop('Link', axis=1)
    df_sport_combined.to_csv('articles_temp/combined.csv')

combine_articles_to_csv()


In [3317]:
duplicate_rows = df_sport.duplicated()
print("Duplicates in data points: ")
print(df_sport[duplicate_rows])

Duplicates in data points: 
Empty DataFrame
Columns: [Category, Headline, SubHeading, Link, isResult, isMaybe]
Index: []


In [3318]:
import time
isin_dict = False
def test_lookup_performance():
    word_to_check = "Dansk"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

test_lookup_performance()

# isin_dict


False


In [3319]:
import re


def split_specials(word):
    words_new = []
    parts = re.findall(r"[A-ZÆØÅa-zæøå0-9]+|\S", word)
    words_new.extend([x for x in parts])
    return words_new

def contains_non_alphanumeric(word):
    return bool(re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word))

def remove_numeric(words):
    return [x for x in words if any(char.isdigit() for char in x) == False]
    
def split_sentences(sentences):
    print(sentences)
    words_arr = []
    for ind, sentence in enumerate(sentences):
        sentence_trimmed = sentence.strip()
        words = sentence_trimmed.split()
        for word in words:
            w = split_specials(word)
            words_arr.extend([x.lower() for x in w])
    return words_arr


def remove_duplicates(words):
    return list(set(words))

def remove_nationalities(words, nationalities):
    words_minus_nationalities = []
    for w in words:
        result = any(w.startswith(item.lower()) for item in nationalities)
        if result == False:
            words_minus_nationalities.append(w)
    
    return list(set(words_minus_nationalities))

def remove_non_dict_words(words, dict):
    words_in_dict = []
    words_not_in_dict = []
    for w in words:
        isin_dict = w in dict
        if isin_dict == True:
            words_in_dict.append(w)
        else:
            words_not_in_dict.append(w)
     
    return words_in_dict, words_not_in_dict

def add_non_alpha_numeric(words, non_alpha):
    ws = words
    ws.extend(non_alpha)
    return ws


In [3320]:


# train_text = df_sport.iloc[:, [0,1,2]].apply(' . '.join, axis=1).replace('\xa0', '', regex=True).to_numpy()
# train_text_results = df_sport.loc[df_sport['isResult'] == True].iloc[:, [0,1,2]].apply(' . '.join, axis=1).replace('\xa0', '', regex=True).to_numpy()

print(len(train_data))

for d in df_sport['isResult']:
    assert(isinstance(d, bool) == True)


words_arr = split_sentences(train_data)
words_arr_unique = remove_duplicates(words_arr)
words_arr_unique = remove_nationalities(words_arr_unique, nationalities)
words_arr_unique = remove_numeric(words_arr_unique)

words_train_vocab, words_sport_lingo = remove_non_dict_words(words_arr_unique, ordered_dict)

non_alpha = [":" , "/", ":", ",", "'", ".", "?", "-", "!", "(", ")", '"']

words_train_vocab = add_non_alpha_numeric(words_train_vocab, non_alpha)


# # TODO : brug tensorflow Tokenezier til at omdanne ord til tokens
# # TODO : søg i alle leksikoner, søg med og uden bindestreg
# # TODO : håndter tal ikke i ordbøger eks ( x-x eller x-årig)
# # TODO : lemmatizer : udelad bøjninger af samme navneord. eks : verdensmester/verdensmesteren
# # TODO : evt. grupper ord der ofte hænger sammen med nltk BigramFinder. eks vandt over
# TODO : fjern evt. også alle navne (fornavne og efternavne)  

print("total unique words:", len(words_arr_unique) )
print("total sports lingo words:", len(words_sport_lingo) )
print("total vocab:", len(words_train_vocab))
print("total articles:", len(df_sport) )

# for d in df_sport['isResult']:
#     if isinstance(d, bool) != True:
#         print(d)

1342
['ENGELSK FODBOLD  . United forventer medaljer trods elendig start  . Viceformand Ed Woodward oplyser, at Manchester United budgetterer med en tredjeplads i Premier League '
 'FORMEL 1  . Det gik galt næsten lige fra start for Magnussen i Spanien . Max Verstappen vinder løbet foran holdkammeraten Sergio Perez og George Russell fra Mercedes '
 'BASKETBALL  . VIDEO Bakken Bears vinder mesterskab nummer 15 på 21 år  . Aarhusianerne var bagud med syv point inden sidste periode, men vandt guldet med en 83-76-sejr over Horsens '
 ...
 'Cykling  . Nu undskylder ryttere efter udelukkelse . Gerben Thijssen og Madis Mihkels undskylder deres handlinger, som fik dem smidt ud af kinesisk etapeløb '
 'Sport . IOC smider Rusland ud for at inddrage ukrainske forbund . Ruslands Olympiske Komité har forbrudt sig mod det olympiske charter og er foreløbig suspenderet '
 "BASKETBALL  . Nykåret NBA-mester underholder med glohed telefon og serbisk succes  . Basketballstjernen Nikola Jokic vandt NBA-mest

In [3321]:
file = open('words_sport_lingo.txt','w')
for item in words_sport_lingo:
	file.write(item+"\n")
file.close()

file = open('words_train_vocab.txt','w')
for item in sorted(words_train_vocab):
	file.write(item+"\n")
file.close()

In [3322]:
navne

['danmark',
 'league',
 'danmarks',
 'open',
 'beijing',
 'tour',
 'magnussen',
 'årige',
 'holger',
 'fc',
 'kasakhstan',
 'usa',
 'vingegaard',
 'manchester',
 'united',
 'kevin',
 'aalborg',
 'gog',
 'jonas',
 'hjulmand',
 'norge',
 'world',
 'france',
 'us',
 'nba',
 'verstappen',
 'jesper',
 'dbu',
 'brøndby',
 'visma',
 'fck',
 'eriksen',
 'uefa',
 'dr',
 'tokyo',
 'tauson',
 'mma',
 'christian',
 'københavn',
 'odense',
 'nfl',
 'pedersen',
 'axelsen',
 'madsen',
 'ob',
 'esbjerg',
 'kasper',
 'finland',
 'woods',
 'andreas',
 'ryder',
 'rasmus',
 'højlund',
 'max',
 'roglic',
 'højgaard',
 'mahfoud',
 'bears',
 'vuelta',
 'sverige',
 'jensen',
 'spanien',
 'frankrig',
 'pga',
 'nielsen',
 'nicolai',
 'sarah',
 'europa',
 'lyngby',
 'bayern',
 'viktor',
 'shiffrin',
 'simon',
 'clara',
 'tottenham',
 'premier',
 'new',
 'katrine',
 'aarhus',
 'kjær',
 'iffe',
 'marie',
 'wozniacki',
 'anne',
 'jørgensen',
 'magnus',
 'peter',
 'saudi',
 'anders',
 'belgien',
 'mørkøv',
 'china',

In [3323]:

def most_frequent_words_in_arr(arr):
    xx = np.array(arr, dtype=object)
    unique, counts = np.unique(xx, return_counts=True)
    aa = np.asarray((unique, counts)).T
    return np.flip(aa[aa[:, 1].argsort()])

frequent_words = most_frequent_words_in_arr(words_arr)

# TODO : lav en negativ liste også
# display most frequent words found in lingo words
for f in frequent_words:
    if f[1] in words_sport_lingo and f[1] not in navne and len(f[1]) > 1:
        print(f[1])


terrororganisationhamasgør
terrororganisationenhamas
zaniolo
zaretska
zhu
alms
coronarestriktioner
zonic
zohore
antvorskovhallen
fredagensvenskabskampmod
amb
amusan
anastasia
åvall
hattrickhelt
golfsæsonen
bådklasser
budocnost
elming
edin
denis
bietigheim
enrique
erlings
ernie
esben
bekker
wejse
belarusiske
enna
bellinghams
vittoria
bernie
emilia
vittinghus
emile
bremen
delaney
enimponerendesæsonstart
vk
boksekommentator
bokselegenden
dp
diggs
draften
dgi
diamond
dideriksen
dream
dk
bokseverden
dina
duncan
vismas
bills
brawn
bissouma
braut
detlegendariskeboston
detroit
vinjebo
brandon
dana
dahm
wolfpack
dak
dalby
armando
cykelstjernen
auger
british
vladyslav
vettel
dc
ddbu
banelandsholdeter
wind
brugge
ballisager
brown
federation
vestergård
bromer
brock
wiggins
verstappens
danny
peng
nørklit
nepomniachtchi
niskanen
tilcykelløbetgiro
opens
oprykkerhold
sponsorklemme
slow
succesbølge
shuai
serrano
søndagensafgørendesejlads
slopestyle
jegtror
karateatlet
kerrtu
holdforfølgelse
hverkenjoha

In [3324]:


# df_sport_labels = df_sport['isResult']

# results_true = df_sport_labels.loc[df_sport_labels== True]
# results_false = df_sport_labels.loc[df_sport_labels == False]

# assert(len(results_true) + len(results_false) == len(df_sport_labels))

# labels = df_sport_labels.to_numpy().astype(int)

# print("Labels True: " , len(results_true))
# print("Labels False: ", len(results_false))


In [3325]:
# assert (len(labels) == len(train_text))
# print("Data:")
# for t in range(len(train_text)):
#     if labels[t] == 0:
#         print(train_text[t])
#         print(labels[t], "\n")

In [3326]:
# tournaments = ["tour de france", "us open", "nfl", "dm", "vm", "em", "vuelta", "vueltaen", "vueltaen", "champions league", "ol"]

tournaments = {
  "tour de france": "tour-de-france",
  "us open": "us-open",
  "nfl" : "nfl",
  "dm" : "dm",
  "vm" : "vm",
  "em" : "em",
  "vuelta" : "vuelta",
  "vueltaen" : "vuelta",
  "ryder cup" : "ryder-cup",
  "champions league" :  "champions-league",
  "ol": "ol",
  "nba" :"nba",
  "Vuelta a España" : "vuelta",
  "superliga" : "superliga"
}


for t in tournaments:
    print(tournaments[t])

tour-de-france
us-open
nfl
dm
vm
em
vuelta
vuelta
ryder-cup
champions-league
ol
nba
vuelta
superliga


In [3327]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

# TODO : evt indikere hvilke navneord der starte med stort bogstav(egenavne), evt. lave et opslag for at undersøge ordklasse for det første ord i sætningen 

def to_lower(word):
    return tf.strings.lower(word, encoding='utf-8')

def split_specials(input_data):

    new_str = input_data
    for sign in non_alpha:
        r = "\\" + sign
        new_str = tf.strings.regex_replace(new_str, pattern=r, rewrite=" " + sign + " ")

    return new_str

def replace_digits(word):
    return tf.strings.regex_replace(word, pattern=r'\d+', rewrite=r'xx')

def replace_nationality(word):

    new_str = word
    for sign in countries:
        r = "\\b" + sign + "s?\\b"
        new_str = tf.strings.regex_replace(new_str, pattern=r, rewrite="x_land")

    return new_str

def replace_tournament(word):

    new_str = word
    for sign in tournaments:
        r = "\\b" + sign

        new_str = tf.strings.regex_replace(new_str, pattern=r, rewrite="x_tournament")

    return new_str

def custom_standardization(input_data):
    l = to_lower(input_data)
    s = split_specials(l)
    t = replace_tournament(s)
    n = replace_nationality(t)
    return replace_digits(n)

def vect_layer_2_text(vect_l):
    return np.array([vect_vocab[x] for x in np.trim_zeros(np.squeeze(vect_l.numpy()))])

# Model constants.
max_features = 5300
sequence_length = 100

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
def prepare_vocab(words):
    words_copy = words.copy()
    # add the word 'xx' to the allowed vocabulary representing all numbers
    words_copy.extend([
        "xx", 
        "x_land", "x-tournament"]
        )
    return words_copy


text_ds = vectorize_layer.adapt(prepare_vocab(words_train_vocab))
vect_vocab = vectorize_layer.get_vocabulary()

print("Total vocab/max_features : ",  len(vect_vocab))


Total vocab/max_features :  5214


In [3328]:
for t in train_data[0:50]:
    print("Original \n:", t)
    print("vect_2_text: \n", vect_layer_2_text(vectorize_layer([t])))
    print("\n")


Original 
: ENGELSK FODBOLD  . United forventer medaljer trods elendig start  . Viceformand Ed Woodward oplyser, at Manchester United budgetterer med en tredjeplads i Premier League 
vect_2_text: 
 ['engelsk' 'fodbold' '.' '[UNK]' 'forventer' 'medaljer' 'trods' 'elendig'
 'start' '.' 'viceformand' 'ed' '[UNK]' 'oplyser' ',' 'at' '[UNK]' '[UNK]'
 'budgetterer' 'med' 'en' 'tredjeplads' 'i' '[UNK]' '[UNK]']


Original 
: FORMEL 1  . Det gik galt næsten lige fra start for Magnussen i Spanien . Max Verstappen vinder løbet foran holdkammeraten Sergio Perez og George Russell fra Mercedes 
vect_2_text: 
 ['formel' 'xx' '.' 'det' 'gik' 'galt' 'næsten' 'lige' 'fra' 'start' 'for'
 '[UNK]' 'i' 'x_land' '.' '[UNK]' '[UNK]' 'vinder' 'løbet' 'foran'
 'holdkammeraten' '[UNK]' '[UNK]' 'og' '[UNK]' '[UNK]' 'fra' '[UNK]']


Original 
: BASKETBALL  . VIDEO Bakken Bears vinder mesterskab nummer 15 på 21 år  . Aarhusianerne var bagud med syv point inden sidste periode, men vandt guldet med en 83-76-sejr ove

In [3329]:
# def split_data(data, labels, percentage):
#     l = len(data)
#     p = l - int((percentage/100) * l)
#     return (data[0:p], data[p:], labels[0:p], labels[p:])


# train_data, val_data, train_labels, val_labels = split_data(vectorize_layer(train_text), labels, 7)

# print("Total data: ", len(train_text))
# print("Train data length: ", len(train_data))
# print("Validation data length: ", len(val_data))


In [3330]:
train_data = vectorize_layer(train_data)
val_data = vectorize_layer(val_data)



In [3331]:
class TransformerBlock(keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions



In [3332]:
from tensorflow.keras import layers
import random as python_random


def get_transformer_model():

    embed_dim =  256 # Embedding size for each token
    num_heads = 2  # Number of attention heads
    ff_dim = 256  # Hidden layer size in feed forward network inside transformer


    # A integer input for vocab indices.
    inputs = tf.keras.Input(shape=(sequence_length,), dtype="int64")

    # Next, we add a layer to map those vocab indices into a space of dimensionality
    # 'embedding_dim'.
    # x = layers.Embedding(max_features, embed_dim)(inputs)

    embedding_layer = TokenAndPositionEmbedding(sequence_length, max_features, embed_dim)
    x = embedding_layer(inputs)
    # x = layers.Dropout(0.2)(x)

    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)


    # Conv1D + global max pooling
    # x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)
    # x = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(x)


    x = layers.GlobalMaxPooling1D()(x)

    # We add a vanilla hidden layer:
    # x = layers.Dense(32, activation="relu")(x)
    # x = layers.Dropout(0.5)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)


    transformer_model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    transformer_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return transformer_model


In [3333]:
from tensorflow.keras import layers
import random as python_random

def get_cnn_model():

    embedding_dim = 64

    # A integer input for vocab indices.
    inputs = tf.keras.Input(shape=(None,), dtype="int64")

    # Next, we add a layer to map those vocab indices into a space of dimensionality
    # 'embedding_dim'.
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.2)(x)

    # Conv1D + global max pooling
    x = layers.Conv1D(256, 6, padding="valid", activation="relu", strides=2)(x)
    # x = layers.Conv1D(256, 6, padding="valid", activation="relu", strides=2)(x)
    # x = layers.Conv1D(256, 6, padding="valid", activation="relu", strides=2)(x)

    x = layers.Conv1D(96, 4, padding="valid", activation="relu", strides=2)(x)



    x = layers.GlobalMaxPooling1D()(x)

    # We add a vanilla hidden layer:
    # x = layers.Dense(128, activation="relu")(x)
    # x = layers.Dropout(0.5)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)
    cnn_model = tf.keras.Model(inputs, predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return cnn_model



In [3334]:

def prepare_model(name):
    if (name == "cnn"):
       return get_cnn_model()
    elif (name == "transformer"):
       return get_transformer_model()
  

def filter_max_accuracy(history, threshold = 0.95):
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    list = []
    for x in range(len(acc)):
        if (acc[x] > threshold):
            list.append(val_acc[x])

    return np.array(list)

models = ["cnn", "transformer"]

callback_3_loss = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=4)


def mean_model_accuracy(mode_names, iterations, epochs = 20):

  
    results = []

    for name in range(len(mode_names)):
        model_name = mode_names[name]
        val_accuracies = []
        
        for x in range(iterations):
            model = prepare_model(model_name)

            # Fit the model using the train and test datasets.
            history = model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

            max_val_acc = filter_max_accuracy(history)
            val_accuracies.append(max(max_val_acc))
            print(max(max_val_acc))
            print(val_accuracies)
        
        d = dict(name = model_name, results = np.mean(np.squeeze(np.array(val_accuracies))))
        results.append(d)
        
    return results


In [3335]:
# mean_results = mean_model_accuracy(models, 8)
# mean_results

In [3336]:
def result_format_round(result):
    return round(result)

def result_format_none(result):
    return result

def print_model_score(model):
    score = model.evaluate(val_data, val_labels, verbose=0)
    print("Validation loss:", score[0])
    print("Validations accuracy:", score[1])

def print_validation_results(predictions, val_data, labels, formatter):
    print("Number of predictions", len(predictions))
    n_correct = 0
    for x in range(len(val_data)):
        correct_prediction = result_format_round(labels[x]) == result_format_round(predictions[x][0])

        print("VALIDATION SAMPLE: \n" ,vect_layer_2_text(val_data[x]))
        print("LABEL --:" , formatter(labels[x]), " ---- float: ", labels[x])
        print("PREDICTION --:" , formatter(predictions[x][0]), " ---- float: ", predictions[x][0])
        print("CORRECT PREDICTION: ", correct_prediction)
        print("\n")

        if correct_prediction:
            n_correct += 1
    
    print("Number correct: ", n_correct)

In [3337]:

epochs= 15
transformer_model = get_transformer_model()

# Fit the model using the train and test datasets.
transformer_history = transformer_model.fit(train_data, train_labels, epochs=epochs, batch_size=12, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


In [3338]:

epochs= 12
cnn_model = get_cnn_model()


# # Fit the model using the train and test datasets.
transformer_history = cnn_model.fit(train_data, train_labels, epochs=epochs, batch_size=12, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12


In [3339]:
def print_results(model):
    np.set_printoptions(precision = 5, suppress = True)
    predictions = model.predict(val_data)
    print_model_score(model)
    print("\n")
    print_validation_results(predictions, val_data, val_labels, result_format_round)
  

In [3340]:
print("--- TRANSFORMER ---")
print_results(transformer_model)

--- TRANSFORMER ---


Validation loss: 0.44761815667152405
Validations accuracy: 0.8316831588745117


Number of predictions 101
VALIDATION SAMPLE: 
 ['golf' '.' '[UNK]' 'turnering' '[UNK]' 'i' '[UNK]' '[UNK]' '.' '[UNK]'
 'nummer' 'to' ',' '[UNK]' '[UNK]' ',' 'blev' 'sammen' 'med' 'fem' 'andre'
 'spillere' 'tirsdag' 'præsenteret' 'på' 'liv' '-' '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 1  ---- float:  0.8955136
CORRECT PREDICTION:  False


VALIDATION SAMPLE: 
 ['vintersport' '.' 'efter' '[UNK]' ',' '[UNK]' 'og' 'seks' 'måneders'
 '[UNK]' 'er' 'x_tournament' '-' 'håb' 'atter' 'tilbage' 'i' '[UNK]' '.'
 '[UNK]' '[UNK]' 'due' '[UNK]' 'er' 'tilbage' 'på' '[UNK]' ',' 'efter'
 'han' 'for' 'næsten' 'seks' 'måneder' 'siden' 'fik' 'et' 'voldsomt'
 '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 1  ---- float:  0.58647364
CORRECT PREDICTION:  False


VALIDATION SAMPLE: 
 ['x_tournament' 'håndbold' '.' '[UNK]' 'toft' 'stod' 'med' '[UNK]' '[UNK]'
 ':' "'" 'hun' 'er' 'verdens' 'bedste' "'" '.' '[UNK]' '[UN

In [3341]:

print("--- CNN ---")
print_results(cnn_model)

--- CNN ---
Validation loss: 0.5450266599655151
Validations accuracy: 0.8514851331710815


Number of predictions 101
VALIDATION SAMPLE: 
 ['golf' '.' '[UNK]' 'turnering' '[UNK]' 'i' '[UNK]' '[UNK]' '.' '[UNK]'
 'nummer' 'to' ',' '[UNK]' '[UNK]' ',' 'blev' 'sammen' 'med' 'fem' 'andre'
 'spillere' 'tirsdag' 'præsenteret' 'på' 'liv' '-' '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 1  ---- float:  0.7763517
CORRECT PREDICTION:  False


VALIDATION SAMPLE: 
 ['vintersport' '.' 'efter' '[UNK]' ',' '[UNK]' 'og' 'seks' 'måneders'
 '[UNK]' 'er' 'x_tournament' '-' 'håb' 'atter' 'tilbage' 'i' '[UNK]' '.'
 '[UNK]' '[UNK]' 'due' '[UNK]' 'er' 'tilbage' 'på' '[UNK]' ',' 'efter'
 'han' 'for' 'næsten' 'seks' 'måneder' 'siden' 'fik' 'et' 'voldsomt'
 '[UNK]']
LABEL --: 0  ---- float:  0
PREDICTION --: 0  ---- float:  0.08791258
CORRECT PREDICTION:  True


VALIDATION SAMPLE: 
 ['x_tournament' 'håndbold' '.' '[UNK]' 'toft' 'stod' 'med' '[UNK]' '[UNK]'
 ':' "'" 'hun' 'er' 'verdens' 'bedste' "'" '.' '[

In [3342]:
import os

# Set up a logs directory, so Tensorboard knows where to look for files.

ll = transformer_model.layers[1]
ll_weights = ll.get_weights()[0]

print(ll_weights.shape)
ll_weights


(5300, 256)


array([[ 0.00117, -0.02511, -0.01521, ...,  0.03392,  0.00995,  0.01038],
       [-0.03994, -0.05774,  0.02139, ...,  0.05354, -0.00413,  0.03276],
       [ 0.03067, -0.02673, -0.01757, ...,  0.06012,  0.04105,  0.00839],
       ...,
       [ 0.02282, -0.02518, -0.00774, ..., -0.00166,  0.04671,  0.00121],
       [-0.03333, -0.02169,  0.00577, ..., -0.02115, -0.01015,  0.0078 ],
       [ 0.04483, -0.00748, -0.02482, ...,  0.04162, -0.03166, -0.01175]],
      dtype=float32)

In [3343]:
##import I/O module in python
import io

##open the text stream for vectors
vectors = io.open('vectors.tsv', 'w', encoding='utf-8')

##open the text stream for metadata
meta = io.open('meta.tsv', 'w', encoding='utf-8')


##write each word and its corresponding embedding
for index in range(1, len(vect_vocab)):
  word = vect_vocab[index]  # flipping the key-value in word_index
  embeddings = ll_weights[index]
  meta.write(word + "\n")
  vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")

##close the stream
vectors.close()
meta.close()

In [3344]:
# from nltk import collocations
# bigram_measures = collocations.BigramAssocMeasures()
# finder = collocations.BigramCollocationFinder.from_words(["New", "York", "is", "big", "New", "York", "is", "dirty"])
# finder.ngram_fd.items()



In [3345]:
# import lemmy
# # Create an instance of the standalone lemmatizer.
# lemmatizer = lemmy.load("da")

# # Find lemma for the word 'akvariernes'. First argument is an empty POS tag.
# lemmatizer.lemmatize("NOUN", "storsejr")



In [3346]:
# import nltk as nltk
# # from string import punctuation
# # from nltk.corpus import stopwords
# # nltk.download('stopwords')

# # da_stopwords = stopwords.words("danish")


In [3347]:
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = transformer_model(indices)

# Our end to end model
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="transformer_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [3348]:


results = [
      "FODBOLD . Frankrig vil vinde over Danmark. Det bliver en målfest"
]

print("\nResults:")
print(end_to_end_model.predict(
    [
      results[0]
     ]))

print("\n NON-Results:") 
print(end_to_end_model.predict(
    [
      "FODBOLD Karakterbogen: Skovs to scoringer skal honoreres, men Eriksens afleveringer var kampens højdepunkter. Karaktererne i EM-kvalifikationskampen mod Kasakhstan svinger fra to gange 02 til to gange 10"
     ]))


Results:
[[0.04792]]

 NON-Results:
[[0.76143]]


In [3349]:
vect_layer_2_text(vectorize_layer([results[0]]))

array(['fodbold', '.', 'x_land', 'vil', 'vinde', 'over', 'x_land', '.',
       'det', 'bliver', 'en', 'målfest'], dtype='<U7')