In [133]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict
def get_sports(random_state=None):
    """Load every sports-article CSV, stack them and return the rows shuffled.

    Parameters
    ----------
    random_state : int, numpy random generator or None, optional
        Forwarded to ``DataFrame.sample``. The default (``None``) keeps the
        previous behaviour of a different row order on every call; pass an int
        to get the same order on every run.

    Returns
    -------
    pandas.DataFrame
        All rows of the five files in shuffled order with a fresh 0..n-1 index.
    """
    csv_paths = [
        'articles/sports_articles.csv',
        'articles/sports_articles_tv2.csv',
        'articles/sports_articles_2019.csv',
        'articles/sports_articles_2020.csv',
        'articles/sports_articles_2022.csv',
    ]
    frames = [pd.read_csv(path, encoding="ISO-8859-1") for path in csv_paths]
    df = pd.concat(frames)
    # frac=1 draws every row exactly once, i.e. a full shuffle.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [134]:
def vocab_2_pdset(columns, df):
    """Return the distinct cell values found in the given column positions of ``df``.

    ``columns`` is a list of integer positions (used with ``iloc``); the cells
    of those columns are flattened into one array and deduplicated with ``set``.
    """
    flattened_cells = df.iloc[:, columns].to_numpy().ravel()
    return set(flattened_cells)

def vocab_2_dict(sets):
    """Merge several word sets into one ``OrderedDict`` used for membership tests.

    Parameters
    ----------
    sets : iterable of set
        Word sets to merge. Any number of sets is accepted (the notebook
        passes four).

    Returns
    -------
    collections.OrderedDict
        One key per distinct word, every value ``None``. Key order follows the
        iteration order of the merged set, i.e. it is not sorted.
    """
    # The previous version also built and sorted a DataFrame of the words and
    # then discarded it. That work never influenced the result and raised
    # TypeError when the sets mixed incomparable value types, so it is gone.
    word_set = set().union(*sets)
    return OrderedDict.fromkeys(word_set)

def get_vocab_dict():
    """Build the lookup dictionary from the four tab-separated word lists.

    Each file is read without a header row; the listed column positions of
    every file are turned into a set of values and the four sets are merged by
    ``vocab_2_dict``.
    """
    # Read all four sources first (same order as before), then extract the sets.
    ods_table = pd.read_table('ods_fullforms_2020-08-26.csv', header=None)
    ddo_table = pd.read_table('ddo_fullforms_2020-08-26.csv', header=None)
    cor_table = pd.read_table('cor1.02.tsv', header=None)
    lingo_table = pd.read_table('sport_lingo.csv', header=None)

    # (table, column positions to collect values from)
    sources = [
        (cor_table, [1, 3]),
        (ods_table, [0, 1]),
        (ddo_table, [0, 1]),
        (lingo_table, [0]),
    ]
    return vocab_2_dict([vocab_2_pdset(cols, table) for table, cols in sources])

# Build the vocabulary lookup once; later cells test membership with `word in ordered_dict`.
ordered_dict = get_vocab_dict()


In [135]:
# Load all article files as one shuffled frame.
df_sport = get_sports()

# Write the combined frame to disk (with to_csv defaults the index becomes the first column).
# NOTE(review): the 'articles_temp' directory presumably has to exist already — confirm.
df_sport.to_csv('articles_temp/combined.csv')

In [136]:
import time
# isin_dict = False
def test_lookup_performance():
    word_to_check = "linebreak"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

# Run the lookup benchmark; it prints whether the probe word is in the vocabulary.
test_lookup_performance()

# isin_dict


False


In [137]:
# 




In [138]:
import re


# Select the first three columns by position.
# NOTE(review): presumably these are the text columns of each article — confirm against the CSV schema.
df_sport_text = df_sport.iloc[:, [0,1,2]]

# Join the three cells of every row with single spaces into one string per article.
train_text = df_sport_text.apply(' '.join, axis=1).to_numpy()

# Flat list of every cleaned, lower-cased token collected by the loop further down.
words_arr = []
# Largest number of tokens contributed by a single article.
max_words_insentence = 0

def replace_digits(word):
    """Collapse every run of consecutive digits in ``word`` into a single 'X'."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('X', word)

def split_word(word):
    """Split ``word`` into its parts, treating each hyphen as whitespace."""
    separators = ["-"]
    as_space = {ord(sep): ' ' for sep in separators}
    return word.translate(as_space).split()

def remove_specials(word):
    """Return ``word`` with the listed punctuation characters deleted."""
    unwanted = [':', "'", '?', ",", ".", "(", ")", '"']
    deletions = {ord(ch): None for ch in unwanted}
    return word.translate(deletions)

def contains_non_alphanumeric(word):
    """Tell whether ``word`` has any character outside a-z, æ, ø, å (either case) and 0-9."""
    offending = re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word)
    return offending is not None

def formatWord(word):
    """Return ``word`` with every run of digits replaced by a single 'X'.

    Words without digits are returned unchanged. (Previously they fell off the
    end of the function and produced ``None``.)
    """
    # Substitute directly instead of calling the global ``replace_digits``: a
    # later cell rebinds that name to a TensorFlow op, so going through it
    # would make the result depend on which cells have been run.
    return re.sub(r'\d+', 'X', word)
    
    
# Tokenise every article into `words_arr` and track the largest per-article token count.
for ind, sentence in enumerate(train_text):
    sentence_trimmed = sentence.strip()
    words = sentence_trimmed.split()
    # Remember the list length so the number of tokens added by this article can be derived.
    current_count = len(words_arr)
    for word in words:
        w = word
        if contains_non_alphanumeric(w):
            # Strip punctuation first, then break hyphenated words into their parts.
            w = remove_specials(w)
            w = split_word(w)
            words_arr.extend([x.lower() for x in w])
        else:
            words_arr.append(w.lower())

    # Tokens contributed by this article (can exceed len(words) when hyphenated words are split).
    after_count = len(words_arr) - current_count
    if after_count > max_words_insentence:
        max_words_insentence = after_count




# Deduplicate the tokens; the list copy allows access by position in the loop below.
words_sport_unique = set(words_arr)
words_sport_unique_list = list(words_sport_unique)
# Tokens not found in the dictionary lookup.
words_sport_lingo = []
# Tokens kept as vocabulary: dictionary words plus every token containing a digit.
words_train_vocab = []

# TODO : use the TensorFlow Tokenizer to turn words into tokens
# TODO : search all lexicons, search with and without hyphen
# TODO : handle numbers that are not in the dictionaries, e.g. ( x-x or x-årig)
# TODO : lemmatizer : leave out inflections of the same noun, e.g. verdensmester/verdensmesteren
# TODO : possibly group words that often occur together with nltk BigramFinder, e.g. "vandt over"

for w in range(len(words_sport_unique_list)):
    word = words_sport_unique_list[w]
    # Tokens with digits bypass the dictionary check and always go to the vocabulary.
    if any(char.isdigit() for char in word):
        words_train_vocab.append(word)
    else: 
        isin_dict = word in ordered_dict
        if (isin_dict == False):
            words_sport_lingo.append(word)
        else:
            words_train_vocab.append(word)
# Summary statistics of the tokenisation.
print("Max words in sentence: ", max_words_insentence)
print("total unique words:", len(words_sport_unique) )
print("total sports lingo words:", len(words_sport_lingo) )
print("total vocab:", len(words_train_vocab))
print("total articles:", len(df_sport) )


Max words in sentence:  53
total unique words: 3851
total sports lingo words: 823
total vocab: 3028
total articles: 556


In [139]:
# def remove_stopwords(word):
#     if word in da_stopwords:
#         return True
#     else:
#         return False

# words_train_vocab = [x for x in words_train_vocab if not remove_stopwords(x)]

# words_train_vocab


In [140]:
# from collections import Counter
# cnt = Counter()
# for word in words_train_vocab:
#    cnt[word] += 1

# s = sorted(cnt.items(), key=lambda item: item[1])
# s.reverse()

# s


In [141]:
# Persist both word lists, one word per line. The context managers close the
# files even if a write fails, and the explicit UTF-8 encoding keeps the
# Danish letters intact regardless of the platform's default encoding.
with open('words_sport_lingo.txt', 'w', encoding='utf-8') as file:
    for item in words_sport_lingo:
        file.write(item + "\n")

# The accepted vocabulary is written in sorted order.
with open('words_train_vocab.txt', 'w', encoding='utf-8') as file:
    for item in sorted(words_train_vocab):
        file.write(item + "\n")

In [142]:
def format_2_bool(x):
    """Normalise a label cell to a Python ``bool``.

    A ``bool`` is returned as is. A ``str`` is stripped and lower-cased and
    must then read "true" or "false". Anything else trips an assertion.
    """
    if type(x) is bool:
        return x
    assert(type(x) is str)
    normalized = x.strip().lower()
    assert(normalized in ("true", "false"))
    return normalized == "true"

In [143]:

# Normalise the 'isResult' column to Python booleans.
df_sport_labels = df_sport['isResult'].apply(lambda x: format_2_bool(x))

# Split the labels by class to inspect the class balance.
results_true = df_sport_labels.loc[df_sport_labels== True]
results_false = df_sport_labels.loc[df_sport_labels == False]

# Every label must fall into exactly one of the two classes.
assert(len(results_true) + len(results_false) == len(df_sport_labels))

print(len(results_true))
print(len(results_false))
# Integer targets (True -> 1, False -> 0) for the Keras models.
labels = df_sport_labels.to_numpy().astype(int)




255
301


In [144]:
# df_sport_text = df_sport.iloc[:, [0,1,2]]


# df_sport_text
# df_sport_text_combined = df_sport_text.apply(' '.join, axis=1)
# train_text = df_sport_text.apply(' linebreak '.join, axis=1).to_numpy()



# Sanity check: labels and texts should have the same length and shape.
print(len(labels))
print(len(train_text))
print(labels.shape)
print(train_text.shape)
# Length in characters (not tokens) of the longest article string.
longest_text = len(max(train_text, key=len))
print("longest text: ", longest_text)
# Show the first article as the cell output.
train_text[0]

556
556
(556,)
(556,)
longest text:  312


'BASKETBALL  Hallen gyngede, og bjørnen dansede, da Bakken Bears spillede sig tættere på europæisk finale  Bakken Bears vandt den første af to semifinaler i FIBA Europe Cup med 74-72 over Unahotels Reggio Emilia fra Italien '

In [145]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

def split_dash(input_data):
    """Replace every '-' in the input string tensor with a space."""
    return tf.strings.regex_replace(input_data, pattern='-', rewrite=' ')

def replace_digits(word):
    """TensorFlow variant: replace each run of digits in the string tensor with 'X'.

    Note that this definition rebinds the name of the earlier pure-Python helper.
    """
    digits_pattern = r'\d+'
    return tf.strings.regex_replace(word, digits_pattern, r'X')

def remove_specials(word):
    """TensorFlow variant: delete punctuation characters from the string tensor.

    The character class also covers '?', '(' and ')' so that it strips the same
    characters as the pure-Python ``remove_specials`` that cleaned the words the
    vocabulary is adapted from. Without them a token such as 'kamp?' keeps its
    punctuation and cannot match a cleaned vocabulary entry.

    Note that this definition rebinds the name of the earlier pure-Python helper.
    """
    return tf.strings.regex_replace(word, pattern=r'[:,\'\."?()]', rewrite=r'')

def custom_standardization(input_data):
    """Standardise raw text for the TextVectorization layer.

    Steps, in order: lower-case, collapse digit runs to 'X', delete
    punctuation, turn '-' into a space.
    """
    # encoding='utf-8' is required for non-ASCII letters: in its default
    # (ASCII) mode tf.strings.lower leaves 'Æ', 'Ø' and 'Å' upper-case, unlike
    # the str.lower() used when the vocabulary words were collected.
    lowercase = tf.strings.lower(input_data, encoding='utf-8')
    replaced_digits = replace_digits(lowercase)
    removed_specials = remove_specials(replaced_digits)
    split_dashed = split_dash(removed_specials)
    return split_dashed


# Model constants.
# max_features caps the vocabulary size; sequence_length is the fixed number of token ids per sample.
max_features = 3000
sequence_length = 100

# Now that we have our custom standardization, we can instantiate our text
# vectorization layer. We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# Note that we're using the default split function,
# and the custom standardization defined above.
# We also set an explicit maximum sequence length, since the CNNs later in our
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the dictionary-approved word list (not from the raw articles).
# NOTE(review): adapt() is called for its effect on the layer; `text_ds` is presumably None — confirm and drop the name.
text_ds = vectorize_layer.adapt(words_train_vocab)

# Index -> token list, used to map ids back to words.
vect_vocab = vectorize_layer.get_vocabulary()

# Vectorise the first article as a smoke test (a batch containing one string).
text_vec = vectorize_layer([train_text[0]])

def vect_layer_2_text(vect_l):
    """Map a vectorised sample back to its tokens, dropping leading and trailing zero ids."""
    token_ids = np.squeeze(vect_l.numpy())
    unpadded_ids = np.trim_zeros(token_ids)
    return np.array([vect_vocab[token_id] for token_id in unpadded_ids])


# text = vect_layer_2_text(text_vec)

# Report the learned vocabulary size and show the first article as ids and as decoded tokens.
print("Total vocab/max_features : ",  len(vect_vocab))
print(vectorize_layer([train_text[0]]))
print("vect_2_text: ", vect_layer_2_text(vectorize_layer([train_text[0]])))




Total vocab/max_features :  2945
tf.Tensor(
[[2792 2005 2021 1184 2709 2562 2570 2811    1  630  786  276  988 2362
  2319 2811    1  186 2530 2121 2931  367  820 1811    1    1 2590 1384
     2    2 1129    1    1    1 2188    1    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]], shape=(1, 100), dtype=int64)
vect_2_text:  ['basketball' 'hallen' 'gyngede' 'og' 'bjørnen' 'dansede' 'da' 'bakken'
 '[UNK]' 'spillede' 'sig' 'tættere' 'på' 'europæisk' 'finale' 'bakken'
 '[UNK]' 'vandt' 'den' 'første' 'af' 'to' 'semifinaler' 'i' '[UNK]'
 '[UNK]' 'cup' 'med' 'X' 'X' 'over' '[UNK]' '[UNK]' '[UNK]' 'fra' '[UNK]']


In [146]:
def split_data(data, labels, percentage):
    """Cut ``data`` and ``labels`` into a leading training part and a trailing hold-out part.

    ``percentage`` is the share (0-100) of samples that goes to the hold-out
    part; the hold-out size is truncated to a whole number of samples.
    Returns ``(train_data, holdout_data, train_labels, holdout_labels)``.
    No shuffling is done here.
    """
    total = len(data)
    holdout_size = int((percentage / 100) * total)
    boundary = total - holdout_size
    train_part, holdout_part = data[:boundary], data[boundary:]
    return (train_part, holdout_part, labels[:boundary], labels[boundary:])



# Vectorise all articles and hold out the last 10 % as validation data.
train_data, val_data, train_labels, val_labels = split_data(vectorize_layer(train_text), labels, 10)

print("Total data points", len(train_text))
print("Train data length", len(train_data))
print("Valkidation data length", len(val_data))


Total data points 556
Train data length 501
Valkidation data length 55


In [147]:
class TransformerBlock(keras.layers.Layer):
    """Transformer encoder block: self-attention and a feed-forward net, each
    followed by dropout, a residual connection and layer normalisation.

    Parameters
    ----------
    embed_dim : int
        Used as the per-head key size of the attention layer and as the output
        width of the feed-forward net.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Width of the hidden (ReLU) layer of the feed-forward net.
    rate : float, optional
        Dropout rate applied after the attention and after the feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to ``None`` so the method can also be invoked
        without the flag; the value is forwarded to both dropout layers.
        """
        # Self-attention: the inputs act as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection around the attention, then normalise.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection around the feed-forward net, then normalise.
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # One table indexed by token id, one indexed by position in the sequence.
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Length of the last axis of the input = number of positions.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors



In [148]:
from tensorflow.keras import layers
import random as python_random



def get_transformer_model():
    """Build and compile the transformer-based binary classifier.

    Pipeline: token ids -> embedding -> one ``TransformerBlock`` -> global max
    pooling -> Dense(128, relu) -> Dense(1, sigmoid). The vocabulary size comes
    from the global ``max_features``.

    NOTE(review): only a plain token embedding is used here, not
    ``TokenAndPositionEmbedding`` — confirm that leaving out position
    information is intended.
    """
    embed_dim = 96  # width of each token vector
    num_heads = 2  # attention heads inside the transformer block
    ff_dim = 32  # hidden width of the block's feed-forward net

    # Variable-length sequences of integer vocabulary indices.
    token_ids = tf.keras.Input(shape=(None,), dtype="int64")

    embedded = layers.Embedding(max_features, embed_dim)(token_ids)
    encoded = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)

    # Reduce the block output with global max pooling, then one hidden layer.
    pooled = layers.GlobalMaxPooling1D()(encoded)
    hidden = layers.Dense(128, activation="relu")(pooled)

    # Single sigmoid unit for the binary target.
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(hidden)

    transformer_model = tf.keras.Model(token_ids, predictions)
    transformer_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return transformer_model


In [149]:
from tensorflow.keras import layers
import random as python_random

def get_cnn_model():
    """Build and compile the 1-D convolutional binary classifier.

    Pipeline: token ids -> embedding -> dropout(0.5) -> two strided Conv1D
    layers -> global max pooling -> Dense(128, relu) -> Dense(1, sigmoid).
    The vocabulary size comes from the global ``max_features``.
    """
    embedding_dim = 96  # width of each token vector

    # Variable-length sequences of integer vocabulary indices.
    token_ids = tf.keras.Input(shape=(None,), dtype="int64")

    features = layers.Embedding(max_features, embedding_dim)(token_ids)
    features = layers.Dropout(0.5)(features)

    # Two identical convolutions: 128 filters, window 10, stride 3, no padding.
    for _ in range(2):
        features = layers.Conv1D(128, 10, padding="valid", activation="relu", strides=3)(features)

    pooled = layers.GlobalMaxPooling1D()(features)
    hidden = layers.Dense(128, activation="relu")(pooled)

    # Single sigmoid unit for the binary target.
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(hidden)

    cnn_model = tf.keras.Model(token_ids, predictions)
    cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return cnn_model



In [150]:

def prepare_model(name):
    """Return a freshly built, compiled model for the given architecture name.

    Parameters
    ----------
    name : str
        ``"cnn"`` or ``"transformer"``.

    Raises
    ------
    ValueError
        If ``name`` is anything else. (Previously the function silently
        returned ``None`` in that case.)
    """
    if name == "cnn":
        return get_cnn_model()
    if name == "transformer":
        return get_transformer_model()
    raise ValueError("Unknown model name: %r (expected 'cnn' or 'transformer')" % (name,))
  

def filter_max_accuracy(history, threshold = 0.99):
    """Return the validation accuracies of the epochs whose training accuracy exceeds ``threshold``.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must provide the per-epoch lists ``history.history["accuracy"]`` and
        ``history.history["val_accuracy"]``.
    threshold : float, optional
        Strict lower bound on the training accuracy of the epochs to keep.

    Returns
    -------
    numpy.ndarray
        Selected validation accuracies in epoch order; empty if no epoch qualifies.
    """
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]
    # A comprehension avoids the local that used to shadow the builtin ``list``.
    return np.array([val for train, val in zip(acc, val_acc) if train > threshold])

# Architecture names understood by prepare_model.
models = ["cnn", "transformer"]


def mean_model_accuracy(mode_names, iterations, epochs = 20):
    """Train each named architecture several times and average its best validation accuracy.

    For every run the best validation accuracy among the epochs selected by
    ``filter_max_accuracy`` is recorded. Runs in which no epoch is selected are
    skipped with a printed notice instead of crashing on ``max()`` of an empty
    array.

    Parameters
    ----------
    mode_names : sequence of str
        Architecture names accepted by ``prepare_model``.
    iterations : int
        Number of independent training runs per architecture.
    epochs : int, optional
        Maximum number of epochs per run (early stopping may end a run sooner).

    Returns
    -------
    list of dict
        One ``{"name": ..., "results": ...}`` entry per architecture, where
        ``results`` is the mean over the recorded runs, or NaN if none qualified.
    """
    # Stop a run once the validation loss has not improved for 4 epochs.
    callback_3_loss = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=4)

    results = []

    for model_name in mode_names:
        val_accuracies = []

        for _ in range(iterations):
            # A new model per run so every run starts from fresh weights.
            model = prepare_model(model_name)

            # Fit the model using the train and validation data.
            history = model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels), callbacks=[callback_3_loss])

            qualifying = filter_max_accuracy(history)
            if qualifying.size == 0:
                print("Skipping run: no epoch passed the training-accuracy threshold")
                continue

            best_val_acc = max(qualifying)
            val_accuracies.append(best_val_acc)
            print(best_val_acc)
            print(val_accuracies)

        if val_accuracies:
            mean_acc = np.mean(np.squeeze(np.array(val_accuracies)))
        else:
            mean_acc = float("nan")
        results.append(dict(name = model_name, results = mean_acc))

    return results


In [151]:
# Ten training runs per architecture; the trailing expression displays the summary.
mean_results = mean_model_accuracy(models, 10)
mean_results

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8181818127632141
[0.8181818127632141]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
0.8909090757369995
[0.8181818127632141, 0.8909090757369995]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8727272748947144
[0.8181818127632141, 0.8909090757369995, 0.8727272748947144]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
0.8727272748947144
[0.8181818127632141, 0.8909090757369995, 0.8727272748947144, 0.8727272748947144]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
0.8727272748947144
[0.8181818127632141, 0.8909090757369995, 0.8727272748947144, 0.8727272748947144, 0.8727272748947144]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
0.9090909361839294
[0.8181818127632141, 0.8909090757369995, 0.8727272748947144, 0.8727272748947144, 0.8727272748947144, 0.9090909361839294]
Epoch 1/20
Epoch 2/2

[{'name': 'cnn', 'results': 0.8781818211078644},
 {'name': 'transformer', 'results': 0.907272732257843}]

In [152]:
def print_model_score(model):
    """Evaluate ``model`` on the validation split and print its first two scores as loss and accuracy."""
    score = model.evaluate(val_data, val_labels, verbose=0)
    for position, caption in enumerate(("Validation loss:", "Validations accuracy:")):
        print(caption, score[position])

def print_validation_results(predictions):
    """Print every validation sample as tokens together with its label and its prediction."""
    sample_count = len(val_data)
    for idx in range(sample_count):
        tokens = vect_layer_2_text(val_data[idx])
        print("VALIDATION SAMPLE: \n" ,tokens)
        print("LABEL --:" , val_labels[idx])
        print("PREDICTION --:" , predictions[idx])
        print("\n")

In [178]:

# Number of training epochs for the single transformer run below.
epochs= 6
transformer_model = get_transformer_model()

# Fit the model on the training data and evaluate on the validation data after each epoch.
transformer_history = transformer_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [179]:

# Number of training epochs for the single CNN run below.
epochs= 6
cnn_model = get_cnn_model()

# Fit the model on the training data and evaluate on the validation data after each epoch.
# The history gets its own name: it used to be assigned to `transformer_history`,
# which overwrote the transformer run's history.
cnn_history = cnn_model.fit(train_data, train_labels, epochs=epochs, batch_size=6, validation_data=(val_data, val_labels))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [180]:
def print_results(model):
    """Print the validation score of ``model`` followed by every validation sample with its prediction."""
    # Applies to all later numpy printing: five decimals, no scientific notation.
    np.set_printoptions(suppress = True, precision = 5)
    predictions = model.predict(val_data)
    print_model_score(model)
    print("\n")
    print_validation_results(predictions)
  

In [181]:
# Validation score and per-sample predictions of the transformer model.
print("--- TRANSFORMER ---")
print_results(transformer_model)

--- TRANSFORMER ---


Validation loss: 0.1961260288953781
Validations accuracy: 0.9272727370262146


VALIDATION SAMPLE: 
 ['basketball' 'efter' 'X' 'års' 'tørke' 'skal' 'der' 'igen' 'spilles'
 '[UNK]' 'i' '[UNK]' '[UNK]' 'har' 'fået' 'godkendt' 'sin' 'oprykning'
 'og' 'dermed' 'spiller' 'de' 'i' 'landets' 'bedste' '[UNK]' 'den'
 'kommende' 'sæson']
LABEL --: 0
PREDICTION --: [0.00956]


VALIDATION SAMPLE: 
 ['champions' 'league' '[UNK]' 'scorer' 'første' '[UNK]' 'mål' 'mens'
 '[UNK]' 'indtager' 'uheldig' 'hovedrolle' 'mod' '[UNK]' '[UNK]' '[UNK]'
 'tabte' 'onsdag' 'aften' 'X' 'X' 'mod' '[UNK]' '[UNK]' 'i' 'champions'
 'league']
LABEL --: 1
PREDICTION --: [0.99986]


VALIDATION SAMPLE: 
 ['fodbold' 'parkens' 'bane' 'får' '[UNK]' 'til' 'at' 'overveje' 'særligt'
 'tiltag' '[UNK]' '[UNK]' 'ønsker' 'der' 'bliver' 'brugt' 'flere' 'penge'
 'på' 'baner']
LABEL --: 0
PREDICTION --: [0.00292]


VALIDATION SAMPLE: 
 ['sport' 'nu' 'forbydes' 'udbredt' '[UNK]' 'både' 'præstationsfremmende'
 'og' 'sundhedsskadelig' 'fra'

In [182]:

# Validation score and per-sample predictions of the CNN model.
print("--- CNN ---")
print_results(cnn_model)

--- CNN ---
Validation loss: 0.7975172996520996
Validations accuracy: 0.800000011920929


VALIDATION SAMPLE: 
 ['basketball' 'efter' 'X' 'års' 'tørke' 'skal' 'der' 'igen' 'spilles'
 '[UNK]' 'i' '[UNK]' '[UNK]' 'har' 'fået' 'godkendt' 'sin' 'oprykning'
 'og' 'dermed' 'spiller' 'de' 'i' 'landets' 'bedste' '[UNK]' 'den'
 'kommende' 'sæson']
LABEL --: 0
PREDICTION --: [0.00006]


VALIDATION SAMPLE: 
 ['champions' 'league' '[UNK]' 'scorer' 'første' '[UNK]' 'mål' 'mens'
 '[UNK]' 'indtager' 'uheldig' 'hovedrolle' 'mod' '[UNK]' '[UNK]' '[UNK]'
 'tabte' 'onsdag' 'aften' 'X' 'X' 'mod' '[UNK]' '[UNK]' 'i' 'champions'
 'league']
LABEL --: 1
PREDICTION --: [0.99996]


VALIDATION SAMPLE: 
 ['fodbold' 'parkens' 'bane' 'får' '[UNK]' 'til' 'at' 'overveje' 'særligt'
 'tiltag' '[UNK]' '[UNK]' 'ønsker' 'der' 'bliver' 'brugt' 'flere' 'penge'
 'på' 'baner']
LABEL --: 0
PREDICTION --: [0.00027]


VALIDATION SAMPLE: 
 ['sport' 'nu' 'forbydes' 'udbredt' '[UNK]' 'både' 'præstationsfremmende'
 'og' 'sundhedsskad

In [183]:
import os

# Take the first weight array of the model's layer at index 1.
# NOTE(review): presumably the token-embedding matrix, given the printed (3000, 96) shape — confirm.

ll = transformer_model.layers[1]
ll_weights = ll.get_weights()[0]

print(ll_weights.shape)
ll_weights


(3000, 96)


array([[ 0.02131, -0.01997, -0.01069, ...,  0.00469,  0.00862,  0.00129],
       [ 0.0328 , -0.0461 , -0.02971, ..., -0.02649,  0.00621, -0.01536],
       [ 0.02183, -0.05466, -0.06168, ..., -0.07601,  0.04855,  0.01207],
       ...,
       [-0.00248,  0.03172, -0.0352 , ..., -0.00791, -0.04032, -0.01334],
       [ 0.001  , -0.01721,  0.03958, ..., -0.0372 , -0.004  , -0.00472],
       [ 0.01472,  0.00941, -0.04858, ..., -0.00649, -0.01509, -0.0265 ]],
      dtype=float32)

In [184]:
##import I/O module in python
import io

# Export the weights as two tab-separated files: one row of values per word in
# vectors.tsv and the matching word per row in meta.tsv.
# The with-statement closes both files even if a write fails part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open('meta.tsv', 'w', encoding='utf-8') as meta:
    # Start at 1 to skip vocabulary entry 0 (presumably the padding token — confirm).
    for index in range(1, len(vect_vocab)):
        word = vect_vocab[index]
        embeddings = ll_weights[index]
        meta.write(word + "\n")
        vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [185]:
from nltk import collocations
# Small experiment with nltk collocations on a hard-coded English word list.
bigram_measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(["New", "York", "is", "big", "New", "York", "is", "pretty"])
# Display each bigram with its count.
finder.ngram_fd.items()



dict_items([(('New', 'York'), 2), (('York', 'is'), 2), (('is', 'big'), 1), (('big', 'New'), 1), (('is', 'pretty'), 1)])

In [186]:
import lemmy
# Create an instance of the standalone lemmatizer for Danish ("da").
lemmatizer = lemmy.load("da")

# Look up the lemma of 'storsejr'; the first argument is the POS tag ("NOUN" here).
lemmatizer.lemmatize("NOUN", "storsejr")



['storsejr']

In [187]:
import nltk as nltk
# from string import punctuation
# from nltk.corpus import stopwords
# nltk.download('stopwords')

# da_stopwords = stopwords.words("danish")


In [225]:
# Wrap the vectorizer and the trained transformer into one model that accepts raw strings.
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = transformer_model(indices)

# Our end to end model
end_to_end_model = tf.keras.Model(inputs, outputs)
# Compiled with the same loss, optimizer and metric as the inner model; no fit is run in this cell.
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)


In [226]:
# Manual spot checks on hand-written Danish headlines, grouped by sport and by
# whether the headline is meant to read as a result or as other news.
print("\nSkisport, Results: ")
print( end_to_end_model.predict(
    [
    "Skisport Buller overrasker alle og gør det umulige. Ny rekord sikrer flerer medalje til Norge",
    "Skisport Buller overrasker alle. Hollænderen havde masser af overskud, da han kom i mål som numkmer to",
     ]))

print("\nSkisport, other: ")

print( end_to_end_model.predict(
    [
    "Skisport Sverige drømmer om flere medaljer og sejre til næste års OL. Træner har et godt øje til hidtil ukendt talent",
    "Skisport Flere spår medaljer til Sverige ved kommende OL. 'Vi kommer til at se mange gode præstationer'",
    "Skisport 'Denne gang er det alvor'. Buller vil vinde dette års OL",
     ]))

print("\nFodbold, Results:")

print(end_to_end_model.predict(
    [
      "Fodbold Fjerritslev vinder over Vordingborg. Hjemmeholdets træner kommenterer på historisk kamp",
      "Fodbold Fjerritslev sejrer over Vordingborg. Hjemmeholdets træner kommenterer på historisk kamp",
     ]))

print("\nFodbold, other:")
print(end_to_end_model.predict(
    [
     "Fodbold Fjerritslevs træner forventer sejr over Vordingborg. Det bliver en historisk kamp",
      "Fodbold Træner forventer at Fjerritslev vinder over Vordingborg. Det bliver en historisk kamp",
      "Fodbold Fjerritslev forventes at vinde over Vordingborg. Hjemmeholdets publikum er ikke i tvivl",
      "Fodbold 'Fjerritslev må og skal vinde over Vordingborg', udtaler hjemmeholdets træner. Vi kan ikke leve med andet",
     ]))


Skisport, Results: 
[[0.97874]
 [0.90872]]

Skisport, other: 
[[0.00995]
 [0.13936]
 [0.1064 ]]

Fodbold, Results:
[[0.94201]
 [0.52605]]

Fodbold, other:
[[0.03791]
 [0.056  ]
 [0.03656]
 [0.0002 ]]
