In [489]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict
def get_sports():
    df_sport_latest = pd.read_csv('sports_articles.csv', encoding = "ISO-8859-1")
    df_sport_2022 = pd.read_csv('sports_articles_2022.csv', encoding = "ISO-8859-1")
    df = pd.concat([df_sport_latest, df_sport_2022])
    df = df.sample(frac=1).reset_index(drop=True)
    return df


In [490]:
def vocab_2_pdset(columns, df):
    df_vocab_select_columns = df.iloc[:, columns]
    vocab_all_values = df_vocab_select_columns.values.ravel()
    return set(vocab_all_values)

def vocab_2_dict(sets):
    assert(len(sets) == 4)
    word_set = sets[0].union(sets[1],sets[2], sets[3])
    df = pd.DataFrame(list(word_set), columns=["Words"])
    df.sort_values(by="Words", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return OrderedDict.fromkeys(word_set)

def get_vocab_dict():
    df_ods_vocab = pd.read_table('ods_fullforms_2020-08-26.csv', header=None)
    df_ddo_vocab = pd.read_table('ddo_fullforms_2020-08-26.csv', header=None)
    df_vocab = pd.read_table('cor1.02.tsv', header=None)
    df_sport_lingo = pd.read_table('sport_lingo.csv', header=None)

    vocab_set = vocab_2_pdset([1,3], df_vocab)
    ods_vocab_set = vocab_2_pdset([0,1], df_ods_vocab)
    ddo_vocab_set = vocab_2_pdset([0,1], df_ddo_vocab)
    sport_lingo_set = vocab_2_pdset([0], df_sport_lingo)

    return vocab_2_dict([vocab_set, ods_vocab_set, ddo_vocab_set, sport_lingo_set])

ordered_dict = get_vocab_dict()


In [491]:
df_sport = get_sports()

In [492]:
import time
# isin_dict = False
def test_lookup_performance():
    word_to_check = "pligtsejrer"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

test_lookup_performance()

# isin_dict


True


In [493]:
import re


df_sport_text = df_sport.iloc[:, [0,1,2]]

sport_vocab = df_sport_text.values.ravel()

words_arr = []

def replace_digits(word):
    return re.sub(r'\d+', 'X', word)

def remove_specials(word):
    characters_to_remove = [':', "'", '?', ",", "."]
    new_word = word

    for char in characters_to_remove:
        new_word = new_word.replace(char, '')
    return new_word

def contains_non_alphanumeric(word):
    return bool(re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word))

def formatWord(word):
    if any(char.isdigit() for char in word):
        return replace_digits(word)
    
    
for sentences in range(len(sport_vocab)):
    # print(sport_vocab[sentences])
    sentence = sport_vocab[sentences].strip()
    words = sentence.split()
    for word in range(len(words)):
        w = words[word]
        # if any(char.isdigit() for char in w):
        #     w = replace_digits(w)
        # if contains_non_alphanumeric(w):
        #     w = remove_specials(w)
        words_arr.append(w.lower())

words_sport_unique = set(words_arr)
words_sport_unique_list = list(words_sport_unique)
words_sport_lingo = []
words_train_vocab = []

# TODO : brug tensorflow Tokenezier til at omdanne ord til tokens
# TODO : søg i alle leksikoner, søg med og uden bindestreg
# TODO : håndter tal ikke i ordbøger eks ( x-x eller x-årig)

for w in range(len(words_sport_unique)):
    isin_dict = words_sport_unique_list[w] in ordered_dict
    if (isin_dict == False):
        words_sport_lingo.append(words_sport_unique_list[w])
    else:
        words_train_vocab.append(words_sport_unique_list[w])

print("total unique words:", len(words_sport_unique) )
print("total sports lingo words:", len(words_sport_lingo) )
print("total vocab:", len(words_train_vocab))
print("total articles:", len(df_sport) )


total unique words: 2739
total sports lingo words: 978
total vocab: 1761
total articles: 290


In [494]:
words_sport_lingo
file = open('sport_lingo.txt','w')
for item in words_sport_lingo:
	file.write(item+"\n")
file.close()


file = open('sport_vocab.txt','w')
for item in words_train_vocab:
	file.write(item+"\n")
file.close()

In [495]:
def format_2_bool(x):
    if type(x) == bool:
        return x
    assert(type(x) == str)
    x_copy = x
    x_copy = x_copy.strip()
    x_copy = x_copy.lower()
    assert(x_copy == "true" or x_copy == "false")
    if x_copy == "true":
        return True
    else:
        return False

In [496]:

df_sport_labels = df_sport['isResult'].apply(lambda x: format_2_bool(x))

results_true = df_sport_labels.loc[df_sport_labels== True]
results_false = df_sport_labels.loc[df_sport_labels == False]

assert(len(results_true) + len(results_false) == len(df_sport_labels))

print(len(results_true))
print(len(results_false))
labels = df_sport_labels.to_numpy().astype(int)

print(df_sport_labels.to_numpy())
labels




146
144
[ True False False False  True False False False False False False  True
 False  True False  True False  True False  True  True  True False False
  True False  True False False  True  True False False  True False False
  True False  True False False  True False  True  True False False False
 False  True False False False False  True False  True  True False  True
 False False  True False False False False False  True  True False False
  True  True False False  True False  True  True False  True  True  True
 False  True  True  True  True False False  True False False  True False
 False  True False  True False  True  True  True False False False  True
  True False  True False  True  True False False  True False False  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False False  True  True False False False  True  True False False  True
  True  True  True False  True  True  True False  True False  True False
  True  True False  True  True  True  True 

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0])

In [497]:
df_sport_text = df_sport.iloc[:, [0,1,2]]


df_sport_text
df_sport_text_combined = df_sport_text.apply(' '.join, axis=1)
train_text = df_sport_text.apply(' '.join, axis=1).to_numpy()


print (train_text[280:290])
# df_sport_text_combined.to_numpy().shape

df_sport_labels[280:290]

df_sport_text[280:290]
print(len(labels))
print(len(train_text))
print(labels.shape)
print(train_text.shape)
print("longest text: ", len(max(train_text, key=len)))

['HERRELANDSHOLDET  Så længe landsholdet vinder, behøver det ikke være en berusende fodboldfest  Det danske landshold ved godt, at de skal vinde, når de møder Finland i en afgørende gruppekamp i EM-kvalifikation '
 'KVINDELANDSHOLDET  Dobbelt dansk målscorer sikrer perfekt start på ny landsholdsperiode  Danmarks fodboldlandshold har fredag aften slået Tyskland 2-0 i den første Nations League-kamp '
 'EM HÅNDBOLD  KARAKTERER Trods storsejr får kun tre danskere høje karakterer:\xa0Bedøm selv spillerne her  Dyk ned i karaktererne til de danske spillere efter sejren over Kroatien i den anden kamp i mellemrunden ved EM '
 'BADMINTON  Danskers nye superserv vækker bekymring - topspillere kræver forbud  Marcus Rindshøj har nyfortolket en serv, som får Det Internationale Badmintonforbund, BWF, op af stolen '
 "CYKLING  Vingegaard har fundet sit særlige supervåben frem:\xa0'Han kan noget, som ingen andre kan'  Jonas Vingegaard virker igen til at have fundet topniveauet i den sidste uge af et af

In [498]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

def replace_digits(word):
    return tf.strings.regex_replace(word, pattern=r'\d+', rewrite=r'X')

def remove_specials(word):
    return tf.strings.regex_replace(word, pattern=r'[:,\'\.]', rewrite=r'')

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return remove_specials(replace_digits(lowercase))


# Model constants.
max_features = 2000
embedding_dim = 128
sequence_length = 400

# Now that we have our custom standardization, we can instantiate our text
# vectorization layer. We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# Note that we're using the default split function,
# and the custom standardization defined above.
# We also set an explicit maximum sequence length, since the CNNs later in our
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

text_ds = vectorize_layer.adapt(words_train_vocab)

vect_vocab = vectorize_layer.get_vocabulary()

text_vec = vectorize_layer([train_text[5]])

def vect_layer_2_text(vect_l):
    return np.array([vect_vocab[x] for x in np.squeeze(vect_l.numpy())])


# text = vect_layer_2_text(text_vec)

print(text_vec)
print(len(vect_vocab))
vect_layer_2_text(text_vec)





tf.Tensor(
[[1424    1  133 1163 1095 1047  730  757    1  241 1716 1723 1495 1100
   809  607 1527 1085 1252  798  647 1065 1402  791    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  

array(['em', '[UNK]', 'usynlige', 'helte', 'hyldes', 'inden',
       'nabobraget', 'mod', '[UNK]', 'tirsdag', 'aften', 'afgøres', 'det',
       'hvor', 'mange', 'point', 'danmarks', 'håndboldkvinder', 'får',
       'med', 'over', 'i', 'em-turneringens', 'mellemrunde', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '

In [499]:


train_ds = vectorize_layer(train_text)

train_data = train_ds[0:230]
val_data = train_ds[230:]

train_labels = labels[0:230]
val_labels = labels[230:]

train_ds
val
# labels.shape

<tf.Tensor: shape=(90, 400), dtype=int64, numpy=
array([[1654,  101, 1445, ...,    0,    0,    0],
       [1654,    1,   83, ...,    0,    0,    0],
       [1654,    1,    1, ...,    0,    0,    0],
       ...,
       [1424,    1,  904, ...,    0,    0,    0],
       [1424,    1,  133, ...,    0,    0,    0],
       [   1,  903,  281, ...,    0,    0,    0]])>

In [506]:
from tensorflow.keras import layers

# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
# x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 32, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 32, padding="valid", activation="relu", strides=3)(x)

x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [507]:
epochs = 10
# Fit the model using the train and test datasets.
model.fit(train_data, train_labels, epochs=epochs, batch_size=8, validation_data=(val_data, val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4b8215f9a0>

In [502]:
score = model.evaluate(val_data, val_labels, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 0.4993991255760193
Test accuracy: 0.8833333253860474


In [503]:
np.set_printoptions(precision = 2, suppress = True)
predictions = model.predict(val_data[10:30])




print(predictions)
print("labels:")
print(val_labels[10:30])

[[0.99]
 [0.  ]
 [0.23]
 [1.  ]
 [0.98]
 [0.02]
 [0.  ]
 [0.  ]
 [1.  ]
 [1.  ]
 [1.  ]
 [1.  ]
 [0.12]
 [0.73]
 [0.  ]
 [0.88]
 [1.  ]
 [0.  ]
 [1.  ]
 [0.99]]
labels:
[1 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 1 0]


In [504]:
num = np.array([1.8e-10, 1.586, 150.45, 0.2855])
 
# Suppressing 1-D numpy array with precision 2
# using numpy.set_printoptions()
print("Numpy array values with precision 2:\n")

print(num)

Numpy array values with precision 2:

[  0.     1.59 150.45   0.29]


In [505]:
print(vect_layer_2_text(val_data[11]))
val_labels[11]

['badminton' 'vi' 'er' 'blandt' 'de' 'bedste' 'i' '[UNK]' 'trofæ' 'giver'
 'badmintonstjerner' 'tro' 'på' 'ol-succes' 'weekendens' 'sejr' 'i'
 '[UNK]' 'er' 'endnu' 'et' 'bevis' 'på' 'at' 'herredoublen' 'kim' '[UNK]'
 'og' '[UNK]' '[UNK]' '[UNK]' 'er' 'i' 'storform' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '

1