In [1172]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import OrderedDict
def get_sports():
    df_sport_latest = pd.read_csv('sports_articles.csv', encoding = "ISO-8859-1")
    df_sport_2020 = pd.read_csv('sports_articles_2020.csv', encoding = "ISO-8859-1")
    df_sport_2022 = pd.read_csv('sports_articles_2022.csv', encoding = "ISO-8859-1")
    df = pd.concat([df_sport_latest, df_sport_2020, df_sport_2022])
    df = df.sample(frac=1).reset_index(drop=True)
    return df


In [1173]:
def vocab_2_pdset(columns, df):
    df_vocab_select_columns = df.iloc[:, columns]
    vocab_all_values = df_vocab_select_columns.values.ravel()
    return set(vocab_all_values)

def vocab_2_dict(sets):
    assert(len(sets) == 4)
    word_set = sets[0].union(sets[1],sets[2], sets[3])
    df = pd.DataFrame(list(word_set), columns=["Words"])
    df.sort_values(by="Words", inplace=True)
    df.reset_index(drop=True, inplace=True)
    return OrderedDict.fromkeys(word_set)

def get_vocab_dict():
    df_ods_vocab = pd.read_table('ods_fullforms_2020-08-26.csv', header=None)
    df_ddo_vocab = pd.read_table('ddo_fullforms_2020-08-26.csv', header=None)
    df_vocab = pd.read_table('cor1.02.tsv', header=None)
    df_sport_lingo = pd.read_table('sport_lingo.csv', header=None)

    vocab_set = vocab_2_pdset([1,3], df_vocab)
    ods_vocab_set = vocab_2_pdset([0,1], df_ods_vocab)
    ddo_vocab_set = vocab_2_pdset([0,1], df_ddo_vocab)
    sport_lingo_set = vocab_2_pdset([0], df_sport_lingo)

    return vocab_2_dict([vocab_set, ods_vocab_set, ddo_vocab_set, sport_lingo_set])

ordered_dict = get_vocab_dict()


In [1174]:
df_sport = get_sports()

In [1175]:
import time
# isin_dict = False
def test_lookup_performance():
    word_to_check = "linebreak"
    start_time = time.time()

    for x in range(1000000):
        isin_dict = word_to_check in ordered_dict

    end_time = time.time()  
    assert(end_time - start_time < 1)
    print(isin_dict)

test_lookup_performance()

# isin_dict


True


In [1176]:
import re


df_sport_text = df_sport.iloc[:, [0,1,2]]

train_text = df_sport_text.apply(' '.join, axis=1).to_numpy()

words_arr = []

def replace_digits(word):
    return re.sub(r'\d+', 'X', word)

def remove_specials(word):
    characters_to_remove = [':', "'", '?', ",", "."]
    new_word = word

    for char in characters_to_remove:
        new_word = new_word.replace(char, '')
    return new_word

def contains_non_alphanumeric(word):
    return bool(re.search(r'[^a-zæøåA-ZÆØÅ0-9]', word))

def formatWord(word):
    if any(char.isdigit() for char in word):
        return replace_digits(word)
    
    
for sentences in range(len(train_text)):
    # print(sport_vocab[sentences])
    sentence = train_text[sentences].strip()
    words = sentence.split()
    for word in range(len(words)):
        w = words[word]
        if contains_non_alphanumeric(w):
            w = remove_specials(w)
        words_arr.append(w.lower())

words_sport_unique = set(words_arr)
words_sport_unique_list = list(words_sport_unique)
words_sport_lingo = []
words_train_vocab = []

# TODO : brug tensorflow Tokenezier til at omdanne ord til tokens
# TODO : søg i alle leksikoner, søg med og uden bindestreg
# TODO : håndter tal ikke i ordbøger eks ( x-x eller x-årig)

for w in range(len(words_sport_unique_list)):
    word = words_sport_unique_list[w]
    if any(char.isdigit() for char in word):
        words_train_vocab.append(word)
    else: 
        isin_dict = word in ordered_dict
        if (word == "linebreak"):
            print("found")
        if (isin_dict == False):
            words_sport_lingo.append(word)
        else:
            words_train_vocab.append(word)

print("total unique words:", len(words_sport_unique) )
print("total sports lingo words:", len(words_sport_lingo) )
print("total vocab:", len(words_train_vocab))
print("total articles:", len(df_sport) )


total unique words: 2538
total sports lingo words: 539
total vocab: 1999
total articles: 299


In [1177]:
words_sport_lingo
file = open('sport_lingo.txt','w')
for item in words_sport_lingo:
	file.write(item+"\n")
file.close()


file = open('sport_vocab.txt','w')
for item in words_train_vocab:
	file.write(item+"\n")
file.close()

In [1178]:
def format_2_bool(x):
    if type(x) == bool:
        return x
    assert(type(x) == str)
    x_copy = x
    x_copy = x_copy.strip()
    x_copy = x_copy.lower()
    assert(x_copy == "true" or x_copy == "false")
    if x_copy == "true":
        return True
    else:
        return False

In [1179]:

df_sport_labels = df_sport['isResult'].apply(lambda x: format_2_bool(x))

results_true = df_sport_labels.loc[df_sport_labels== True]
results_false = df_sport_labels.loc[df_sport_labels == False]

assert(len(results_true) + len(results_false) == len(df_sport_labels))

print(len(results_true))
print(len(results_false))
labels = df_sport_labels.to_numpy().astype(int)




154
145


In [1180]:
# df_sport_text = df_sport.iloc[:, [0,1,2]]


# df_sport_text
# df_sport_text_combined = df_sport_text.apply(' '.join, axis=1)
# train_text = df_sport_text.apply(' linebreak '.join, axis=1).to_numpy()



print(len(labels))
print(len(train_text))
print(labels.shape)
print(train_text.shape)
print("longest text: ", len(max(train_text, key=len)))
train_text[0]

299
299
(299,)
(299,)
longest text:  312


"BASKETBALL  Dansk basketstjerne fik aldrig en ægte chance i USA - nu kan han ikke gå i fred i Bologna  Gabriel 'Iffe' Lundberg nyder stor succes i Europas bedste basketball-liga efter en hektisk periode "

In [1181]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re


def replace_digits(word):
    return tf.strings.regex_replace(word, pattern=r'\d+', rewrite=r'X')

def remove_specials(word):
    return tf.strings.regex_replace(word, pattern=r'[:,\'\.]', rewrite=r'')

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    replaced_digits = replace_digits(lowercase)
    removed_specials = remove_specials(replaced_digits)
    return removed_specials


# Model constants.
max_features = 2000
embedding_dim = 64
sequence_length = 350

# Now that we have our custom standardization, we can instantiate our text
# vectorization layer. We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# Note that we're using the default split function,
# and the custom standardization defined above.
# We also set an explicit maximum sequence length, since the CNNs later in our
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

text_ds = vectorize_layer.adapt(words_train_vocab)

vect_vocab = vectorize_layer.get_vocabulary()

text_vec = vectorize_layer([train_text[0]])

def vect_layer_2_text(vect_l):
    return np.array([vect_vocab[x] for x in np.trim_zeros(np.squeeze(vect_l.numpy()))])


# text = vect_layer_2_text(text_vec)

print(text_vec)
print(len(vect_vocab))
vect_layer_2_text(text_vec)





tf.Tensor(
[[1827 1681    1 1485 1892 1538   21 1723 1169    1    1  779 1097 1288
  1160 1304 1169 1395 1169    1    1    1    1  776  376  352 1169 1511
  1814    1 1580 1538 1280  685    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  

array(['basketball', 'dansk', '[UNK]', 'fik', 'aldrig', 'en', 'ægte',
       'chance', 'i', '[UNK]', '[UNK]', 'nu', 'kan', 'han', 'ikke', 'gå',
       'i', 'fred', 'i', '[UNK]', '[UNK]', '[UNK]', '[UNK]', 'nyder',
       'stor', 'succes', 'i', 'europas', 'bedste', '[UNK]', 'efter', 'en',
       'hektisk', 'periode'], dtype='<U10')

In [1182]:


train_ds = vectorize_layer(train_text)

train_data = train_ds[0:240]
val_data = train_ds[240:]

train_labels = labels[0:240]
val_labels = labels[240:]

train_ds
val
# labels.shape

<tf.Tensor: shape=(90, 400), dtype=int64, numpy=
array([[1654,  101, 1445, ...,    0,    0,    0],
       [1654,    1,   83, ...,    0,    0,    0],
       [1654,    1,    1, ...,    0,    0,    0],
       ...,
       [1424,    1,  904, ...,    0,    0,    0],
       [1424,    1,  133, ...,    0,    0,    0],
       [   1,  903,  281, ...,    0,    0,    0]])>

In [1183]:
from tensorflow.keras import layers

# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 20, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 20, padding="valid", activation="relu", strides=3)(x)

x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [1184]:
epochs = 12
# Fit the model using the train and test datasets.
model.fit(train_data, train_labels, epochs=epochs, batch_size=4, validation_data=(val_data, val_labels))

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7f4b96f631f0>

In [1185]:
score = model.evaluate(val_data, val_labels, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 1.4483816623687744
Test accuracy: 0.7796609997749329


In [1186]:
np.set_printoptions(precision = 2, suppress = True)
predictions = model.predict(val_data)

# print(predictions)
# print("labels:")
# print(val_labels[0:10])

In [1187]:
for x in range(len(val_data)):
    print("\n" ,vect_layer_2_text(val_data[x]))
    print("-- LABEL --:" , val_labels[x])
    print("-- Prediction --:" , predictions[x])



 ['superliga' '[UNK]' '[UNK]' 'vandt' 'dramatisk' 'topopgør' 'mod' '[UNK]'
 'weekendens' 'mål' 'her' 'se' 'alle' 'mål' 'fra' 'alle' 'weekendens'
 'superligakampe' 'her' 'i' 'artiklen']
-- LABEL --: 1
-- Prediction --: [1.]

 ['kvindelandsholdet' '[UNK]' 'jubler' 'over' 'hattrick' 'og' '[UNK]'
 'kunne' 'ikke' 'være' 'bedre' '[UNK]' '[UNK]' 'glæder' 'sig' 'over' 'at'
 'hun' '[UNK]' 'og' 'resten' 'af' 'landsholdet' '[UNK]' 'for' 'alvor'
 'har' 'fået' 'gang' 'i' 'den' 'offensive' 'produktion']
-- LABEL --: 1
-- Prediction --: [0.]

 ['basketball' 'efter' 'X' 'års' 'tørke' 'skal' 'der' 'igen' 'spilles'
 '[UNK]' 'i' '[UNK]' '[UNK]' 'har' 'fået' 'godkendt' 'sin' 'oprykning'
 'og' 'dermed' 'spiller' 'de' 'i' 'landets' 'bedste' '[UNK]' 'den'
 'kommende' 'sæson']
-- LABEL --: 0
-- Prediction --: [0.]

 ['boksning' 'rørt' 'boksestjerne' 'genvinder' 'vm-titlen' 'og' 'hiver'
 'det' 'eftertragtede' 'bælte' 'hjem' '[UNK]' 'har' 'fået' 'sin' 'egen'
 'faste' 'plads' 'med' 'dommerstemmerne' 'X-X' 'fors