# Einlesen der Trainings-Daten

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import layers

train_data = pd.read_csv(
    "./fragen/train.csv",
    usecols= ['question1', 'question2', 'is_duplicate'],
    encoding='utf-8'
)

train_data.dropna(inplace=True)
train_data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Create the Dataset

In [2]:
target = train_data.pop('is_duplicate')

ds_raw = tf.data.Dataset.from_tensor_slices((train_data.values, target.values))

for ex in ds_raw.take(3):
    print(ex)

(<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'What is the step by step guide to invest in share market in india?',
       b'What is the step by step guide to invest in share market?'],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
       b'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'How can I increase the speed of my internet connection while using a VPN?',
       b'How can Internet speed be increased by hacking through DNS?'],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


2022-02-25 11:35:51.241867: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-25 11:35:51.277593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-25 11:35:51.277913: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-25 11:35:51.310998: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

# Validation and Trainingsdata

In [3]:
tf.random.set_seed(35)

ds_length = len(ds_raw)
ds_raw_train = ds_raw.take(round(ds_length*0.8)-1)
ds_raw_train_valid = ds_raw.take(round(ds_length*0.2)-1)

# Create dictionary

In [4]:
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

for example in ds_raw_train:
    for i in range(0,2):
        tokens = tokenizer.tokenize(example[0].numpy()[i])
        token_counts.update(tokens)

print("Größe des Wörterbuches: ", len(token_counts))

Größe des Wörterbuches:  99353


# Tokenize dataset

In [5]:
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
encoder.encode('This is a example')

def encode(text_tensor, label):
    text0 = text_tensor.numpy()[0]
    text1 = text_tensor.numpy()[1]
    return encoder.encode(text0), encoder.encode(text1), label

def encode_map_fn(text, label):
    return tf.py_function(encode,
                          inp=[text, label],
                          Tout=[tf.int64, tf.int64, tf.int64])

ds_train = ds_raw_train.map(encode_map_fn)
ds_train_valid = ds_raw_train_valid.map(encode_map_fn)


# Batch data

In [6]:
train_data_batch = ds_train.padded_batch( 32, padded_shapes=([-1],[-1],[]))

valid_data_batch = ds_train_valid.padded_batch( 32, padded_shapes=([-1],[-1],[]))

train_data_batch = train_data_batch.map(lambda x1, x2, y: ((x1, x2), y))
valid_data_batch = valid_data_batch.map(lambda x1, x2, y: ((x1, x2), y))
for x in train_data_batch.take(1):
    print(x)

((<tf.Tensor: shape=(32, 29), dtype=int64, numpy=
array([[  1,   2,   3,   4,   5,   4,   6,   7,   8,   9,  10,  11,   9,
         12,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  1,   2,   3,  13,  14,  15,  16,  17,  18,  19,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 28,  29,  30,  31,   3,  32,  14,  33,  34,  35,  36,  37,  38,
         39,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 46,  47,  30,  48,  49,  50,  28,  29,  30,  51,  52,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 60,  61,  62,   9,  63,  64,  65,  66,  67,  68,  69,  70,  71,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 74,  30,  47,  38,  75,  76,  77,  78,  68,  79,  80,  81,  82,
         83,  84,  85,  86,

# Create the model

In [7]:
from keras.layers import Embedding, SimpleRNN, Dense

lstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = False))

size_dic = len(token_counts) + 2

emb = Embedding(size_dic, 100, input_length=300)

input1 = tf.keras.Input(shape=(300,))
input2 = tf.keras.Input(shape=(300,))

e1 = emb(input1)
e2 = emb(input2)
x1 = lstm_layer(e1)
x2 = lstm_layer(e2)

mhd = lambda x: tf.keras.backend.abs(x[0] - x[1])
merged = tf.keras.layers.Lambda(function=mhd, output_shape= lambda x: x[0])([x1, x2])
preds = tf.keras.layers.Dense(1)(merged)
model = tf.keras.Model(inputs=[input1, input2], outputs=preds)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 300, 100)     9935500     ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 128)          84480       ['embedding[0][0]',          

# Train the model

In [8]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])
history = model.fit(train_data_batch, validation_data=valid_data_batch, epochs=5)

Epoch 1/5
    1/10108 [..............................] - ETA: 14:16:24 - loss: 4.1430 - accuracy: 0.6562

2022-02-25 11:36:47.199472: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8301


  642/10108 [>.............................] - ETA: 5:54 - loss: 0.8075 - accuracy: 0.6121

KeyboardInterrupt: 

# Save model

In [None]:
import os
test_results = model.evaluate(train_data)
print(f'Test acc : {test_results}')
if not os.path.exists('models'):
    os.mkdir('models')

model.save('models/goodone.h5')