### Importing the data

In [None]:
import pickle

def load_set(directory):
    """Load the preprocessed texts and labels pickled under *directory*.

    Reads ``{directory}/texts.pkl`` and ``{directory}/labels.pkl`` and returns
    their unpickled contents.

    Args:
        directory: Path of the folder holding ``texts.pkl`` and ``labels.pkl``.

    Returns:
        A ``(processed_texts, labels)`` tuple with the two unpickled objects.

    Raises:
        FileNotFoundError: If either pickle file is missing. A hint to run the
            preprocessing notebook is printed before the error propagates.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # load files that were produced by a trusted source.
    try:
        with open(f"{directory}/texts.pkl", "rb") as fp:
            processed_texts = pickle.load(fp)

        with open(f"{directory}/labels.pkl", "rb") as fp:
            labels = pickle.load(fp)
    except FileNotFoundError:
        # Only the missing-file case gets the hint. Re-raising keeps the real
        # cause visible instead of failing later on unbound local variables.
        print(f'{directory} files not found. Please run the preprocess.ipynb before!')
        raise

    return processed_texts, labels

In [None]:
# Load the pickled texts and labels for each split from the 'train', 'val'
# and 'test' directories.
# NOTE(review): later cells refer to `processed_val_texts` and
# `processed_test_texts`, not the `val_processed_texts` /
# `test_processed_texts` names bound here -- confirm which naming is intended.
processed_texts, labels = load_set('train')
val_processed_texts, val_labels = load_set('val')
test_processed_texts, test_labels = load_set('test')

#### Tokenizing the posts

In [None]:
import spacy

# Load spaCy's 'en_core_web_sm' pipeline with the 'parser' and 'ner'
# components disabled; the cells below only read token-level attributes
# (lemma, stop-word flag, punctuation flag).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Bare expression: shows the names of the pipeline components as cell output.
nlp.pipe_names

In [None]:
# for doc in nlp.pipe(["some text", "some other text"]):
#     print(doc._.trf_data.last_hidden_layer_state)

In [None]:
# Sanity check of the pipeline: print the lemmas of a sample sentence,
# skipping stop words and punctuation tokens.
doc = nlp('I was reading the paper.')
print([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

In [None]:
def preprocess_set(set, max_words=500):
    """Lemmatize the texts of a data split and return them with their labels.

    Each text is first cut to its first ``max_words`` whitespace-separated
    words, then run through the module-level spaCy pipeline ``nlp``. Stop-word
    tokens are dropped and the lemmas of the remaining tokens are joined with
    single spaces.

    Args:
        set: Data split indexable by the ``'text'`` and ``'class'`` keys
            (presumably a pandas DataFrame -- confirm against the caller).
            The name shadows the builtin but is kept so existing keyword
            callers keep working.
        max_words: Maximum number of whitespace-separated words kept per text
            before lemmatization. Defaults to 500, the previously hard-coded
            limit.

    Returns:
        A ``(processed_texts, labels)`` tuple: a list with one lemmatized
        string per input text, and the ``'class'`` values as a NumPy array.
    """
    # Read both columns up front so a missing key fails before the (slow)
    # spaCy pass. No copy of the input is needed: nothing below mutates it.
    raw_texts = set['text']
    raw_labels = set['class']

    truncated = [' '.join(text.split()[:max_words]) for text in raw_texts]

    # nlp.pipe streams the documents in batches.
    processed_texts = [
        ' '.join(token.lemma_ for token in doc if not token.is_stop)
        for doc in nlp.pipe(truncated)
    ]

    labels = np.array(raw_labels)

    return processed_texts, labels

In [None]:
# Bare expression: shows the length of `strat_train_set` as cell output
# (`strat_train_set` is not defined in the cells visible here).
len(strat_train_set)

In [None]:
# Lemmatize the training split. This rebinds `processed_texts` and `labels`,
# replacing the values loaded from the 'train' pickles earlier.
processed_texts, labels = preprocess_set(strat_train_set)

In [None]:
# Lemmatize the validation split. `val_labels` is rebound here, replacing the
# value loaded from the 'val' pickles earlier.
processed_val_texts, val_labels = preprocess_set(strat_val_set)

#### Creating the Tokenizer

In [None]:
# from keras_preprocessing.text import Tokenizer       # type: ignore

# tokenizer = Tokenizer(num_words=3000, oov_token='<UNK>')
# tokenizer.fit_on_texts(processed_texts)

In [None]:
# sequences = tokenizer.texts_to_sequences(processed_texts)

In [None]:
# sequence_lengths = [len(sequence) for sequence in sequences if len(sequence) < 1000]

In [None]:
# plt.hist(sequence_lengths, bins=100, color='blue', align='left')
# plt.xlabel('Sequence Length')
# plt.ylabel('Number of Texts')
# plt.title('Distribution of Sequence Lengths')
# plt.xlim(0, 200)
# plt.xticks(range(0, max(sequence_lengths) + 1, 100))
# plt.show()

In [None]:
# from keras.preprocessing.sequence import pad_sequences

# max_len=200

# def get_sequences(tokenizer, texts):
#     sequences = tokenizer.texts_to_sequences(texts)
#     padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=max_len)
#     return padded

# padded_train_seq = get_sequences(tokenizer, processed_texts)
# train_labels = np.array(labels)

In [None]:
# word_index = tokenizer.word_index

# sorted(word_index.items(), key=lambda x: x[1])[:10]

#### Creating the model

In [None]:
Sequential = keras.models.Sequential

# The vectorizer is built on its own so its vocabulary can be learned from the
# training texts before it becomes the first layer of the model. Without
# adapt() the layer has no vocabulary and every token maps to the
# out-of-vocabulary index.
# NOTE(review): output_sequence_length=3000 pads every text to 3000 positions,
# while preprocess_set keeps at most 500 words per text -- confirm this length
# is intended.
vectorizer = keras.layers.TextVectorization(max_tokens=3000, output_sequence_length=3000)
vectorizer.adapt(processed_texts)

# Binary classifier: token ids -> 16-dim embeddings -> two stacked
# bidirectional LSTMs -> single sigmoid unit.
model = Sequential([
    vectorizer,
    keras.layers.Embedding(3000, 16),
    keras.layers.Bidirectional(keras.layers.LSTM(20, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(20)),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Train on the lemmatized training texts for at most 20 epochs, evaluating on
# the validation texts after each epoch. Training stops early once
# `val_accuracy` has not improved for 2 consecutive epochs. The returned
# history object is kept in `h`.
# NOTE(review): EarlyStopping is created without `restore_best_weights`, so
# the model presumably keeps the weights of the last epoch run rather than the
# best one -- confirm that is intended.
h = model.fit(
    processed_texts, labels,
    validation_data=(processed_val_texts, val_labels),
    epochs=20,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)
    ]
)

### Evaluating the Model

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Lemmatize the test split; preprocess_set already returns the labels as a
# NumPy array, so no further conversion is needed.
processed_test_texts, test_labels = preprocess_set(strat_test_set)

# The model's first layer is a TextVectorization layer, so it consumes the
# lemmatized strings directly. The former Tokenizer/get_sequences path only
# exists in commented-out cells and would raise NameError here. The `test_seq`
# name is kept because the following cells read it.
test_seq = processed_test_texts

In [None]:
# Bare expression: shows the test inputs and labels as cell output.
test_seq, test_labels

In [None]:
# Evaluate the trained model on the test data; the returned metrics are
# discarded (bound to `_`), so only the progress output is shown.
_ = model.evaluate(test_seq, test_labels)

In [None]:
# The model ends in Dense(1, activation='sigmoid'), so predict() yields one
# probability per sample in a column of width 1.
y_pred = model.predict(test_seq)

# Threshold at 0.5 and flatten to a 1-D vector of 0/1 class labels. Indexing
# each row with [1] (as for a two-unit output) would raise IndexError because
# each row holds a single value.
y_pred = (y_pred > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the label order to [0, 1] so the matrix is always 2x2 and the four-way
# unpacking works even when only one class occurs in the labels/predictions.
tn, fp, fn, tp = confusion_matrix(test_labels, y_pred, labels=[0, 1]).ravel()

# fp + tn is the number of actual negatives; report NaN instead of dividing
# by zero when the split contains none.
negatives = fp + tn
fpr = fp / negatives if negatives else float('nan')
print(f"False Positive Rate: {fpr:.4f}")

# fn + tp is the number of actual positives; same guard as above.
positives = fn + tp
fnr = fn / positives if positives else float('nan')
print(f"False Negative Rate: {fnr:.4f}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of the test predictions, normalized over the true
# labels (each row sums to 100%) and formatted as whole percentages.
ConfusionMatrixDisplay.from_predictions(test_labels, y_pred, normalize="true",
                                        values_format=".0%")
plt.show()