In [0]:
!pip install -U pip
!pip install -U pandas
!pip install -U scikit-learn
!pip install -U keras
!pip install -U keras-bert

In [0]:
import pandas as pd
import numpy as np
from pprint import pprint

In [0]:
%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [0]:
import logging
# Configure the root logger so that records at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

In [0]:
from tqdm import autonotebook as tqdm

In [0]:
# Fixed sequence length: passed to the tokenizer when encoding the datasets and
# used as the input length of every model defined below.
MAX_LEN = 256

In [0]:
#### Downloading Datasets
!wget https://dft-datasets.s3.us-east-2.amazonaws.com/ratings.zip
!wget https://dft-datasets.s3.us-east-2.amazonaws.com/insults.zip
  
!unzip ratings.zip
!unzip insults.zip

### Download Model Assets

In [0]:
from keras_bert import get_pretrained, PretrainedList, \
                       get_checkpoint_paths

In [0]:
# Obtain the pretrained `multi_cased_base` BERT assets (presumably downloaded
# and cached on first use — see the "Download Model Assets" heading).
paths = get_pretrained(PretrainedList.multi_cased_base)

In [0]:
# Resolve the individual asset locations; the `.vocab`, `.config` and
# `.checkpoint` attributes are read further down in this notebook.
checkpoint_paths = get_checkpoint_paths(paths)

In [0]:
# Build the token -> id lookup from the checkpoint's vocabulary file: a token's
# id is the zero-based number of the line it appears on.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the vocabulary contains non-ASCII tokens).
with open(checkpoint_paths.vocab, encoding="utf-8") as file:
    token_dict = {
        token.strip(): k
        for k, token in enumerate(file)  # iterate lazily instead of readlines()
    }
    

### Tokenizer + Preparation of the dataset

In [0]:
from keras_bert import Tokenizer

In [0]:
# Tokenizer backed by the checkpoint vocabulary loaded above.
# NOTE(review): `cased=True` presumably preserves letter case to match the
# cased checkpoint — confirm against the keras-bert documentation.
tokenizer = Tokenizer(token_dict, cased=True)

In [0]:
# Smoke test: encode a short string with a deliberately small `max_len` to
# inspect what `encode` returns.
tokenizer.encode("Rafael Sola de Paula", max_len=5)

In [0]:
# Longer English sample used by the next cells to demonstrate tokenization and
# the effect of `max_len` on the encoded output.
sample_text = "Did you ever hear the tragedy of Darth Plagueis The Wise? I thought not. It's not a story the Jedi would tell you. It's a Sith legend. Darth Plagueis was a Dark Lord of the Sith, so powerful and so wise he could use the Force to influence the midichlorians to create life… He had such a knowledge of the dark side that he could even keep the ones he cared about from dying. The dark side of the Force is a pathway to many abilities some consider to be unnatural. He became so powerful… the only thing he was afraid of was losing his power, which eventually, of course, he did. Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep. Ironic. He could save others from death, but not himself."
pprint(sample_text, width=120)

In [0]:
# Show the token strings the tokenizer produces for the sample text.
tokens = tokenizer.tokenize(sample_text)
pprint(tokens, width=120, compact=True)

In [0]:
# Encode with a small `max_len`; `encode` returns a pair, of which only the
# first element (the token ids) is kept here.
token_indices, _ = tokenizer.encode(sample_text, max_len=10)
pprint(token_indices, width=120, compact=True)

In [0]:
# Encode again at the length the models actually use. Referencing MAX_LEN
# instead of repeating the literal 256 keeps this demo in sync with the
# constant if it is ever changed.
token_indices, _ = tokenizer.encode(sample_text, max_len=MAX_LEN)
pprint(token_indices, width=120, compact=True)

In [0]:
# Alternative dataset (ratings), left here so the experiment can be switched:
# train_set = pd.read_csv("ratings_train.csv")
# test_set = pd.read_csv("ratings_test.csv")

# Active dataset: the insults train/test CSVs extracted from insults.zip.
train_set = pd.read_csv("insults_train.csv")
test_set = pd.read_csv("insults_test.csv")

In [0]:
# Preview the first rows of the training frame.
train_set.head()

In [0]:
def prepare_dataset(dataframe, tokenizer, max_length):
    """Encode every row of ``dataframe`` into fixed-length token ids.

    Args:
        dataframe: Frame whose rows each unpack into exactly ``(text, label)``.
        tokenizer: Object exposing ``encode(text, max_len=...)`` that returns a
            pair whose first element is a sequence of ``max_length`` token ids.
        max_length: Number of token positions per example.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes ``(len(dataframe),
        max_length)`` and ``(len(dataframe),)``; both are created with
        ``np.zeros`` and therefore use its default dtype.
    """
    n_rows = len(dataframe)
    features = np.zeros((n_rows, max_length))
    labels = np.zeros((n_rows,))

    # Wrap the row iterator so a progress bar is shown while encoding.
    progress = tqdm.tqdm(dataframe.iterrows(), total=n_rows)
    for position, (_, row) in enumerate(progress):
        text, label = row
        token_ids, _unused = tokenizer.encode(text, max_len=max_length)
        features[position, :] = token_ids
        labels[position] = label

    return features, labels



In [0]:
# Encode the train and test frames (in that order) with the shared tokenizer
# at the fixed model sequence length.
(X_train, y_train), (X_test, y_test) = (
    prepare_dataset(split, tokenizer, MAX_LEN)
    for split in (train_set, test_set)
)

In [0]:
# Display the encoded training matrix (one row of token ids per example).
X_train

### Baseline 1 - Random Forest

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, roc_curve
from sklearn import metrics

In [0]:
# TfidfTransformer expects a (documents x terms) *count* matrix, but X_train /
# X_test hold one vocabulary id per token position. Feeding them in directly
# would treat "position" as the term and "token id" as its count, so each row
# is first converted into bag-of-token-ids counts over the vocabulary.
from scipy import sparse


def token_id_counts(token_ids, vocab_size):
    """Return a sparse ``(n_samples, vocab_size)`` matrix of token-id counts.

    Args:
        token_ids: 2-D array-like of vocabulary ids, one row per document.
        vocab_size: Number of columns of the result (size of the vocabulary).

    Returns:
        ``scipy.sparse.csr_matrix`` whose entry ``[i, j]`` is how many times
        id ``j`` occurs in row ``i`` of ``token_ids``.
    """
    token_ids = np.asarray(token_ids, dtype=int)
    n_samples, seq_len = token_ids.shape
    rows = np.repeat(np.arange(n_samples), seq_len)
    cols = token_ids.ravel()
    # Duplicate (row, col) pairs are summed by this constructor, which yields
    # exactly the per-document occurrence count of each id.
    return sparse.csr_matrix(
        (np.ones(cols.shape[0]), (rows, cols)),
        shape=(n_samples, vocab_size),
    )


tf_idf = TfidfTransformer()
# Fit the idf weights on the training split only, then reuse them for the test split.
tf_idf_train = tf_idf.fit_transform(token_id_counts(X_train, len(token_dict)))
tf_idf_test = tf_idf.transform(token_id_counts(X_test, len(token_dict)))

In [0]:
# Shallow, class-balanced random forest used as the first baseline.
# `random_state` pins the bootstrap/feature sampling so the reported metrics
# are reproducible across runs; `n_jobs=-1` trains the 1000 trees in parallel
# without changing the fitted model.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
clf.fit(tf_idf_train, y_train)

In [0]:
# Hard class predictions and the score of the second class column
# (`predict_proba(...)[:, 1]`) on the test features.
y_pred = clf.predict(tf_idf_test)
s_pred = clf.predict_proba(tf_idf_test)[:, 1]

In [0]:
# Per-class precision/recall/F1 for the hard predictions, followed by the
# ROC AUC computed from the continuous scores.
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, s_pred))

In [0]:
# Candidate decision thresholds: 101 evenly spaced values spanning the observed
# score range, with the upper end nudged down by 1e-10.
score_low, score_high = s_pred.min(), s_pred.max() - 1e-10
thresholds = np.linspace(score_low, score_high, 101)

is_positive = y_test == 1
is_negative = y_test == 0

# Evaluate every metric at every threshold in a single pass.
sweep = {"precision": [], "recall": [], "f1": [], "tpr": [], "fpr": []}
for cutoff in thresholds:
    flagged = s_pred > cutoff
    sweep["precision"].append(metrics.precision_score(y_test, flagged))
    sweep["recall"].append(metrics.recall_score(y_test, flagged))
    sweep["f1"].append(metrics.f1_score(y_test, flagged))
    sweep["tpr"].append(flagged[is_positive].mean())
    sweep["fpr"].append(flagged[is_negative].mean())

precision, recall, f1, tpr, fpr = (
    np.array(sweep[name]) for name in ("precision", "recall", "f1", "tpr", "fpr")
)

# Despite its name, `best_f1` is the *threshold* at which F1 peaks.
best_f1 = thresholds[np.argmax(f1)]
print(classification_report(y_test, (s_pred > best_f1).astype(int)))

In [0]:
# 2x2 diagnostic panel built from the threshold sweep of the previous cell.
fig = plt.figure(figsize=(12, 8))

# Top-left: recall plotted against precision.
ax = fig.add_subplot(221)
ax.set_title("Recall x Precision")
ax.set_xlabel("Precision")
ax.set_ylabel("Recall")
ax.plot(precision, recall)

# Top-right: precision, recall and F1 as functions of the decision threshold.
ax = fig.add_subplot(222)
ax.set_title("")
ax.plot(thresholds, precision, label="Precision", color="green")
ax.plot(thresholds, recall, label="Recall", color="blue")
ax.plot(thresholds, f1, label="F1", color="red")
# Mark the F1-maximising threshold, as the final BERT plot in this notebook does.
ax.axvline(best_f1, color='k', ls='--')
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()

# Bottom-left: ROC curve (true-positive rate against false-positive rate).
ax = fig.add_subplot(223)
ax.set_title("ROC Curve - AUC")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.plot(fpr, tpr)

# Bottom-right: both rates as functions of the decision threshold.
ax = fig.add_subplot(224)
ax.set_title("Positive Rates")
ax.plot(thresholds, tpr, label="TPR")
ax.plot(thresholds, fpr, label="FPR")
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()
# Use the threshold computed for *this* model rather than a hard-coded value
# (0.57363415) copied from an earlier run.
ax.axvline(best_f1, color='k', ls='--')


fig.tight_layout()

### Baseline 2 - LSTM

In [0]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.optimizers import Adam

In [0]:
# Baseline sequence classifier trained from scratch on the token ids.
model = Sequential([
    # One 16-dimensional vector per vocabulary entry, learned during training.
    Embedding(len(token_dict), 16, input_length=MAX_LEN),
    # First LSTM keeps the whole sequence so the second LSTM can consume it.
    LSTM(8, activation="relu", return_sequences=True),
    # Second LSTM returns only its final output.
    LSTM(4, activation="relu", return_sequences=False),
    # Single sigmoid unit: score in [0, 1] for the binary label.
    Dense(1, activation="sigmoid")
])

In [0]:

# Adam optimizer with binary cross-entropy, matching the single sigmoid output.
model.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])

In [0]:
# Train for 10 epochs.
# NOTE(review): the test split is also passed as validation data here.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

In [0]:
# `predict_proba` / `predict_classes` are no longer available on current Keras
# models, so use `predict` and threshold the sigmoid output explicitly. For a
# single sigmoid unit this matches what those helpers computed (score, and
# score > 0.5 as the class).
s_pred = model.predict(X_test)[:, 0]
y_pred = (s_pred > 0.5).astype(int)

In [0]:
# Per-class report for the hard predictions, then ROC AUC from the scores.
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, s_pred))

In [0]:
# Candidate decision thresholds: 101 evenly spaced values strictly inside the
# observed score range (both ends pulled in by 1e-5).
score_low, score_high = s_pred.min() + 1e-5, s_pred.max() - 1e-5
thresholds = np.linspace(score_low, score_high, 101)

is_positive = y_test == 1
is_negative = y_test == 0

# Evaluate every metric at every threshold in a single pass.
sweep = {"precision": [], "recall": [], "f1": [], "tpr": [], "fpr": []}
for cutoff in thresholds:
    flagged = s_pred > cutoff
    sweep["precision"].append(metrics.precision_score(y_test, flagged))
    sweep["recall"].append(metrics.recall_score(y_test, flagged))
    sweep["f1"].append(metrics.f1_score(y_test, flagged))
    sweep["tpr"].append(flagged[is_positive].mean())
    sweep["fpr"].append(flagged[is_negative].mean())

precision, recall, f1, tpr, fpr = (
    np.array(sweep[name]) for name in ("precision", "recall", "f1", "tpr", "fpr")
)

# Despite its name, `best_f1` is the *threshold* at which F1 peaks.
best_f1 = thresholds[np.argmax(f1)]
print(classification_report(y_test, (s_pred > best_f1).astype(int)))

In [0]:
# 2x2 diagnostic panel built from the threshold sweep of the previous cell.
fig = plt.figure(figsize=(12, 8))

# Top-left: recall plotted against precision.
ax = fig.add_subplot(221)
ax.set_title("Recall x Precision")
ax.set_xlabel("Precision")
ax.set_ylabel("Recall")
ax.plot(precision, recall)

# Top-right: precision, recall and F1 as functions of the decision threshold.
ax = fig.add_subplot(222)
ax.set_title("")
ax.plot(thresholds, precision, label="Precision", color="green")
ax.plot(thresholds, recall, label="Recall", color="blue")
ax.plot(thresholds, f1, label="F1", color="red")
# Mark the F1-maximising threshold, as the final BERT plot in this notebook does.
ax.axvline(best_f1, color='k', ls='--')
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()

# Bottom-left: ROC curve (true-positive rate against false-positive rate).
ax = fig.add_subplot(223)
ax.set_title("ROC Curve - AUC")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.plot(fpr, tpr)

# Bottom-right: both rates as functions of the decision threshold.
ax = fig.add_subplot(224)
ax.set_title("Positive Rates")
ax.plot(thresholds, tpr, label="TPR")
ax.plot(thresholds, fpr, label="FPR")
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()
# Use the threshold computed for *this* model rather than a hard-coded value
# (0.57363415) copied from an earlier run.
ax.axvline(best_f1, color='k', ls='--')


fig.tight_layout()

### BERT - Pretrained

In [0]:
from keras_bert import load_trained_model_from_checkpoint

In [0]:
# Show where the BERT configuration file lives.
checkpoint_paths.config

In [0]:
# Load the pretrained BERT model from its config and checkpoint files.
# It is loaded with `training=False` and `trainable=False`, and with its
# sequence length set to the same MAX_LEN used to encode the datasets.
bert = load_trained_model_from_checkpoint(
    checkpoint_paths.config,
    checkpoint_paths.checkpoint,
    training=False,
    trainable=False,
    seq_len=MAX_LEN
)

In [0]:
# Print the layer-by-layer summary of the loaded BERT model.
bert.summary()

In [0]:
# Second BERT input: all-zero arrays with the same shape and dtype as the
# token-id matrices (presumably segment ids for single-sentence inputs).
segments_train = np.zeros_like(X_train)
segments_test = np.zeros_like(X_test)

In [0]:
# Run BERT on the first five training examples as a sanity check.
out = bert.predict([X_train[0:5, :], segments_train[0:5]])

In [0]:
# Inspect the output for the first of those examples.
out[0, :, :]

### Using BERT embeddings with LSTM


In [0]:
from keras.models import Model
from keras.layers import GlobalAveragePooling1D, Input

In [0]:
# Weights of BERT's "Embedding-Token" layer, reused below to initialise the
# Embedding layer of the LSTM model.
embedding_weights = bert.get_layer("Embedding-Token").get_weights()

In [0]:
# Same LSTM architecture as the baseline, but the embedding layer starts from
# BERT's token-embedding weights instead of a random initialisation.
model = Sequential([
    # 768 must match the width of `embedding_weights`.
    Embedding(len(token_dict), 768, input_length=MAX_LEN, weights=embedding_weights),
    LSTM(8, activation="relu", return_sequences=True),
    LSTM(4, activation="relu", return_sequences=False),
    Dense(1, activation="sigmoid")
])

In [0]:
# Adam optimizer with binary cross-entropy, matching the single sigmoid output.
model.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])

In [0]:
# Train for 10 epochs.
# NOTE(review): the test split is also passed as validation data here.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

In [0]:
# `predict_proba` / `predict_classes` are no longer available on current Keras
# models, so use `predict` and threshold the sigmoid output explicitly. For a
# single sigmoid unit this matches what those helpers computed (score, and
# score > 0.5 as the class).
s_pred = model.predict(X_test)[:, 0]
y_pred = (s_pred > 0.5).astype(int)

In [0]:
# ROC AUC computed from the continuous scores.
print(roc_auc_score(y_test, s_pred))

In [0]:
# Hard predictions at the default 0.5 cut-off, then the per-class report.
y_pred = (s_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [0]:
# Candidate decision thresholds: 101 evenly spaced values strictly inside the
# observed score range (both ends pulled in by 1e-5).
score_low, score_high = s_pred.min() + 1e-5, s_pred.max() - 1e-5
thresholds = np.linspace(score_low, score_high, 101)

is_positive = y_test == 1
is_negative = y_test == 0

# Evaluate every metric at every threshold in a single pass.
sweep = {"precision": [], "recall": [], "f1": [], "tpr": [], "fpr": []}
for cutoff in thresholds:
    flagged = s_pred > cutoff
    sweep["precision"].append(metrics.precision_score(y_test, flagged))
    sweep["recall"].append(metrics.recall_score(y_test, flagged))
    sweep["f1"].append(metrics.f1_score(y_test, flagged))
    sweep["tpr"].append(flagged[is_positive].mean())
    sweep["fpr"].append(flagged[is_negative].mean())

precision, recall, f1, tpr, fpr = (
    np.array(sweep[name]) for name in ("precision", "recall", "f1", "tpr", "fpr")
)

# Despite its name, `best_f1` is the *threshold* at which F1 peaks.
best_f1 = thresholds[np.argmax(f1)]
print(classification_report(y_test, (s_pred > best_f1).astype(int)))

In [0]:
# 2x2 diagnostic panel built from the threshold sweep of the previous cell.
fig = plt.figure(figsize=(12, 8))

# Top-left: recall plotted against precision.
ax = fig.add_subplot(221)
ax.set_title("Recall x Precision")
ax.set_xlabel("Precision")
ax.set_ylabel("Recall")
ax.plot(precision, recall)

# Top-right: precision, recall and F1 as functions of the decision threshold.
ax = fig.add_subplot(222)
ax.set_title("")
ax.plot(thresholds, precision, label="Precision", color="green")
ax.plot(thresholds, recall, label="Recall", color="blue")
ax.plot(thresholds, f1, label="F1", color="red")
# Mark the F1-maximising threshold, as the final BERT plot in this notebook does.
ax.axvline(best_f1, color='k', ls='--')
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()

# Bottom-left: ROC curve (true-positive rate against false-positive rate).
ax = fig.add_subplot(223)
ax.set_title("ROC Curve - AUC")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.plot(fpr, tpr)

# Bottom-right: both rates as functions of the decision threshold.
ax = fig.add_subplot(224)
ax.set_title("Positive Rates")
ax.plot(thresholds, tpr, label="TPR")
ax.plot(thresholds, fpr, label="FPR")
ax.set_xlabel("Threshold")
ax.set_xlim(0, 1)
ax.legend()
# Use the threshold computed for *this* model rather than a hard-coded value
# (0.57363415) copied from an earlier run.
ax.axvline(best_f1, color='k', ls='--')


fig.tight_layout()

### Build Classifier

In [0]:
from keras.models import Model
from keras.layers import GlobalAveragePooling1D

In [0]:
# Stack a small classification head on top of BERT:
# reuse BERT's own input tensors, ...
inputs = bert.inputs
# ... run them through BERT, ...
embeddings = bert(inputs)
# ... average BERT's output over the sequence axis, ...
flat = GlobalAveragePooling1D()(embeddings)
# ... and map the pooled vector to a single sigmoid score.
output = Dense(1, activation='sigmoid')(flat)

In [0]:
# End-to-end classifier: BERT's inputs in, sigmoid score out.
model = Model(inputs=bert.inputs, outputs=output)

In [0]:
# Sanity check before training: score the first five training examples.
model.predict([X_train[0:5, :], segments_train[0:5]])

In [0]:
# Adam optimizer with binary cross-entropy, matching the single sigmoid output.
model.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])

In [0]:
# Train for 50 epochs on the (token ids, segment ids) input pair.
# NOTE(review): the test split is also passed as validation data here.
model.fit(
    [X_train, segments_train], y_train, 
    validation_data=([X_test, segments_test], y_test),
    epochs=50, 
)

In [0]:
# Scores for the test split; the result is kept two-dimensional here and is
# indexed with `[:, 0]` by the following cells.
s_pred = model.predict([X_test, segments_test])

In [0]:
# Peek at the first ten scores.
s_pred[0:10, 0]

In [0]:
# ROC AUC computed from the first (only used) score column.
print(roc_auc_score(y_test, s_pred[:, 0]))

In [0]:
# Hard predictions at the default 0.5 cut-off, then the per-class report.
# `s_pred` is not flattened here, so `y_pred` keeps its shape.
y_pred = (s_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [0]:
# Candidate decision thresholds: 101 evenly spaced values strictly inside the
# observed score range (both ends pulled in by 1e-5).
score_low, score_high = s_pred.min() + 1e-5, s_pred.max() - 1e-5
thresholds = np.linspace(score_low, score_high, 101)

is_positive = y_test == 1
is_negative = y_test == 0

# Evaluate every metric at every threshold in a single pass.
sweep = {"precision": [], "recall": [], "f1": [], "tpr": [], "fpr": []}
for cutoff in thresholds:
    flagged = s_pred > cutoff
    sweep["precision"].append(metrics.precision_score(y_test, flagged))
    sweep["recall"].append(metrics.recall_score(y_test, flagged))
    sweep["f1"].append(metrics.f1_score(y_test, flagged))
    sweep["tpr"].append(flagged[is_positive].mean())
    sweep["fpr"].append(flagged[is_negative].mean())

precision, recall, f1, tpr, fpr = (
    np.array(sweep[name]) for name in ("precision", "recall", "f1", "tpr", "fpr")
)

# Despite its name, `best_f1` is the *threshold* at which F1 peaks.
best_f1 = thresholds[np.argmax(f1)]
print(classification_report(y_test, (s_pred > best_f1).astype(int)))

In [0]:
# 2x2 diagnostic panel built from the threshold sweep of the previous cell.
fig = plt.figure(figsize=(12, 8))

# Top-left: recall plotted against precision.
ax = fig.add_subplot(2, 2, 1)
ax.set_title("Recall x Precision")
ax.plot(precision, recall)

# Top-right: precision, recall and F1 per threshold, with the F1-maximising
# threshold marked by a dashed vertical line.
ax = fig.add_subplot(2, 2, 2)
ax.set_title("")
for curve, curve_label, curve_color in (
    (precision, "Precision", "green"),
    (recall, "Recall", "blue"),
    (f1, "F1", "red"),
):
    ax.plot(thresholds, curve, label=curve_label, color=curve_color)
ax.axvline(best_f1, color='k', ls='--')
ax.set_xlim(0, 1)
ax.legend()

# Bottom-left: ROC curve.
ax = fig.add_subplot(2, 2, 3)
ax.set_title("ROC Curve - AUC")
ax.plot(fpr, tpr)

# Bottom-right: true/false positive rates per threshold, same marker line.
ax = fig.add_subplot(2, 2, 4)
ax.set_title("Positive Rates")
for curve, curve_label in ((tpr, "TPR"), (fpr, "FPR")):
    ax.plot(thresholds, curve, label=curve_label)
ax.set_xlim(0, 1)
ax.legend()
ax.axvline(best_f1, color='k', ls='--')

fig.tight_layout()