In [50]:
import catboost
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jonas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
SEED = 1
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
N_LABELS = len(LABELS)

train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')
test_labels = pd.read_csv('./dataset/test_labels.csv')

In [52]:
# Pattern form filtering english stopwords, taken from https://stackoverflow.com/questions/19560498/faster-way-to-remove-stop-words-in-python
stopword_pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')

# Preprocessing
def preprocessComments(comment):
    # Remove leading and trailing spaces
    comment = comment.strip()

    # Remove stopwords
    comment = stopword_pattern.sub('', comment)
    
    # Remove numbers
    comment = re.sub(r'[0-9]', '', comment)

    # Remove anything that is not alphanumeric characters or underscore
    comment = re.sub(r'[^\w\s]', '', comment)

    # Remove consecutive spaces      
    comment = re.sub(r' +', ' ', comment)

    # Remove Newlines
    comment = re.sub(r'\n', ' ', comment)

    return comment

train.comment_text = train.comment_text.map(preprocessComments)
test.comment_text = test.comment_text.map(preprocessComments)


In [53]:
test_filtered = pd.merge(test, test_labels)
test_filtered = test_filtered.drop(test_filtered.index[test_filtered['toxic'] == -1])
test_filtered.shape

(63978, 8)

In [54]:
test_numpy = test_labels.to_numpy()
test_numpy = test_numpy[:, 1:]
test_numpy = test_numpy.astype('float32')

In [55]:
# Set aside a validation set of 20%
train_set, validation_set = train_test_split(train, test_size=0.2, random_state=SEED)

# Pool datassets into labels
def createPool(dataset, use_label=True):
    current = {}
    for label in LABELS:
        if use_label:
            current[label] = catboost.Pool(dataset[['comment_text']], text_features=['comment_text'], label=dataset[label])
        else:
            current[label] = catboost.Pool(dataset[['comment_text']], text_features=['comment_text'])
    return current


train_pools = createPool(train_set)
val_pools = createPool(validation_set)
test_pools = createPool(test_filtered)


In [56]:
# Can't use multilabel binary classification with text features in catboost yet due to a bug that will be fixed in version 1.1
# See: https://github.com/catboost/catboost/issues/1885
# We have to train a model for every label as a workaround

models = {}
for label in LABELS:
    print(label)
    models[label] = catboost.CatBoostClassifier(
        learning_rate=0.3,
        iterations=5000,
        eval_metric='F1',
        od_wait=350,
        od_type='Iter',
        random_seed=SEED)
        
    models[label].fit(
        train_pools[label], 
        eval_set=val_pools[label], 
        verbose=100,
        early_stopping_rounds=350, 
        use_best_model=True
    )

toxic
0:	learn: 0.6741429	test: 0.7066849	best: 0.7066849 (0)	total: 89.6ms	remaining: 7m 27s
100:	learn: 0.7377476	test: 0.7337473	best: 0.7347311 (98)	total: 10s	remaining: 8m 6s
200:	learn: 0.7590896	test: 0.7354064	best: 0.7378182 (143)	total: 20.1s	remaining: 8m 1s
300:	learn: 0.7716064	test: 0.7350055	best: 0.7378182 (143)	total: 31.6s	remaining: 8m 12s
400:	learn: 0.7834076	test: 0.7352194	best: 0.7378182 (143)	total: 53.2s	remaining: 10m 10s
Stopped by overfitting detector  (350 iterations wait)

bestTest = 0.7378181818
bestIteration = 143

Shrink model to first 144 iterations.
severe_toxic
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 86.1ms	remaining: 7m 10s
100:	learn: 0.4975342	test: 0.3348416	best: 0.3393665 (83)	total: 10.2s	remaining: 8m 12s
200:	learn: 0.5947068	test: 0.3333333	best: 0.3437500 (117)	total: 20.5s	remaining: 8m 9s
300:	learn: 0.6569563	test: 0.3454158	best: 0.3481953 (201)	total: 30.9s	remaining: 8m 1s
400:	learn: 0.7049101	test: 0.321353

In [57]:
predictions = np.ndarray((test_filtered.shape[0], N_LABELS))

In [58]:
predictions = np.ndarray((test_filtered.shape[0], N_LABELS))

avg = 0
for i, label in enumerate(LABELS):
    print(label, ":")
    predictions[:, i] = models[label].predict(test_pools[label])
    score = f1_score(test_filtered[label], predictions[:, i])
    print(score)
    avg += score

avg /= N_LABELS
print("Average f1-score:", avg)
    

toxic :
0.6399437412095641
severe_toxic :
0.36722306525037934
obscene :
0.63862167982771
threat :
0.37383177570093457
insult :
0.5780459957592563
identity_hate :
0.35173824130879344
Average f1-score: 0.49156741650943964


In [59]:
# Save models

for label in LABELS:
    models[label].save_model("./catboost_models/" + label + "_model")