In [1]:
import catboost
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jonas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
SEED = 1
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
N_LABELS = len(LABELS)

train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')
test_labels = pd.read_csv('./dataset/test_labels.csv')

In [28]:
# Pattern form filtering english stopwords, taken from https://stackoverflow.com/questions/19560498/faster-way-to-remove-stop-words-in-python
stopword_pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')

# Preprocessing
def preprocessComments(comment):
    # Remove leading and trailing spaces
    comment = comment.strip()

    # Remove stopwords
    comment = stopword_pattern.sub('', comment)
    
    # Remove numbers
    comment = re.sub(r'[0-9]', '', comment)

    # Remove anything that is not alphanumeric characters or underscore
    comment = re.sub(r'[^\w\s]', '', comment)

    # Remove consecutive spaces      
    comment = re.sub(r' +', ' ', comment)

    # Remove Newlines
    comment = re.sub(r'\n', ' ', comment)

    return comment

train.comment_text = train.comment_text.map(preprocessComments)
test.comment_text = test.comment_text.map(preprocessComments)


In [37]:
test_filtered = pd.merge(test, test_labels)
test_filtered = test_filtered.drop(test_filtered.index[test_filtered['toxic'] == -1])
test_filtered.shape

(63978, 8)

In [4]:
test_numpy = test_labels.to_numpy()
test_numpy = test_numpy[:, 1:]
test_numpy = test_numpy.astype('float32')

In [41]:
# Set aside a validation set of 20%
train_set, validation_set = train_test_split(train, test_size=0.2, random_state=SEED)

# Pool datassets into labels
def createPool(dataset, use_label=True):
    current = {}
    for label in LABELS:
        if use_label:
            current[label] = catboost.Pool(dataset[['comment_text']], text_features=['comment_text'], label=dataset[label])
        else:
            current[label] = catboost.Pool(dataset[['comment_text']], text_features=['comment_text'])
    return current


train_pools = createPool(train_set)
val_pools = createPool(validation_set)
test_pools = createPool(test_filtered)


In [6]:
text_proc_param = {
    'tokenizers': [{'tokenizer_id': 'Sense',
                    'separator_type': 'BySense',
                    'lowercasing': 'True',
                    'token_types':['Word'],
                    'sub_tokens_policy':'SeveralTokens'}],
    'dictionaries': [{'dictionary_id': 'Word',
                      'max_dictionary_size': '4000'}],
    'feature_calcers': ['BoW:top_tokens_count=3000']
}

In [7]:
# Can't use multilabel binary classification with text features in catboost yet due to a bug that will be fixed in version 1.1
# See: https://github.com/catboost/catboost/issues/1885
# We have to train a model for every label as a workaround

models = {}
for label in LABELS:
    print(label)
    models[label] = catboost.CatBoostClassifier(
        learning_rate=0.3,
        iterations=5000,
        eval_metric='F1',
        od_wait=350,
        od_type='Iter',
        random_seed=SEED)
        
    models[label].fit(
        train_pools[label], 
        eval_set=val_pools[label], 
        verbose=100,
        early_stopping_rounds=350, 
        use_best_model=True
    )

toxic
0:	learn: 0.4233687	test: 0.4174950	best: 0.4174950 (0)	total: 225ms	remaining: 18m 45s
100:	learn: 0.7082624	test: 0.6851595	best: 0.6851595 (99)	total: 8.2s	remaining: 6m 37s
200:	learn: 0.7391432	test: 0.7074017	best: 0.7074017 (200)	total: 16.3s	remaining: 6m 28s
300:	learn: 0.7597873	test: 0.7140945	best: 0.7148318 (263)	total: 24.3s	remaining: 6m 20s
400:	learn: 0.7749760	test: 0.7162649	best: 0.7178124 (392)	total: 32.4s	remaining: 6m 11s
500:	learn: 0.7870084	test: 0.7186851	best: 0.7197127 (477)	total: 54.4s	remaining: 8m 8s
600:	learn: 0.7955158	test: 0.7228598	best: 0.7233722 (597)	total: 1m 3s	remaining: 7m 43s
700:	learn: 0.8031852	test: 0.7260094	best: 0.7261793 (684)	total: 1m 11s	remaining: 7m 19s
800:	learn: 0.8097337	test: 0.7270346	best: 0.7276468 (766)	total: 1m 20s	remaining: 6m 59s
900:	learn: 0.8166029	test: 0.7282243	best: 0.7294469 (875)	total: 1m 28s	remaining: 6m 44s
1000:	learn: 0.8226911	test: 0.7298808	best: 0.7300168 (982)	total: 1m 37s	remaining: 6

In [46]:
predictions = np.ndarray((test_filtered.shape[0], N_LABELS))

(63978,)

In [49]:
predictions = np.ndarray((test_filtered.shape[0], N_LABELS))

avg = 0
for i, label in enumerate(LABELS):
    print(label, ":")
    predictions[:, i] = models[label].predict(test_pools[label])
    score = f1_score(test_filtered[label], predictions[:, i])
    print(score)
    avg += score

avg /= N_LABELS
print("Average f1-score:", avg)
    

toxic :
0.6411952724299412
severe_toxic :
0.37283236994219654
obscene :
0.647397296935635
threat :
0.2902208201892745
insult :
0.593626129697162
identity_hate :
0.3876567020250723
Average f1-score: 0.4888214318698802


In [120]:
# Save models

for label in LABELS:
    models[label].save_model("./catboost_models/" + label + "_model")