In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import random

import gensim
from ufal.udpipe import Model, Pipeline

Load the predefined list of toxic words together with their lemmatized forms.

In [None]:
# Read the toxic vocabulary: one entry per line, surrounding whitespace removed.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale default (the vocabulary is
# presumably Cyrillic, given the SynTagRus tokenizer model -- TODO confirm the
# file is stored as UTF-8).
with open('toxic_vocab_extended.txt', 'r', encoding='utf-8') as file:
    toxic_words = [sentence.strip() for sentence in file]

We also need a UDPipe model for tokenization and lemmatization.

In [None]:
# Load the UDPipe model and build the processing pipeline: UDPipe's own
# tokenizer on input, default tagger and parser settings, CoNLL-U on output.
modelfile = 'udpipe_syntagrus.model'
model_udpipe = Model.load(modelfile)
# Model.load reports failure by returning a null model instead of raising;
# fail fast here rather than hitting an opaque error once the pipeline is used.
if not model_udpipe:
    raise RuntimeError('Cannot load UDPipe model from {!r}'.format(modelfile))
process_pipeline = Pipeline(model_udpipe, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

In [None]:
def tokenize(text, tags=False, lemmas=False):
    """Split ``text`` into word tokens using the module-level ``process_pipeline``.

    The pipeline output is read as tab-separated rows. Lines starting with
    ``#`` and empty lines are ignored, and rows whose fourth column equals
    ``'PUNCT'`` are dropped.

    Args:
        text: Raw string handed to ``process_pipeline.process``.
        tags: When true, append ``'_'`` plus the row's fourth column (the
            tag that is compared against ``'PUNCT'``) to every token.
        lemmas: When true, return the third column (lemma) of each row;
            otherwise return the second column (surface form).

    Returns:
        A list of strings, one per non-punctuation row, in output order.
    """
    # Column that holds the text to return: lemma (2) or surface form (1).
    text_column = 2 if lemmas else 1

    rows = (
        line.split('\t')
        for line in process_pipeline.process(text).split('\n')
        if line and not line.startswith('#')
    )

    tokens = []
    for fields in rows:
        tag = fields[3]
        if tag != 'PUNCT':
            piece = fields[text_column]
            tokens.append(piece + '_' + tag if tags else piece)
    return tokens

Reading the input dataset

In [None]:
# Parse the tab-separated dev split into a DataFrame, then preview its first rows.
df = pd.read_csv(
    '../../data/input/dev.tsv',
    delimiter='\t',  # alias of ``sep``
)
df.head()

In [None]:
# Pull the ``toxic_comment`` column out of the frame as a plain Python list.
toxic_inputs = df.loc[:, 'toxic_comment'].tolist()

Inference: drop every token whose lemma appears in the toxic vocabulary.

In [None]:
# Delete-style detoxification: for every input sentence, keep the surface form
# of each token whose lemma is NOT listed in ``toxic_words``.
results = []

# Set membership is O(1); scanning the vocabulary list for every token is not.
toxic_lookup = set(toxic_words)

for sample in tqdm(toxic_inputs):
    if not isinstance(sample, str):
        # e.g. NaN coming from an empty cell: emit an empty line so the output
        # stays aligned row-for-row with the input.
        results.append('')
        continue

    try:
        # Both views must come from the same tokenizer so that position i in
        # ``tokens`` and ``tokens_lemmas`` refers to the same word.
        tokens = tokenize(sample, lemmas=False)
        tokens_lemmas = tokenize(sample, lemmas=True)
    except Exception:
        # Best-effort fallback: whitespace split, used for BOTH lists. Mixing a
        # whitespace split with tokenizer output would misalign the two lists,
        # and a second tokenizer call would just raise the same error again.
        print(sample)
        tokens = tokens_lemmas = sample.split(' ')

    cleaned_sentence = [
        token
        for token, lemma in zip(tokens, tokens_lemmas)
        if lemma not in toxic_lookup
    ]
    results.append(' '.join(cleaned_sentence))

Saving the results

In [None]:
# Write one cleaned sentence per line. UTF-8 is set explicitly so that writing
# non-ASCII text neither fails nor produces a locale-specific encoding on
# platforms whose default encoding is not UTF-8.
with open('../../data/output/delete_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in results)