In [1]:
import re
import pandas as pd
from pymystem3 import Mystem

In [2]:
# Load the raw table and pull out the two columns used below:
# the comment text and its integer toxicity flag.
data = pd.read_csv('data.csv')

texts = data['comment'].tolist()
labels = [int(flag) for flag in data['toxic'].values]

In [3]:
# Clean every comment in a single pass: lower-case it, replace each character
# that is not a lower-case Cyrillic letter or a space with a space, collapse
# runs of spaces into one, and trim both ends.
texts = [
    re.sub(r" +", " ", re.sub('[^а-яё ]', ' ', str(raw).lower())).strip()
    for raw in texts
]

In [4]:
# Lemmatize each cleaned comment with Mystem.
mstm = Mystem()

# The last piece of every lemmatize() result is discarded (presumably the
# trailing newline Mystem appends -- confirm against pymystem3) and the
# remaining pieces are concatenated back into one string.
normalized = [''.join(pieces[:-1]) for pieces in map(mstm.lemmatize, texts)]

In [5]:
# remove stopwords
# Read the stop-word list, one word per line.
# The encoding is pinned because open() otherwise falls back to the platform
# default, which would garble non-ASCII (Cyrillic) entries on systems whose
# locale codec is not UTF-8.  Assumes the file is saved as UTF-8 -- confirm.
with open('./stopwords.txt', encoding='utf-8') as f:
    stopwords = [line.rstrip('\n') for line in f]

def drop_stop(text, stop=None):
    """Return `text` with every stop-word token removed.

    The text is split on single spaces, tokens found in the stop-word
    collection are discarded, and the survivors are re-joined with single
    spaces.

    Args:
        text: Space-separated string of tokens.
        stop: Optional iterable of stop words.  When omitted, the
            notebook-level `stopwords` collection is used.

    Returns:
        The filtered text as a single space-separated string.
    """
    if stop is None:
        stop = stopwords
    # Hash-based membership: one pass to build the set instead of a linear
    # scan of the whole stop-word list for every token.
    if not isinstance(stop, (set, frozenset)):
        stop = set(stop)
    return ' '.join(token for token in text.split(' ') if token not in stop)

# Strip stop words from every lemmatized comment.
normalized = list(map(drop_stop, normalized))

In [6]:
# Assemble one frame holding the cleaned text, its lemmatized
# stop-word-free form, and the integer label.
df = pd.DataFrame({'text': texts, 'norm': normalized, 'label': labels})

In [7]:
# Three-way split: 30% of the rows are held out first, then that hold-out is
# cut in half to give the validation and test parts.
from sklearn.model_selection import train_test_split

train, holdout = train_test_split(df, test_size=0.3, random_state=42)
valid, test = train_test_split(holdout, test_size=0.5, random_state=42)

In [8]:
# TF-IDF features: the vectorizer is fitted on the training split only and
# then applied unchanged to the validation and test splits.
from sklearn.feature_extraction.text import TfidfVectorizer

model_tfidf = TfidfVectorizer(max_features=5000)

train_tfidf = model_tfidf.fit_transform(train['norm'].values)
valid_tfidf, test_tfidf = (
    model_tfidf.transform(split['norm'].values) for split in (valid, test)
)

In [9]:
# Random-forest classifier trained on the TF-IDF features of the train split.
from sklearn.ensemble import RandomForestClassifier

cls = RandomForestClassifier(random_state=42)
# fit() is left as the last expression so the cell displays the estimator.
cls.fit(X=train_tfidf, y=train['label'].values)

RandomForestClassifier(random_state=42)

In [10]:
# Predict labels for the test split.
predictions = cls.predict(X=test_tfidf)

In [11]:
# F1 score of the classifier on the test split.
from sklearn.metrics import f1_score

# f1_score expects ground truth first and predictions second; keyword
# arguments make the roles explicit so they cannot be swapped silently.
f1_score(y_true=test['label'].values, y_pred=predictions)

0.7585714285714286