In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
sample_submission = pd.read_csv("./data/sample_submission.csv")
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

## Подробнее о метках (labels)
Все они принимают значение -1, 0, 1
* Insult - Оскорбление
* Obscene - Непристойный
* Severe toxicity - Сильная токсичность
* Toxicity - Токсичность
* Threat - Угроза
* Identity hate - Ненависть к идентичности

In [3]:
sample_submission.head(1)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0


In [5]:
test.head(1)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...


In [6]:
# То, что в этом выводе строки не пустые нам говорит о том, что toxic необязательно содержит в себе остальные метки.
# train.loc[(train['toxic'] > 0) & ((train['severe_toxic'] == 0) & (train['obscene'] == 0) & (train['threat'] == 0) & (train['insult'] == 0) & (train['identity_hate'] == 0))]

In [7]:
# А это говорит о том, что оскорбления, брань и прочее могут присутствовать даже в нетоксичных комментариях
# train.loc[(train['toxic'] == 0) & ((train['severe_toxic'] != 0) | (train['obscene'] != 0) | (train['threat'] != 0) | (train['insult'] != 0) | (train['identity_hate'] != 0))]

In [8]:
# просто статистика по данным, можем оценить, какие значения у нас бывают
# cols = "toxic severe_toxic obscene threat insult identity_hate"
# for i in cols.split():
#     print(train[i].value_counts())

In [9]:
# Тут мы создаём серию отфильтрованных комментариев, чтобы попробовать работать с ними, а не с сырым материалом

import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(sen):
    # Удаление символов пунктуации и специальных символов
    sentence = re.sub('[^a-zA-Z0-9]', ' ', sen)
    # Заменяем любые пробелы на один обычный
    sentence = re.sub(r'\s+', ' ', sentence)
    # Делаем всё в нижнем регистре
    sentence = sentence.lower()
    # Удаляем стоп-слова
    sentence = " ".join([word for word in sentence.split() if word not in stop_words])
    return sentence

filtered_train = train['comment_text'].apply(preprocess_text)
filtered_test = test['comment_text'].apply(preprocess_text)

In [10]:
# Тут я забавы ради нарисовал облачко, но оно ничего особо не дало, потому идём дальше
# from collections import Counter
# from wordcloud import WordCloud, ImageColorGenerator
# import matplotlib.pyplot as plt
# all_words = " ".join(filtered_train).split()
# word_counts = Counter(all_words)
# top_words = dict(word_counts.most_common(100))
# wordcloud = WordCloud(background_color="white")
# wordcloud.generate_from_frequencies(top_words)
# plt.figure(figsize=(10,10))
# plt.axis('off')
# plt.imshow(wordcloud)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
def make_tf_idf(docs: pd.Series, stop_words=None):
    return TfidfVectorizer(stop_words=stop_words).fit_transform(docs)

In [12]:
tfidf_train = make_tf_idf(filtered_train)
tfidf_test = make_tf_idf(filtered_test)
# разделение данных на обучающую и тестовую выборки
targets_train = train.iloc[:, 2:]
X_train, X_test, y_train, y_test = train_test_split(tfidf_train, targets_train, test_size=0.2)


In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def rate(y_pred, y_test):
    print('* Accuracy:', accuracy_score(y_test, y_pred))
    print('* Precision:', precision_score(y_test, y_pred, average='weighted'))
    print('* Recall:', recall_score(y_test, y_pred, average='weighted'))
    print('* F1-score:', f1_score(y_test, y_pred, average='weighted'))

In [33]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# Создаем LinearSVC и оборачиваем его в OneVsRestClassifier
# создание и обучение модели
# rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=6, max_depth=100) # не подходит, тк ждет опр кол-во признаков, а слов может быть произвольное кол-во
logreg_ovr = OneVsRestClassifier(LogisticRegression(multi_class='ovr'))
catboost = CatBoostClassifier(loss_function='MultiCrossEntropy')
svm_ovr = OneVsRestClassifier(LinearSVC(multi_class='ovr'))

In [40]:
model = svm_ovr
model.fit(X_train, y_train)

OneVsRestClassifier(estimator=LinearSVC())

In [41]:
y_pred = model.predict(X_test)
rate(y_pred, y_test)

* Accuracy: 0.9190976030079899
* Precision: 0.8288784258413564
* Recall: 0.6146920108617979
* F1-score: 0.7014125742142919


Теперь надо сделать submission.

In [17]:
# посмотрим, как он выглядит
sample_submission.head(1)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5


In [39]:
# Переведем в tf-idf матрицу тестовые комментарии, чтобы модель понимала данные
tfidf_test = make_tf_idf(test.comment_text)

In [42]:
model = svm_ovr
model.predict(tfidf_test)

ValueError: X has 246855 features, but LinearSVC is expecting 182086 features as input.

In [74]:
np.hstack((catboost.predict(test), np.array(test.id).reshape(-1, 1)))

AttributeError: module 'catboost' has no attribute 'predict'

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer



Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."
