# Классификация комментариев

## Подготовка

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import nltk
from nltk.stem import WordNetLemmatizer
from pymystem3 import Mystem
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

In [2]:
try: df = pd.read_csv('/datasets/toxic_comments.csv')
except: df = pd.read_csv('C:/Documents/datasets/toxic_comments.csv')

In [3]:
display(df)
df.info()
df.describe()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...
159287,159446,""":::::And for the second time of asking, when ...",0
159288,159447,You should be ashamed of yourself \n\nThat is ...,0
159289,159448,"Spitzer \n\nUmm, theres no actual article for ...",0
159290,159449,And it looks like it was actually you who put ...,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB


Unnamed: 0.1,Unnamed: 0,toxic
count,159292.0,159292.0
mean,79725.697242,0.101612
std,46028.837471,0.302139
min,0.0,0.0
25%,39872.75,0.0
50%,79721.5,0.0
75%,119573.25,0.0
max,159450.0,1.0


In [4]:
df['toxic'].value_counts()

0    143106
1     16186
Name: toxic, dtype: int64

In [5]:
df = df.drop('Unnamed: 0', axis=1)

In [6]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    text = text.lower()
    cleared_text = re.sub(r'[^a-zA-Z]', ' ', text) 
    clear_text = " ".join(cleared_text.split())
    return "".join(lemmatizer.lemmatize(clear_text))

df['text'] = df['text'].apply(lemmatize_text)

In [7]:
target = df['toxic']
features = df.drop('toxic', axis=1)

features_train, features_test, target_train, target_test = train_test_split(features, target, 
                                                                           test_size=0.25, random_state=12345,
                                                                           stratify = target)

In [8]:
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

count_tf_idf = TfidfVectorizer(stop_words=stopwords)

features_train = count_tf_idf.fit_transform(features_train['text'])
features_test = count_tf_idf.transform(features_test['text'])
print(features_train.shape)
print(features_test.shape)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(119469, 142639)
(39823, 142639)


## Обучение

In [9]:
regression = LogisticRegression(class_weight='balanced', random_state=12345)

regression_parametrs = {'C': [0.1, 1, 10],
                        'solver': ['newton-cg', 'lbfgs', 'liblinear']
                       }

regression_grid = GridSearchCV(regression, regression_parametrs, scoring='f1', cv=3)
regression_grid.fit(features_train, target_train)


print('Лучшие параметры:', regression_grid.best_params_)
print('Лучшее значение f1 при лучших параметрах:', regression_grid.best_score_)

regression.fit(features_train, target_train)
regression_cv_score = cross_val_score(regression, features_train, target_train, scoring='f1',cv=3).mean()
print('Среднее качество модели Логистической регрессии на кросс-валидации:', regression_cv_score)

Лучшие параметры: {'C': 10, 'solver': 'newton-cg'}
Лучшее значение f1 при лучших параметрах: 0.7661716245114881
Среднее качество модели Логистической регрессии на кросс-валидации: 0.7485568753878408


In [10]:
forest = RandomForestClassifier(class_weight='balanced', n_jobs=-1)

forest_parametrs = { 'n_estimators': range(50, 80, 5),
                     'max_depth': range(4, 8, 2)}

forest_grid = GridSearchCV(forest, forest_parametrs, scoring='f1', cv=3)
forest_grid.fit(features_train, target_train)

print('Лучшие параметры:', forest_grid.best_params_)
print('Лучшее значение f1 при лучших параметрах:', forest_grid.best_score_)

Лучшие параметры: {'max_depth': 6, 'n_estimators': 55}
Лучшее значение f1 при лучших параметрах: 0.35701332050042406


In [11]:
tree = DecisionTreeClassifier(class_weight='balanced', random_state=12345)

tree_parametrs = {'max_depth':[x for x in range(50, 80, 5)]}


tree_grid = GridSearchCV(tree, tree_parametrs, scoring='f1', cv=3)
tree_grid.fit(features_train, target_train)

print('Лучшие параметры:', tree_grid.best_params_)
print('Лучшее значение f1 при лучших параметрах:', tree_grid.best_score_)

Лучшие параметры: {'max_depth': 75}
Лучшее значение f1 при лучших параметрах: 0.6270453799865565


In [12]:
regression_best = LogisticRegression(class_weight='balanced', random_state=12345, C=10, solver= 'newton-cg')
regression_best.fit(features_train, target_train)
print('Значение f1 на тестоых данных при лучших параметрах лучшей модели:', f1_score(target_test, 
                                                                             regression_best.predict(features_test)))

Значение f1 на тестоых данных при лучших параметрах лучшей модели: 0.77043255217442
