In [None]:
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [51]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support
import statistics
import numpy as np

In [52]:
# df = pd.read_csv("train.csv")
# df_val = pd.read_csv("val.csv")
# df_test = pd.read_csv("test.csv")

# RST = 0
# RST = 1
RST = 3
df = pd.read_csv("./splits_final/train_randst{}.csv".format(RST))
df_val = pd.read_csv("./splits_final/val_randst{}.csv".format(RST))
df_test = pd.read_csv("./splits_final/test_randst{}.csv".format(RST))

In [53]:
def round_val(val):
    return round(val)
label_name = 'inappropriate'
threshold = 0
df = df[(df[label_name] >= 1-threshold)|(df[label_name] <=threshold)]
df_val = df_val[(df_val[label_name] >= 1-threshold)|(df_val[label_name] <=threshold)]
df_test = df_test[(df_test[label_name] >= 1-threshold) | (df_test[label_name] <=threshold)]

df[label_name] = df[label_name].apply(round_val)
df_val[label_name] = df_val[label_name].apply(round_val)
df_test[label_name] = df_test[label_name].apply(round_val)

In [54]:
>>> import pymorphy2
import re
>>> morph = pymorphy2.MorphAnalyzer()

from pymystem3 import Mystem
from string import punctuation
from tqdm import tqdm
mystem = Mystem() 

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\N.Babakov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
def preprocess_text(text):
    
    text = re.sub("[^а-яА-Я]"," ",text)
    text = re.sub(" +"," ",text)
    text = text.split()

    tokens = [morph.parse(w)[0].normal_form for w in text]
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

preprocess_text("мама23663 мыла /!")

'мама мыло'

In [56]:
train_val = pd.concat([df, df_val])


In [57]:
processed = []
for t in tqdm(train_val['text'].tolist()):
    pr = preprocess_text(t)
    processed.append(pr)
    
train_val['processed'] = processed

100%|███████████████████████████████████████████████████████████████████████████| 95379/95379 [10:10<00:00, 156.35it/s]


In [58]:
processed = []
for t in tqdm(df_test['text'].tolist()):
    pr = preprocess_text(t)
    processed.append(pr)
df_test['processed'] = processed

100%|███████████████████████████████████████████████████████████████████████████| 10693/10693 [01:11<00:00, 150.47it/s]


In [59]:
# train_val.to_csv("train_val_simp_processed.csv",index = None)
# df_test.to_csv("test_simp_processed.csv",index = None)

train_val.to_csv("train_val_simp_processed_rst{}.csv".format(RST),index = None)
df_test.to_csv("test_simp_processed_rst{}.csv".format(RST),index = None)

In [60]:
# train_val = pd.read_csv("train_val_simp_processed.csv")
# df_test= pd.read_csv("test_simp_processed.csv")

train_val = pd.read_csv("train_val_simp_processed_rst{}.csv".format(RST))
df_test= pd.read_csv("test_simp_processed_rst{}.csv".format(RST))

In [None]:
# MNB 

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [62]:
text_clf.fit(train_val.processed.values.astype('U'), train_val.inappropriate.tolist())

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [63]:
predicted = text_clf.predict(df_test.processed.values.astype('U'))

In [64]:
predicted

array([1, 1, 0, ..., 0, 0, 0])

In [37]:
# is_correct = [i == j for i,j in zip(df_test['inappropriate'].tolist(), predicted)]
# df_res = pd.DataFrame({'correct':is_correct})
# df_res.to_csv("tf_idf_nb.csv",index = None)

In [79]:
precision_recall_fscore_support(df_test['inappropriate'], predicted, average='weighted')

(0.8216945860378545, 0.8285794444963995, 0.82328598742569, None)

In [19]:
# (0.7976464004625521, 0.7700899195456696, 0.6982848836679432, None) -- PBLIC
# (0.7976464004625521, 0.7700899195456696, 0.6982848836679432, None) --RST0
# (0.7918775952457945, 0.7710155952833777, 0.7008159000605697, None) --RST1
(0.8002019427256051, 0.7715327784531937, 0.701345281861393, None)

In [66]:
prec = [0.7976, 0.7918,0.80]
rec = [0.770,0.771, 0.7715]
fsc = [0.698, 0.7,0.7013]
get_metrics(prec, rec, fsc)

prec 0.7964666666666668 0.0034422215049135256
recal 0.7708333333333334 0.0006236095644623044
fsc 0.6997666666666666 0.0013572848714335194


In [67]:
print(classification_report(df_test['inappropriate'], predicted))

              precision    recall  f1-score   support

           0       0.77      0.99      0.87      7928
           1       0.90      0.13      0.23      2765

    accuracy                           0.77     10693
   macro avg       0.83      0.56      0.55     10693
weighted avg       0.80      0.77      0.70     10693



In [21]:
## SGD

In [68]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])


In [69]:
text_clf.fit(train_val.processed.values.astype('U'), train_val.inappropriate.tolist())

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [70]:
predicted = text_clf.predict(df_test.processed.values.astype('U'))
# is_correct = [i == j for i,j in zip(df_test['inappropriate'].tolist(), predicted)]
# df_res = pd.DataFrame({'correct':is_correct})
# df_res.to_csv("tf_idf_sgd.csv",index = None)

In [71]:
precision_recall_fscore_support(df_test['inappropriate'], predicted, average='weighted')

(0.8085405920600582, 0.7418872159356589, 0.6324305755690145, None)

In [None]:
# (0.8086577015494752, 0.7421675343114056, 0.6325234614624268, None -- PUBLIC
# (0.8086577015494752, 0.7421675343114056, 0.6325234614624268, None) --RST0
# (0.8091492600875875, 0.8176112590338531, 0.8110678250234805, None)--RST0
0.8085405920600582, 0.7418872159356589, 0.6324305755690145

In [47]:
def get_metrics(prec, recal, fsc):
    print("prec",np.mean(prec),np.std(prec))
    print("recal",np.mean(recal),np.std(recal))
    print("fsc",np.mean(fsc),np.std(fsc))

In [72]:
pr = [0.808, 0.809,0.808 ]
rec = [0.742,0.817,0.7418]
f_sc = [0.632,  0.811, 0.632]
get_metrics(pr, rec, f_sc)

prec 0.8083333333333332 0.00047140452079103207
recal 0.7669333333333332 0.03540257366664492
fsc 0.6916666666666668 0.08438140922159469


In [73]:
print(classification_report(df_test['inappropriate'], predicted))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85      7928
           1       1.00      0.00      0.00      2765

    accuracy                           0.74     10693
   macro avg       0.87      0.50      0.43     10693
weighted avg       0.81      0.74      0.63     10693



In [80]:
# BERT_STD

In [81]:
f = [0.8005571591,
0.8075699641,
0.8178817601,
0.8278513865,
0.8296364764,
0.8302465235,
0.8400118972]
np.mean(f),np.std(f)

(0.8219650238428571, 0.012928157447163446)

In [82]:
p = [0.8347609946,
0.8381595759,
0.8419571719,
0.8444693817,
0.8453056798,
0.8475647549,
0.8508483176]
np.mean(p),np.std(p)

(0.8432951252, 0.005094773681746005)

In [83]:
r = [0.8063696426,
0.8126376419,
0.8239583333,
0.8322916667,
0.8338541667,
0.8346153846,
0.843006993]
np.mean(r),np.std(r)

(0.8266762612571429, 0.012127927102742882)

In [None]:
# LogisticRegression

In [74]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)),])

In [75]:
text_clf.fit(train_val.processed.values.astype('U'), train_val.inappropriate.tolist())

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 LogisticRegression(C=50.0, multi_class='multinomial', n_jobs=4,
                                    random_state=17))])

In [76]:
predicted = text_clf.predict(df_test.processed.values.astype('U'))

In [77]:
precision_recall_fscore_support(df_test['inappropriate'], predicted, average='weighted')

(0.8216945860378545, 0.8285794444963995, 0.82328598742569, None)

In [28]:
# (0.8175494470282394, 0.8258400378608614, 0.8184311232990775, None) -- PUBLIC
# (0.8175494470282394, 0.8258400378608614, 0.8184311232990775, None) --RST0
# (0.8091492600875875, 0.8176112590338531, 0.8110678250234805, None)--RST1
0.8216945860378545, 0.8285794444963995, 0.82328598742569

In [78]:
prec = [0.8175, 0.8091,0.82169]
rec = [0.825, 0.817,0.828]
fsc = [0.818,  0.811,0.82328]
get_metrics(prec, rec, fsc)

prec 0.8160966666666667 0.0052347577679286
recal 0.8233333333333333 0.004642796092394711
fsc 0.8174266666666666 0.005029654284562908


In [None]:
is_correct = [i == j for i,j in zip(df_test['inappropriate'].tolist(), predicted)]
df_res = pd.DataFrame({'correct':is_correct})
df_res.to_csv("tf_idf_logreg.csv",index = None)

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [None]:
gs_clf = gs_clf.fit(train_val.processed.tolist(), train_val.inappropriate.tolist())

In [None]:
gs_clf.best_score_

In [None]:
predicted = gs_clf.predict(df_test.processed.tolist())

In [None]:
precision_recall_fscore_support(df_test['inappropriate'], predicted, average='weighted')

In [None]:
print(classification_report(df_test['inappropriate'], predicted))