In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
# import eli5

## Data

In [2]:
df = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

In [3]:
def round_val(val):
    return round(val)
label_name = 'inappropriate'
threshold = 0
df = df[(df[label_name] >= 1-threshold)|(df[label_name] <=threshold)]
df_val = df_val[(df_val[label_name] >= 1-threshold)|(df_val[label_name] <=threshold)]
df_test = df_test[(df_test[label_name] >= 1-threshold) | (df_test[label_name] <=threshold)]

df[label_name] = df[label_name].apply(round_val)
df_val[label_name] = df_val[label_name].apply(round_val)
df_test[label_name] = df_test[label_name].apply(round_val)

## Tf-idf logreg

In [4]:
# https://www.kaggle.com/kashnitsky/logistic-regression-tf-idf-baseline

In [5]:
train_val = pd.concat([df, df_val])


In [12]:
train_val['text']

0        я думал что левиафаны - это те медленные страх...
2        Напоминаю, что пора искать актис невзрослого п...
3        курю лет пятнадцать никаких проблем кроме како...
4        окей, я тогда проведу парад гетеросексуалов, п...
5        Напоминаю ватникам, что в США общий налог всег...
                               ...                        
16327    Законно ли в РФ обменивать валюту на соответст...
16328    Алкоголь это петля которая затягивается с кажд...
16330    Ну вот эту "икону" я бы посмотрел.Фотография Р...
16331    И как избавился? Я не могу сам себя заставить ...
16332               Нет) Оба вторые - рестайл и дорестайл)
Name: text, Length: 95507, dtype: object

### No stopwords removal, no preprocessing

In [61]:
def check_tfidf(sw = None, ngram_range=(1, 3), max_features=150000, text_label = 'text'):
    
    print("ngram_range {}, max_features {}".format(ngram_range, max_features))
    
    text_transformer = TfidfVectorizer(stop_words=sw, ngram_range=ngram_range, lowercase=True, max_features=max_features)
    X_train_text = text_transformer.fit_transform(train_val[text_label].tolist())
    
#     print(X_train_text)
#     raise Exception("STOPE")
    
    X_test_text = text_transformer.transform(df_test[text_label])

    logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

    cv_results = cross_val_score(logit, X_train_text, train_val['inappropriate'], cv=skf, scoring='f1_micro')

    print(cv_results, cv_results.mean())
    
    logit.fit(X_train_text, train_val['inappropriate'])
    test_preds = logit.predict(X_test_text)
    print(classification_report(df_test['inappropriate'], test_preds))


    
# check_tfidf()    

ngram_range (1, 2), max_features 100000
[0.79504764 0.79776987 0.79592691 0.80027224 0.79639809] 0.7970829504475575
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7839
           1       0.63      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================
ngram_range (1, 2), max_features 150000
[0.79939273 0.801225   0.80247107 0.80194754 0.79943458] 0.8008941875531506
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.64      0.51      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================
ngram_range (1, 2), max_features 300000
[0.80164381 0.80201026 0.80608345 0.80346579 0.80252343] 0.8031453467978886
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.65      0.52      0.58      2726

    accuracy                           0.80     10565
   macro avg       0.75      0.71      0.72     10565
weighted avg       0.79      0.80      0.80     10565

ngram_range (1, 3), max_features 100000
[0.79269186 0.79174956 0.79576985 0.79461808 0.79566515] 0.794098901194495
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      7839
           1       0.62      0.52      0.56      2726

    accuracy                           0.79     10565
   macro avg       0.73      0.70      0.71     10565
weighted avg       0.78      0.79      0.79     10565

====================================================================================================
ngram_range (1, 3), max_features 150000
[0.79515234 0.7960423  0.79959164 0.79932988 0.79582221] 0.7971876739999025
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7839
           1       0.63      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.73      0.70      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================
ngram_range (1, 3), max_features 300000
[0.79975919 0.80106795 0.80676404 0.80383226 0.79980106] 0.8022448999828435
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.64      0.51      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================
ngram_range (1, 4), max_features 100000
[0.79284892 0.79677521 0.79540338 0.79692163 0.79241925] 0.7948736754711991
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.63      0.50      0.56      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.70      0.71     10565
weighted avg       0.78      0.80      0.79     10565

====================================================================================================
ngram_range (1, 4), max_features 150000
[0.79761282 0.79824102 0.79932988 0.800534   0.80016753] 0.7991770498524968
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7839
           1       0.62      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.73      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================
ngram_range (1, 4), max_features 300000
[0.7965658  0.80023034 0.80226166 0.80372755 0.79959164] 0.800475401328234
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.64      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

====================================================================================================

In [27]:
for ngram in [(1,3),(1,4)]:
    for feat in [100000, 150000, 300000]:
        check_tfidf(ngram, feat)    
        print("="*100)

ngram_range (1, 3), max_features 100000
[0.79269186 0.79174956 0.79576985 0.79461808 0.79566515] 0.794098901194495
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      7839
           1       0.62      0.52      0.56      2726

    accuracy                           0.79     10565
   macro avg       0.73      0.70      0.71     10565
weighted avg       0.78      0.79      0.79     10565

ngram_range (1, 3), max_features 150000
[0.79515234 0.7960423  0.79959164 0.79932988 0.79582221] 0.7971876739999025
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7839
           1       0.63      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.73      0.70      0.72     10565
weighted avg       0.79      0.80      0.79     10565

ngram_range (1, 3), max_features 300000
[0.79975919 0.80106795 0.80676404 0.80383226 0.79980106] 0.8022448999828435

### With stopwords removal still no preprocessing

In [29]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.6.2




In [9]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\N.Babakov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
for ngram in [(1,3),(1,4)]:
    for feat in [100000, 300000]:
        check_tfidf(russian_stopwords, ngram, feat)    
        print("="*100)

ngram_range (1, 3), max_features 100000
[0.79489059 0.79719401 0.79817811 0.79959164 0.79760222] 0.7974913137050577
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7839
           1       0.64      0.52      0.57      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

ngram_range (1, 3), max_features 300000
[0.79939273 0.8014344  0.80419873 0.80430344 0.80487933] 0.8028417273740798
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      7839
           1       0.63      0.54      0.58      2726

    accuracy                           0.80     10565
   macro avg       0.74      0.71      0.72     10565
weighted avg       0.79      0.80      0.79     10565

ngram_range (1, 4), max_features 100000
[0.79557114 0.79598995 0.80058636 0.79901576 0.8013193 ] 0.798496502161215

## Preprocess

In [None]:
# !pip install pymystem3

In [6]:
>>> import pymorphy2
import re
>>> morph = pymorphy2.MorphAnalyzer()

In [52]:
t = re.sub("[^а-яА-Я]"," ","мама23663 мыла /!")
t = re.sub(" +"," ",t)
t = t.split()
t

['мама', 'мыла']

In [45]:
morph.parse("бреет")

[Parse(word='бреет', tag=OpencorporaTag('VERB,impf,tran sing,3per,pres,indc'), normal_form='брить', score=1.0, methods_stack=((DictionaryAnalyzer(), 'бреет', 569, 5),))]

In [7]:
from pymystem3 import Mystem
from string import punctuation
from tqdm import tqdm
mystem = Mystem() 

In [10]:
def preprocess_text(text):
    
    text = re.sub("[^а-яА-Я]"," ",text)
    text = re.sub(" +"," ",text)
    text = text.split()

    tokens = [morph.parse(w)[0].normal_form for w in text]
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

preprocess_text("мама23663 мыла /!")

'мама мыло'

In [11]:
processed = []
for t in tqdm(train_val['text'].tolist()):
    pr = preprocess_text(t)
    processed.append(pr)
    
train_val['processed'] = processed

100%|███████████████████████████████████████████████████████████████████████████| 95507/95507 [05:20<00:00, 297.59it/s]


In [12]:
processed = []
for t in tqdm(df_test['text'].tolist()):
    pr = preprocess_text(t)
    processed.append(pr)
df_test['processed'] = processed

100%|███████████████████████████████████████████████████████████████████████████| 10565/10565 [00:35<00:00, 297.96it/s]


In [62]:
for ngram in [(1,3),(1,4)]:
    for feat in [100000, 300000]:
        check_tfidf(russian_stopwords, ngram, feat, 'processed')    
        print("="*100)

ngram_range (1, 3), max_features 100000
[0.81310858 0.81384148 0.81681587 0.81749647 0.82262709] 0.8167778969681422
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7839
           1       0.67      0.58      0.62      2726

    accuracy                           0.82     10565
   macro avg       0.77      0.74      0.75     10565
weighted avg       0.81      0.82      0.81     10565

ngram_range (1, 3), max_features 300000
[0.81902419 0.82106586 0.81885765 0.82142296 0.82241767] 0.8205576661950378
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      7839
           1       0.68      0.61      0.64      2726

    accuracy                           0.83     10565
   macro avg       0.78      0.75      0.76     10565
weighted avg       0.82      0.83      0.82     10565

ngram_range (1, 4), max_features 100000
[0.81143336 0.81316093 0.81639705 0.81770588 0.81582116] 0.814903674213043

ngram_range (1, 3), max_features 100000
[0.81310858 0.81384148 0.81681587 0.81749647 0.82262709] 0.8167778969681422
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7839
           1       0.67      0.58      0.62      2726

    accuracy                           0.82     10565
   macro avg       0.77      0.74      0.75     10565
weighted avg       0.81      0.82      0.81     10565

====================================================================================================
ngram_range (1, 3), max_features 300000
[0.81902419 0.82106586 0.81885765 0.82142296 0.82241767] 0.8205576661950378
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      7839
           1       0.68      0.61      0.64      2726

    accuracy                           0.83     10565
   macro avg       0.78      0.75      0.76     10565
weighted avg       0.82      0.83      0.82     10565

====================================================================================================
ngram_range (1, 4), max_features 100000
[0.81143336 0.81316093 0.81639705 0.81770588 0.81582116] 0.8149036742130431
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7839
           1       0.68      0.58      0.63      2726

    accuracy                           0.82     10565
   macro avg       0.77      0.74      0.75     10565
weighted avg       0.81      0.82      0.82     10565

====================================================================================================
ngram_range (1, 4), max_features 300000
[0.81525495 0.8157261  0.82356945 0.81911942 0.82241767] 0.8192175176058939
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      7839
           1       0.68      0.60      0.64      2726

    accuracy                           0.82     10565
   macro avg       0.77      0.75      0.76     10565
weighted avg       0.82      0.82      0.82     10565

In [14]:
from sklearn.svm import SVC

In [None]:
def check_SVM(ngram_range=(1, 3), max_features=150000, kernel = 'linear'):
    
    print("ngram_range {}, max_features {}".format(ngram_range, max_features))
    
    text_transformer = TfidfVectorizer(stop_words=russian_stopwords, ngram_range=ngram_range, lowercase=True, max_features=max_features)
    X_train_text = text_transformer.fit_transform(train_val['processed'].tolist())
    
    X_test_text = text_transformer.transform(df_test['processed'])

#     logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)
    logit = SVC(kernel = kernel)
    print("fitting ...")
    logit.fit(X_train_text, train_val['inappropriate'])
    test_preds = logit.predict(X_test_text)
    print(classification_report(df_test['inappropriate'], test_preds))
    
    check_SVM
    
check_SVM()

ngram_range (1, 3), max_features 150000
fitting ...
