# Stratify

In [1]:
import pandas as pd
import os

In [2]:
# Raw annotated dataset; the cells below read its 'inappropriate' column.
RAW_DATA_PATH = "Inappropriate_09_top_vs_one_with_multi.csv"
df = pd.read_csv(RAW_DATA_PATH)


In [3]:
# Number of rows in the raw dataset.
df.shape[0]

124597

In [3]:
# Partition the rows by their 'inappropriate' score so each class can be
# shuffled and split separately. Rows whose score is NaN satisfy neither
# comparison and therefore end up in neither frame.
# NOTE(review): the training functions further down binarise the same column
# with round(), i.e. around 0.5 rather than 0.75 - confirm this is intended.
THRESHOLD = 0.75
inappropriate_score = df['inappropriate']
df_unsafe = df[inappropriate_score > THRESHOLD]
df_safe = df[inappropriate_score <= THRESHOLD]

In [4]:
SPLITS_DIR = "./splits"
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(SPLITS_DIR, exist_ok=True)


def _shuffle_and_split(frame, seed, train_end=0.8, val_end=0.9):
    """Shuffle ``frame`` with ``seed`` and slice it into (train, val, test).

    ``train_end`` and ``val_end`` are cumulative fractions of the row count:
    rows before ``train_end`` form the training part, rows from there up to
    ``val_end`` the validation part, and the remainder the test part.
    """
    shuffled = frame.sample(frac=1, random_state=seed).reset_index(drop=True)
    cut_train = int(len(shuffled) * train_end)
    cut_val = int(len(shuffled) * val_end)
    return shuffled[:cut_train], shuffled[cut_train:cut_val], shuffled[cut_val:]


# One 80/10/10 split per random seed. Each class is shuffled and cut on its
# own, then the matching parts are concatenated, so every part keeps roughly
# the same unsafe/safe proportion as the full dataset.
for rs in [1,2,3,4]:
    df_unsafe_shuf_train, df_unsafe_shuf_val, df_unsafe_shuf_test = _shuffle_and_split(df_unsafe, rs)
    df_safe_shuf_train, df_safe_shuf_val, df_safe_shuf_test = _shuffle_and_split(df_safe, rs)

    # Unsafe rows come first in each file; the rows are not re-shuffled after
    # concatenation.
    df_tr = pd.concat([df_unsafe_shuf_train, df_safe_shuf_train])
    df_val = pd.concat([df_unsafe_shuf_val, df_safe_shuf_val])
    df_test = pd.concat([df_unsafe_shuf_test, df_safe_shuf_test])

    train_path = os.path.join(SPLITS_DIR, "train_randst{}.csv".format(rs))
    val_path = os.path.join(SPLITS_DIR, "val_randst{}.csv".format(rs))
    test_path = os.path.join(SPLITS_DIR, "test_randst{}.csv".format(rs))

    df_tr.to_csv(train_path, index = None)
    df_val.to_csv(val_path, index = None)
    df_test.to_csv(test_path, index = None)


# Train and evaluate TF-IDF baseline classifiers

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
# import eli5
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [5]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus and keep the Russian list; it is the
# default `sw` argument of the check_* functions below.
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\N.Babakov\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
# Load the three split files written for random seed 1.
df_tr_test, df_val_test, df_test_test = (
    pd.read_csv(split_csv)
    for split_csv in (
        "./splits/train_randst1.csv",
        "./splits/val_randst1.csv",
        "./splits/test_randst1.csv",
    )
)

In [8]:
# Stack the seed-1 training and validation rows into one fitting set
# (row labels of the two inputs are kept as they are).
df_tr_val_test = pd.concat(objs=[df_tr_test, df_val_test])

In [32]:
def check_tfidf(train_val,test, sw = russian_stopwords, ngram_range=(1, 3), max_features=150000, text_label = 'text', debug = False):
    """Fit TF-IDF + logistic regression on ``train_val`` and score it on ``test``.

    Parameters
    ----------
    train_val : pandas.DataFrame
        Rows used for fitting; must contain the ``text_label`` column and an
        ``'inappropriate'`` column.
    test : pandas.DataFrame
        Rows used for evaluation, with the same two columns.
    sw : list of str
        Stop words passed to ``TfidfVectorizer``.
    ngram_range : tuple of int
        N-gram range passed to ``TfidfVectorizer``.
    max_features : int
        Vocabulary size cap passed to ``TfidfVectorizer``.
    text_label : str
        Name of the column holding the raw text.
    debug : bool
        When truthy, print the settings, the first ten predicted
        probabilities and labels, the classification report and the
        per-class precision/recall/F1/support.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, rauc)``: weighted-average precision,
        recall and F1 of the predicted labels, and the ROC AUC computed from
        the second column of ``predict_proba``.

    Notes
    -----
    Labels are obtained by applying Python's ``round`` to the
    ``'inappropriate'`` column (presumably a score in [0, 1] - TODO confirm).
    NOTE(review): the stratification cell splits that column at 0.75, whereas
    rounding binarises it around 0.5 - confirm this is intended.
    """
    # Keep the binarised labels in local variables so the callers' DataFrames
    # are not modified by evaluating a model on them.
    y_train = train_val['inappropriate'].apply(round)
    y_test = test['inappropriate'].apply(round).tolist()

    # The vocabulary is learned from the fitting rows only and reused for test.
    text_transformer = TfidfVectorizer(stop_words=sw, ngram_range=ngram_range, lowercase=True, max_features=max_features)
    X_train_text = text_transformer.fit_transform(train_val[text_label].tolist())
    X_test_text = text_transformer.transform(test[text_label].tolist())

    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument - check before upgrading the environment.
    logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)
    logit.fit(X_train_text, y_train)

    test_preds = logit.predict(X_test_text)
    test_predsprob = logit.predict_proba(X_test_text)

    # Column 1 of predict_proba is used as the score of the positive class.
    rauc = roc_auc_score(y_test, test_predsprob[:, 1])

    if debug:
        print("ngram_range {}, max_features {}".format(ngram_range, max_features))

        print(test_predsprob[:10])

        print(test_preds[:10])

        print(classification_report(y_test, test_preds))

        print(precision_recall_fscore_support(y_test, test_preds))

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, test_preds, average = 'weighted')

    return precision, recall, fscore, rauc

check_tfidf(df_tr_val_test, df_test_test, russian_stopwords, debug = True)

ngram_range (1, 3), max_features 150000
[[4.29258420e-01 5.70741580e-01]
 [8.30121961e-03 9.91698780e-01]
 [6.96101331e-04 9.99303899e-01]
 [9.60864987e-01 3.91350125e-02]
 [3.78419188e-01 6.21580812e-01]
 [6.92670770e-04 9.99307329e-01]
 [9.55115327e-01 4.48846726e-02]
 [8.34258001e-01 1.65741999e-01]
 [3.91414443e-01 6.08585557e-01]
 [2.23994005e-01 7.76005995e-01]]
[1 1 1 0 1 1 0 0 1 1]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      9145
           1       0.62      0.54      0.58      3315

    accuracy                           0.79     12460
   macro avg       0.73      0.71      0.72     12460
weighted avg       0.78      0.79      0.79     12460

(array([0.8403125 , 0.62307692]), array([0.88212138, 0.53755656]), array([0.86070952, 0.57716599]), array([9145, 3315], dtype=int64))


(0.7825166783707866, 0.7904494382022472, 0.785272379356356, 0.7977413829644235)

In [6]:
def collect_metric(solve_func, random_states=(1, 2, 3, 4), splits_dir="./splits", include_val=False):
    """Run ``solve_func`` on the saved split of every seed and gather its metrics.

    Parameters
    ----------
    solve_func : callable
        Called as ``solve_func(train_frame, test_frame)``; must return four
        values ``(precision, recall, fscore, rocauc)``.
    random_states : iterable of int
        Seeds whose ``train_randst{seed}.csv`` / ``test_randst{seed}.csv``
        files are read from ``splits_dir``.
    splits_dir : str
        Directory that holds the split files.
    include_val : bool
        When true, ``val_randst{seed}.csv`` is read as well and appended to
        the training rows before ``solve_func`` is called. The default keeps
        the original behaviour of fitting on the training split alone.
        NOTE(review): the single-seed debug run of ``check_tfidf`` in this
        notebook fits on train+val, so its numbers are not directly comparable
        with the default here - decide which protocol is intended.

    Returns
    -------
    list of list
        One ``[precision, recall, fscore, rocauc]`` row per seed, in the order
        of ``random_states``.
    """
    import os  # local import so the helper works even if the split cell was not run

    collected_results = []
    for rs in random_states:
        df_tr_curr = pd.read_csv(os.path.join(splits_dir, "train_randst{}.csv".format(rs)))
        if include_val:
            df_val_curr = pd.read_csv(os.path.join(splits_dir, "val_randst{}.csv".format(rs)))
            df_tr_curr = pd.concat([df_tr_curr, df_val_curr])
        df_test_curr = pd.read_csv(os.path.join(splits_dir, "test_randst{}.csv".format(rs)))

        precision, recall, fscore, rocauc = solve_func(df_tr_curr, df_test_curr)
        collected_results.append([precision, recall, fscore, rocauc])
    return collected_results

In [34]:
# Evaluate the TF-IDF + logistic regression baseline on every saved seed.
tfidf_data = collect_metric(solve_func=check_tfidf)

In [24]:
# Per-seed metric rows collected by collect_metric(check_tfidf).
# NOTE(review): the saved output of this cell has three values per row, while
# check_tfidf returns four (precision, recall, fscore, rauc) - it looks stale;
# re-run the notebook top to bottom to refresh it.
tfidf_data

[[0.7781494636020585, 0.7863563402889245, 0.7810390893377704],
 [0.7821026122965965, 0.7902086677367576, 0.7848847011294975],
 [0.7774199594577708, 0.7865971107544141, 0.7803986299249164],
 [0.774434350490292, 0.78330658105939, 0.7774869571284697]]

In [36]:
# Column-wise mean over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidf_data).mean(axis=0)

array([0.7780266 , 0.78661717, 0.78095234, 0.79213747])

In [37]:
# Column-wise standard deviation over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidf_data).std(axis=0)

array([0.0027341 , 0.00244578, 0.00263564, 0.00202272])

In [7]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [14]:
def check_SVM(train_val,test, sw = russian_stopwords, ngram_range=(1, 3), max_features=150000, text_label = 'text', debug = False):
    """Fit TF-IDF + ``SGDClassifier`` on ``train_val`` and score it on ``test``.

    Despite the function name, the classifier is configured with
    ``loss='log'``, i.e. logistic regression trained by SGD, not a
    hinge-loss SVM.

    Parameters
    ----------
    train_val : pandas.DataFrame
        Rows used for fitting; must contain the ``text_label`` column and an
        ``'inappropriate'`` column.
    test : pandas.DataFrame
        Rows used for evaluation, with the same two columns.
    sw : list of str
        Stop words passed to ``TfidfVectorizer``.
    ngram_range : tuple of int
        N-gram range passed to ``TfidfVectorizer``.
    max_features : int
        Vocabulary size cap passed to ``TfidfVectorizer``.
    text_label : str
        Name of the column holding the raw text.
    debug : bool
        When truthy, print the settings, the first ten predicted labels, the
        classification report and the per-class precision/recall/F1/support.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, rauc)``: weighted-average precision,
        recall and F1 of the predicted labels, and the ROC AUC computed from
        the second column of ``predict_proba``.

    Notes
    -----
    Labels are obtained by applying Python's ``round`` to the
    ``'inappropriate'`` column (presumably a score in [0, 1] - TODO confirm).
    NOTE(review): the results saved in this notebook show identical
    precision/recall/F1 for all four seeds together with undefined-metric
    warnings, which suggests the model predicted a single class with these
    settings (``max_iter=5``, ``alpha=1e-3``) - revisit the hyper-parameters.
    """
    # Keep the binarised labels in local variables so the callers' DataFrames
    # are not modified by evaluating a model on them.
    y_train = train_val['inappropriate'].apply(round)
    y_test = test['inappropriate'].apply(round).tolist()

    # The vocabulary is learned from the fitting rows only and reused for test.
    text_transformer = TfidfVectorizer(stop_words=sw, ngram_range=ngram_range, lowercase=True, max_features=max_features)
    X_train_text = text_transformer.fit_transform(train_val[text_label].tolist())
    X_test_text = text_transformer.transform(test[text_label].tolist())

    # tol=None disables early stopping, so exactly max_iter epochs are run.
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and
    # no longer accept 'log' - adjust when upgrading the environment.
    clf = SGDClassifier(loss='log', penalty='l2',
                        alpha=1e-3, random_state=42,
                        max_iter=5, tol=None)
    clf.fit(X_train_text, y_train)

    test_preds = clf.predict(X_test_text)
    test_predsprob = clf.predict_proba(X_test_text)

    # Column 1 of predict_proba is used as the score of the positive class.
    rauc = roc_auc_score(y_test, test_predsprob[:, 1])

    if debug:
        print("ngram_range {}, max_features {}".format(ngram_range, max_features))

        print(test_preds[:10])

        print(classification_report(y_test, test_preds))

        print(precision_recall_fscore_support(y_test, test_preds))

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, test_preds, average = 'weighted')

    return precision, recall, fscore, rauc

In [15]:
# Evaluate the TF-IDF + SGDClassifier baseline on every saved seed.
tfidfSVM_data = collect_metric(solve_func=check_SVM)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Per-seed [precision, recall, fscore, rocauc] rows collected by
# collect_metric(check_SVM).
# NOTE(review): in the saved output the first three values are identical for
# every seed and only the ROC AUC varies - presumably the classifier predicted
# a single class; verify before reporting these numbers.
tfidfSVM_data

[[0.5386805997490525,
  0.7339486356340289,
  0.6213339757346165,
  0.7190274173344319],
 [0.5386805997490525,
  0.7339486356340289,
  0.6213339757346165,
  0.718764995336571],
 [0.5386805997490525,
  0.7339486356340289,
  0.6213339757346165,
  0.720968426399874],
 [0.5386805997490525,
  0.7339486356340289,
  0.6213339757346165,
  0.7100174249789919]]

In [18]:
# Column-wise mean over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidfSVM_data).mean(axis=0)

array([0.5386806 , 0.73394864, 0.62133398, 0.71719457])

In [19]:
# Column-wise standard deviation over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidfSVM_data).std(axis=0)

array([0.        , 0.        , 0.        , 0.00423022])

In [4]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
def check_NB(train_val,test, sw = russian_stopwords, ngram_range=(1, 3), max_features=150000, text_label = 'text', debug = False):
    """Fit TF-IDF + multinomial naive Bayes on ``train_val`` and score it on ``test``.

    Parameters
    ----------
    train_val : pandas.DataFrame
        Rows used for fitting; must contain the ``text_label`` column and an
        ``'inappropriate'`` column.
    test : pandas.DataFrame
        Rows used for evaluation, with the same two columns.
    sw : list of str
        Stop words passed to ``TfidfVectorizer``.
    ngram_range : tuple of int
        N-gram range passed to ``TfidfVectorizer``.
    max_features : int
        Vocabulary size cap passed to ``TfidfVectorizer``.
    text_label : str
        Name of the column holding the raw text.
    debug : bool
        When truthy, print the settings, the first ten predicted labels, the
        classification report and the per-class precision/recall/F1/support.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, rauc)``: weighted-average precision,
        recall and F1 of the predicted labels, and the ROC AUC computed from
        the second column of ``predict_proba``.

    Notes
    -----
    Labels are obtained by applying Python's ``round`` to the
    ``'inappropriate'`` column (presumably a score in [0, 1] - TODO confirm).
    """
    # Keep the binarised labels in local variables so the callers' DataFrames
    # are not modified by evaluating a model on them.
    y_train = train_val['inappropriate'].apply(round)
    y_test = test['inappropriate'].apply(round).tolist()

    # The vocabulary is learned from the fitting rows only and reused for test.
    text_transformer = TfidfVectorizer(stop_words=sw, ngram_range=ngram_range, lowercase=True, max_features=max_features)
    X_train_text = text_transformer.fit_transform(train_val[text_label].tolist())
    X_test_text = text_transformer.transform(test[text_label].tolist())

    # MultinomialNB is used with its default settings.
    nb_model = MultinomialNB()
    nb_model.fit(X_train_text, y_train)

    test_preds = nb_model.predict(X_test_text)
    test_predsprob = nb_model.predict_proba(X_test_text)

    # Column 1 of predict_proba is used as the score of the positive class.
    rauc = roc_auc_score(y_test, test_predsprob[:, 1])

    if debug:
        print("ngram_range {}, max_features {}".format(ngram_range, max_features))

        print(test_preds[:10])

        print(classification_report(y_test, test_preds))

        print(precision_recall_fscore_support(y_test, test_preds))

    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, test_preds, average = 'weighted')

    return precision, recall, fscore, rauc

In [15]:
# Evaluate the TF-IDF + multinomial naive Bayes baseline on every saved seed.
tfidfMNB_data = collect_metric(solve_func=check_NB)

In [16]:
# Column-wise mean over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidfMNB_data).mean(axis=0)

array([0.78284437, 0.75234751, 0.66689949, 0.77320666])

In [17]:
# Column-wise standard deviation over seeds: precision, recall, F1, ROC AUC.
np.asarray(tfidfMNB_data).std(axis=0)

array([0.00431964, 0.00077475, 0.00111693, 0.00315644])

In [10]:

# Tiny hand-checkable example of macro-averaged precision / recall / F1.
y_true, y_pred = np.array([1,1,0]), np.array([0,1,0])
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.75, 0.75, 0.6666666666666666, None)