In [1]:
import copy
import joblib
import numpy as np
import pandas as pd
import re
import string
import torch

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
# Helper methods
def train_nn(X_train, y_train):
    """Fit a multi-layer perceptron classifier on the given data and return it.

    Args:
        X_train: Feature matrix passed to ``MLPClassifier.fit``.
        y_train: Target labels passed to ``MLPClassifier.fit``.

    Returns:
        The fitted ``MLPClassifier`` instance (a new one on every call).
    """
    # Fixed hyperparameters: three hidden layers (256 -> 64 -> 16), ReLU,
    # Adam, alpha=1e-5, at most 150 iterations, early stopping enabled.
    hyperparams = dict(
        hidden_layer_sizes=(256, 64, 16),
        activation='relu',
        solver='adam',
        alpha=1e-5,
        max_iter=150,
        early_stopping=True,
    )
    model = MLPClassifier(**hyperparams)
    model.fit(X_train, y_train)
    return model

def print_results(X_train, X_test, y_train, y_test, model=None):
    """Print accuracy, classification reports and the test confusion matrix.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model: Fitted classifier exposing ``score`` and ``predict``. When
            omitted, the notebook-level ``clf`` is used, which keeps existing
            four-argument calls working.

    Prints, in order: train accuracy, test accuracy, the classification
    report on the training data, the classification report on the test data,
    and the confusion matrix on the test data.
    """
    if model is None:
        # Backward-compatible fallback to the global classifier; passing
        # `model` explicitly avoids depending on hidden kernel state.
        model = clf
    print("Train acc:\t{0:.3f}".format(model.score(X_train, y_train)))
    print("Test acc:\t{0:.3f}".format(model.score(X_test, y_test)))
    print(classification_report(y_train, model.predict(X_train)))
    # Predict on the test set once and reuse it for both test-set summaries.
    y_test_pred = model.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print(confusion_matrix(y_test, y_test_pred))

In [3]:
# Binary target: True for every row whose 'label' is anything other than 'non'.
troff_labels = pd.read_csv("sinkaf/data/troff-v1.0.tsv", sep='\t')['label']
labels = troff_labels.ne('non')
print(labels.value_counts())

False    28439
True      6845
Name: label, dtype: int64


In [4]:
# The experiment corpus was converted to vectors with a pre-trained Turkish BERT model.
# https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing
# Download the file and copy it into sinkaf/data
try:
    bert_data = pd.read_csv("sinkaf/data/bert_data.csv", header=None)
except FileNotFoundError as err:
    # Same exception type as before, but with an actionable message so a
    # fresh "Run All" tells the reader exactly what is missing.
    raise FileNotFoundError(
        "sinkaf/data/bert_data.csv is missing: download it from "
        "https://drive.google.com/file/d/1fq_Vkvg0QFpZaG1EgwdhyXYNSLqdu2tq/view?usp=sharing "
        "and copy it into sinkaf/data"
    ) from err

FileNotFoundError: [Errno 2] No such file or directory: 'sinkaf/data/bert_data.csv'

In [None]:
# Stratify so both splits keep the class ratio of `labels` (the classes are
# imbalanced, roughly 81% / 19%), and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bert_data, labels, stratify=labels, random_state=42)

In [None]:
# Generate synthetic samples for the minority class (target ratio 1.0).
# `fit_resample` is the supported sampler method; the older `fit_sample`
# alias is not available in newer imbalanced-learn releases.
smote = SMOTE(sampling_strategy=1.0)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Train the simple NN
clf = train_nn(X_train=X_train, y_train=y_train)

In [None]:
# Although accuracy is close to 97% on the training set and 82% on the test set,
# the model performs poorly on sentences that contain profanity. The accuracy
# comes out high because of the class imbalance in the test set.
print_results(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Use the data directly (no resampling).
# Stratified and seeded so the imbalanced classes keep their ratio in both
# splits and the experiment is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bert_data, labels, stratify=labels, random_state=42)
clf = train_nn(X_train, y_train)
print_results(X_train, X_test, y_train, y_test)

In [None]:
# Persist the current classifier to disk
joblib.dump(value=clf, filename="sinkaf/data/clf_nn_precision.joblib")

In [None]:
# Undersample the non-offensive data
undersampler = RandomUnderSampler()
bert_under, labels_under = undersampler.fit_resample(X=bert_data, y=labels)
print(f"Normal length: {len(bert_data)}, Undersampled length: {len(bert_under)}")

In [None]:
# Try a model trained on the undersampled data
X_train, X_test, y_train, y_test = train_test_split(
    bert_under,
    labels_under,
    stratify=labels_under,
)
clf = train_nn(X_train=X_train, y_train=y_train)
print_results(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Prepare the final model.
# The undersampled data is used.
# Across several random undersamples/splits, the model with the highest
# held-out accuracy is kept.
undersampler = RandomUnderSampler()
best_clf = None
best_acc = 0
for i in range(15):
    bert_under, labels_under = undersampler.fit_resample(bert_data, labels)
    X_train, X_test, y_train, y_test = train_test_split(bert_under, labels_under, stratify=labels_under)
    clf = train_nn(X_train, y_train)
    # Score on the held-out split only: scoring on all of `bert_under` would
    # include the rows the model was trained on and reward overfitting.
    # NOTE(review): picking the best of 15 on X_test still makes `best_acc`
    # an optimistic estimate; a separate validation split would be cleaner.
    acc = clf.score(X_test, y_test)
    print(acc)
    if acc > best_acc:
        best_acc = acc
        # train_nn builds a new classifier on every call, so keeping a
        # reference is enough; no deep copy is needed.
        best_clf = clf

In [None]:
# Show the best accuracy found, then persist the corresponding model
print(best_acc)
joblib.dump(value=best_clf, filename="sinkaf/data/clf_nn_recall.joblib")

In [None]:
# Vectorization with a pre-trained BERT model
# This performs a one-time load
BERT_CHECKPOINT = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# When converting sentences to vectors, shorter ones are padded up to the maximum length
# The maximum length for the sinkaf data is 113
MAX_SENTENCE_TOKEN_LENGTH = 113

In [None]:
# Offensive or not? Sample sentences that are tokenized and classified below.

test = [
    "guzel karisin ha", 
    "cok guzelsin", 
    "yaz transfer sezonuna lionel messi damga vuracak gibi gözüküyor", 
    "dal sarkar kartal kalkar",
    "amk cocugu",
    "aq bebesindeki havaya bak sen",
    "kral cocuk bizim alper",
    "erol bulut istifa",
    "sen kendini ne saniyorsun kopek"]

In [None]:
# Tokenize every sentence with special tokens added. Truncate to the fixed
# length so an over-long sentence cannot produce a ragged batch that fails to
# convert to a tensor (the padding below only handles shorter sentences).
tokenized = [
    tokenizer.encode(
        s,
        add_special_tokens=True,
        max_length=MAX_SENTENCE_TOKEN_LENGTH,
        truncation=True,
    )
    for s in test
]
# Right-pad each id list with 0 up to MAX_SENTENCE_TOKEN_LENGTH.
# NOTE(review): no attention mask is built for the padded positions; confirm
# this matches how the training vectors in bert_data.csv were produced.
padded = np.array([s + [0] * (MAX_SENTENCE_TOKEN_LENGTH - len(s)) for s in tokenized])
input_ids = torch.tensor(padded).to(torch.int64)

In [None]:
# Convert sentences to vectors
def sentence_2_vec(input_id):
    """Run the notebook-level ``bert`` model and return one vector per sequence.

    Args:
        input_id: Tensor of token ids passed directly to ``bert``; it is
            indexed as (sequence, position) on the model output.

    Returns:
        NumPy array holding, for every sequence, the vector at position 0 of
        the first element of the model output (presumably the [CLS]
        embedding, since special tokens are added during tokenization).
    """
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = bert(input_id)
        first_token_states = model_output[0][:, 0, :]
        return first_token_states.numpy()

In [None]:
test_vector = sentence_2_vec(input_ids)
# Predict with the selected final model (`best_clf`). Plain `clf` is only
# whichever model was trained last, not necessarily the best one.
best_clf.predict(test_vector)