In [1]:
import sys
sys.path.append('../utilities/')

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



In [51]:
orig_df = pd.read_csv('../data/initial_datasets/dota2_test.csv')
gen_df = pd.read_csv('../data/generated/dota2/control/control_gen.csv')
orig_df = orig_df.sample(n=1000)

In [52]:
orig_df

Unnamed: 0,translated_message,label
317,"""good luck next""",0
285,"""MY EARTHSHAKER STAY BEHIND WITH ECHO""",0
167,"""and so""",0
920,17,0
258,"""Will he be coming?""",0
...,...,...
387,"""go defend""",0
418,some day,0
759,jakiro,0
83,"""Fucking idiot""",0


In [53]:
gen_df

Unnamed: 0,sentences,labels
0,I'm reporting u aftr game,0
1,"nice play team, wp!",1
2,wtf r u doin?,0
3,"keep it up, ur doing gr8!",1
4,lol uninstall pls,0
...,...,...
995,"let's push mid, we got this",1
996,"u can't last hit, uninstall",0
997,nice save back there,1
998,"stop throwing, wtf",0


In [54]:
orig_df = orig_df.rename(columns={'translated_message': 'sentences', 'label': 'labels'})
orig_df['labels'] = orig_df['labels'].fillna(0)
orig_df['labels'] = orig_df['labels'].replace({'x': 1})
orig_df['labels'] = orig_df['labels'].astype(int)
orig_df = orig_df[['sentences', 'labels']]

# **Tools**

In [55]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **MMD**

In [56]:
X = np.array(sentence_transformer.encode(orig_df['sentences'].to_list()))
Y = np.array(sentence_transformer.encode(gen_df['sentences'].to_list()))

In [57]:
def report_MMD(X, Y, normalize=False):
    
    tensorX = torch.tensor(X)
    tensorY = torch.tensor(Y)
    rbf_mmd = MMD(tensorX, tensorY, "rbf")
    scale_mmd = MMD(tensorX, tensorY, "multiscale")


    if normalize:
        return (rbf_mmd.item() / np.sqrt((1.0 / X.shape[0]) + (1.0 / Y.shape[0]))), (scale_mmd.item() / np.sqrt(1.0 / X.shape[0] + 1.0 / Y.shape[0]))
    else:
        return rbf_mmd.item(), scale_mmd.item()

In [58]:
report_MMD(X, Y, normalize=True)

(0.8111742953883601, 3.2885409826869734)

In [60]:
import numpy as np
import ot

def wasserstein(X, Y):
    
    a = np.ones((X.shape[0],)) / X.shape[0]
    b = np.ones((Y.shape[0],)) / Y.shape[0]

    M = ot.dist(X, Y)
    M /= M.max()

    return ot.emd2(a, b, M)

In [61]:
wasserstein(X, Y)

Python(49215) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0.5633513201996674

# **Classification**

## **Train on Synthetic Only**

In [62]:
train_df = gen_df
test_df = orig_df

X_train = train_df['sentences']
X_test = test_df['sentences']

y_train = train_df['labels']
y_test = test_df['labels']

In [63]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

In [64]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.991
Test acc: 0.297


In [65]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.01455604075691412
0.2777777777777778


In [66]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.19412747810050715


# **Train on Synthetic + 100 Real**

In [67]:
gen_df['type'] = 'synthetic'
orig_df['type'] = 'original'

In [68]:
orig_sample_df = pd.read_csv('../data/initial_datasets/dota2_train_labels_translated_sanitized.csv')
orig_sample_df = orig_sample_df.drop(orig_df.index)
orig_sample_df = orig_sample_df.rename(columns={'translated_message': 'sentences', 'label': 'labels'})
orig_sample_df['labels'] = orig_sample_df['labels'].fillna(0)
orig_sample_df['labels'] = orig_sample_df['labels'].replace({'x': 1})
orig_sample_df['labels'] = orig_sample_df['labels'].astype(int)
orig_sample_df = orig_sample_df[['sentences', 'labels']]

  orig_sample_df['labels'] = orig_sample_df['labels'].replace({'x': 1})


In [69]:
orig_sample_df['type'] = 'original'

Unnamed: 0,sentences,labels,type
1000,"""you will""",0,original
1001,) you won't ),0,original
1002,"""Bristle, I fucked your family in the ass.""",0,original
1003,"""Bristle, your mom is under my table sucking.""",0,original
1004,gg,0,original
...,...,...,...
3454,"""my Luna hits the one with leggings, now I see...",0,original
3455,GG,0,original
3456,"""my shadow doesn't know how to use ult""",0,original
3457,GG,0,original


In [70]:
gen_df

Unnamed: 0,sentences,labels,type
0,I'm reporting u aftr game,0,synthetic
1,"nice play team, wp!",1,synthetic
2,wtf r u doin?,0,synthetic
3,"keep it up, ur doing gr8!",1,synthetic
4,lol uninstall pls,0,synthetic
...,...,...,...
995,"let's push mid, we got this",1,synthetic
996,"u can't last hit, uninstall",0,synthetic
997,nice save back there,1,synthetic
998,"stop throwing, wtf",0,synthetic


In [74]:
train_df = pd.concat([gen_df.sample(n=900), orig_sample_df.sample(n=100)]).sample(frac=1).reset_index(drop=True)
test_df = orig_df.sample(frac=1).reset_index(drop=True)

X_train = train_df['sentences']
X_test = test_df['sentences']

y_train = train_df['labels']
y_test = test_df['labels']

In [76]:
train_df['type'].value_counts()

type
synthetic    900
original     100
Name: count, dtype: int64

In [77]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

In [78]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.981
Test acc: 0.891


In [81]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.0
0.0


In [82]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.22697671738128167


# **Wasserstein**

In [28]:
import numpy as np
import ot

def wasserstein(X, Y):
    
    a = np.ones((X.shape[0],)) / X.shape[0]
    b = np.ones((Y.shape[0],)) / Y.shape[0]

    M = ot.dist(X, Y)
    M /= M.max()

    return ot.emd2(a, b, M)

In [5]:
gen_df

Unnamed: 0,sentences,labels
0,gg ez noobs learn to play,0
1,"nice play team, keep it up!",1
2,fucking idiot stop feeding,0
3,"good fight, let's keep pushing",1
4,report this faggot feeder,0
...,...,...
995,stop feeding you retard,0
996,"gg wp guys, let's queue again",1
997,bunch of noob bitches,0
998,"easy game, good teamwork",1


In [None]:
orig_d

Unnamed: 0,message,label,player,special_token_message,match,translated_message,sanitized_translated_message
986,mk,,2,[Team0][Player2]: mk,33,mk,mk
2898,trip,,4,[Team0][Player4]: trip,64,trip,trip
3315,))),,9,[Team1][Player4]: ))),82,))),)))
1720,QUE PEGA ISSO,,5,[Team1][Player0]: QUE PEGA ISSO,42,"""WHAT'S UP WITH THAT""",WHAT'S UP WITH THAT
552,вот и думай мусорка на морте,,2,[Team0][Player2]: вот и думай мусорка на морте,30,"""there you go, think about it, trash on Mortred""","there you go, think about it, **** on Mortred"
...,...,...,...,...,...,...,...
78,x3 gem,,9,[Team1][Player4]: x3 gem,13,"""x3 gem""",x3 gem
1680,gg,,5,[Team1][Player0]: gg,84,gg,gg
2369,gg,,2,[Team0][Player2]: gg,80,gg,gg
1442,i also kill,,1,[Team0][Player1]: i also kill,43,"""i also kill""",i also kill


In [26]:
X = np.array(sentence_transformer.encode(orig_df['translated_message'].to_list()))
Y = np.array(sentence_transformer.encode(gen_df['sentences'].to_list()))

In [29]:
wasserstein(X, Y)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0.5516760837584738

# **Classifier**

In [49]:
orig_df

Unnamed: 0,sentences,labels,type
3094,"""come on""",0,original
1046,RAMPAGE,0,original
214,"""fucking lag""",0,original
1971,"""your mom, idiot?""",0,original
1973,"""nothing""",0,original
...,...,...,...
179,snipshit,0,original
603,Gg,0,original
294,game,0,original
2813,"""well at least""",0,original


In [50]:
gen_df

Unnamed: 0,sentences,labels,type
0,gg ez noobs learn to play,0,synthetic
1,"nice play team, keep it up!",1,synthetic
2,fucking idiot stop feeding,0,synthetic
3,"good fight, let's keep pushing",1,synthetic
4,report this faggot feeder,0,synthetic
...,...,...,...
995,stop feeding you retard,0,synthetic
996,"gg wp guys, let's queue again",1,synthetic
997,bunch of noob bitches,0,synthetic
998,"easy game, good teamwork",1,synthetic
