In [6]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer

# **Transformer**

In [3]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [9]:
df = pd.read_csv('../data/initial_datasets/dota2_train_labels_translated_sanitized.csv')

In [10]:
df['label'] = df['label'].replace({'x': 1})
df['label'] = df['label'].fillna(0)
df['label'] = df['label'].astype(int)

  df['label'] = df['label'].replace({'x': 1})


In [12]:
df['label'].value_counts()

label
0    3320
1     139
Name: count, dtype: int64

In [6]:
group, data = next(iter(df.groupby('match')))

In [8]:
data

Unnamed: 0,message,label,player,special_token_message,match,translated_message,sanitized_translated_message
786,я уверен,0,2,[Team0][Player2]: я уверен,0,"""I'm sure""",I'm sure
787,что тут вард,0,2,[Team0][Player2]: что тут вард,0,"""that there's a ward here""",that there's a ward here
788,я прав?,0,2,[Team0][Player2]: я прав?,0,"""Am I right?""",Am I right?
789,на хг есть вард?,0,2,[Team0][Player2]: на хг есть вард?,0,"""Is there a ward on the high ground?""",Is there a ward on the high ground?
790,ставь сентри,0,7,[Team1][Player2]: ставь сентри,0,"""Place a sentry.""",Place a sentry.
791,"когда против тебя пудж, тут всегда вард",0,2,"[Team0][Player2]: когда против тебя пудж, тут ...",0,"""When you're up against Pudge, there's always ...","When you're up against Pudge, there's always a..."
792,узнаешь,0,7,[Team1][Player2]: узнаешь,0,"""you'll find out""",you'll find out
793,или он ближе к руне?,0,2,[Team0][Player2]: или он ближе к руне?,0,"""or is it closer to the rune?""","""or is it closer to the rune?"""
794,hg,0,6,[Team1][Player1]: hg,0,"""high ground""",high ground
795,ставь сентри дружок,0,7,[Team1][Player2]: ставь сентри дружок,0,"""Place a sentry, buddy""","Place a sentry, buddy"


In [34]:
df['group_chat_id'] = df.groupby('match').cumcount()

In [1]:
df

NameError: name 'df' is not defined

In [36]:
df[['translated_message', 'label', 'player', 'group_chat_id']]

Unnamed: 0,translated_message,label,player,group_chat_id
0,"""report invoker""",0,1,0
1,"""Feeding with the courier""",0,1,1
2,"""+ threw mid""",0,1,2
3,Easy mid,0,1,3
4,"""faggot""",0,7,4
...,...,...,...,...
3454,"""my Luna hits the one with leggings, now I see...",0,3,46
3455,GG,0,6,47
3456,"""my shadow doesn't know how to use ult""",0,3,48
3457,GG,0,9,49


# **Model For Just Text and Label**

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df[['translated_message']], df[['label']], test_size=0.3)

In [44]:
X_train = np.array(sentence_transformer.encode(X_train['translated_message'].to_list()))
X_test = np.array(sentence_transformer.encode(X_test['translated_message'].to_list()))

In [48]:
model = svm.SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

  y = column_or_1d(y, warn=True)


Train acc: 0.9653035935563816
Test acc: 0.9653179190751445


In [49]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

1.0
0.1


In [50]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8583041082164329


# **Model w/ Match-Chat-ID**

In [51]:
np.array(df['group_chat_id'])

array([ 0,  1,  2, ..., 48, 49, 50])

In [52]:
X_train, X_test, y_train, y_test = train_test_split(df[['translated_message', 'group_chat_id']], df['label'], test_size=0.3)

In [55]:
train_messages = np.array(sentence_transformer.encode(X_train['translated_message'].to_list()))

In [65]:
test_messages = np.array(sentence_transformer.encode(X_test['translated_message'].to_list()))

In [66]:
X_train = np.concatenate((train_messages, np.array(X_train['group_chat_id']).reshape(-1, 1)), axis=1)
X_test = np.concatenate((test_messages, np.array(X_test['group_chat_id']).reshape(-1, 1)), axis=1)

In [67]:
model = svm.SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.9644774886410574
Test acc: 0.9682080924855492


In [68]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8333333333333334
0.13513513513513514


In [69]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8810108810108811


# **Wasserstein**

In [5]:
import numpy as np
import ot

def wasserstein(X, Y):
    
    a = np.ones((X.shape[0],)) / X.shape[0]
    b = np.ones((Y.shape[0],)) / Y.shape[0]

    M = ot.dist(X, Y)
    M /= M.max()

    return ot.emd2(a, b, M)

In [4]:
wasserstein()

Defaulting to user installation because normal site-packages is not writeable
Collecting POT
  Downloading POT-0.9.5-cp39-cp39-macosx_11_0_arm64.whl (344 kB)
[K     |████████████████████████████████| 344 kB 8.8 MB/s eta 0:00:01
Installing collected packages: POT
Successfully installed POT-0.9.5
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
