In [1]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



# **Transformer**

In [2]:
# Sentence encoder used further down to turn chat messages into feature vectors.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
sentence_transformer = SentenceTransformer(EMBEDDING_MODEL_NAME)

# **Data Pre-processing**

In [3]:
# Load the Dota 2 train and test splits from their CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/initial_datasets/dota2/dota2_train.csv',
        '../data/initial_datasets/dota2/dota2_test.csv',
    )
)

In [4]:
# Preview the training split; the frame has `translated_message` and `label` columns.
train_df

Unnamed: 0,translated_message,label
0,"""faggot""",0
1,"""there's a monkey""",0
2,"""Sniper, how do you like our mid player?""",0
3,"""You're all corpses, guys""",0
4,"""Not bad""",0
...,...,...
2454,"""my Luna hits the one with leggings, now I see...",0
2455,GG,0
2456,"""my shadow doesn't know how to use ult""",0
2457,GG,0


In [5]:
# Preview the test split; same columns as the training frame.
test_df

Unnamed: 0,translated_message,label
0,?,0
1,"""is this solo?""",0
2,lol,0
3,"""please report that kunka please""",0
4,"""dauhdasduhas""",0
...,...,...
995,so close,0
996,"""I had ping""",0
997,"""sf well done""",0
998,x,0


# **Classifier**

In [7]:
# Subsample the training split to keep embedding and SVM fitting cheap.
# A fixed seed makes the subsample -- and every metric computed from it --
# reproducible under Restart & Run All.
N_TRAIN_SAMPLES = 1000
SAMPLE_SEED = 42
train_df = train_df.sample(
    # Cap at the number of available rows so the cell does not raise if the
    # frame holds fewer than N_TRAIN_SAMPLES rows.
    n=min(N_TRAIN_SAMPLES, len(train_df)),
    random_state=SAMPLE_SEED,
)

In [8]:
# Embed the message text of each split with the sentence encoder; the
# resulting arrays are the feature matrices for the classifier.
train_messages = train_df['translated_message'].to_list()
test_messages = test_df['translated_message'].to_list()

X_train = np.array(sentence_transformer.encode(train_messages))
X_test = np.array(sentence_transformer.encode(test_messages))

In [9]:
# Target vectors: the `label` column of each split, row-aligned with the
# messages that were embedded from the same frames.
y_train, y_test = train_df['label'], test_df['label']

In [12]:
# Linear SVM on the sentence embeddings.
# - class_weight='balanced' reweights classes inversely to their frequency.
# - probability=True enables predict_proba; scikit-learn fits that calibration
#   with an internal, randomised cross-validation, so random_state is pinned to
#   keep the probabilities (and any metric derived from them) reproducible.
model = svm.SVC(
    kernel='linear',
    probability=True,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (optimistic by construction).
# Metric calls use scikit-learn's (y_true, y_pred) argument order.
y_train_pred = model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)
print(f'Train acc: {train_acc}')

# Accuracy on the held-out test split. `y_pred` is kept as the name of the
# test predictions because the precision/recall cell below reads it.
y_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f'Test acc: {test_acc}')

Train acc: 0.941
Test acc: 0.916


In [13]:
# Print precision, then recall, of the test-set predictions.
for metric in (precision_score, recall_score):
    print(metric(y_test, y_pred))

0.24468085106382978
0.6388888888888888


In [14]:
# ROC AUC on the test split, scored with the predicted probability in
# column 1 of predict_proba (the second entry of model.classes_ --
# presumably label 1; confirm against the label encoding).
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
test_auc = roc_auc_score(y_test, y_prob)
print(test_auc)

0.9014522821576763
