In [1]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [3]:
df = pd.read_csv('../data/initial_datasets/dota2_train_labels_translated_sanitized.csv')

In [4]:
df['label'] = df['label'].replace({'x': 1})
df['label'] = df['label'].fillna(0)
df['label'] = df['label'].astype(int)

  df['label'] = df['label'].replace({'x': 1})


In [7]:
test_df = df[['translated_message', 'label']].sample(n=1000)

In [10]:
train_df = df.drop(test_df.index)

In [17]:
train_df = train_df[['translated_message', 'label']]

In [22]:
df['label'].value_counts() / df.shape[0]

label
0    0.959815
1    0.040185
Name: count, dtype: float64

# **Classifier**

In [54]:
n = [10, 50, 100, 200]

In [56]:
for ni in n:
    train_sample = train_df.sample(n=ni)
    X_train = np.array(sentence_transformer.encode(train_sample['translated_message'].to_list()))
    y_train = train_sample['label']

    X_test = np.array(sentence_transformer.encode(test_df['translated_message'].to_list()))
    y_test = test_df['label']

    model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    print(ni)
    print(f"Test Acc: {test_acc}")

    y_prob = model.predict_proba(X_test)[:, 1]
    print(f"Roc Auc: {roc_auc_score(y_test, y_prob)}")

    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")

10
Test Acc: 0.959
Roc Auc: 0.37427962194559705
Precision: 0.0
Recall: 0.0
50
Test Acc: 0.964
Roc Auc: 0.5945424158598434
Precision: 0.0
Recall: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


100
Test Acc: 0.955
Roc Auc: 0.6895458736745044
Precision: 0.2
Recall: 0.08333333333333333
200
Test Acc: 0.954
Roc Auc: 0.8312586445366529
Precision: 0.32142857142857145
Recall: 0.25


In [48]:
train_sample = train_df.sample(n=n)

In [49]:
train_sample['label'].value_counts()

label
0    185
1     15
Name: count, dtype: int64

In [53]:
X_train = np.array(sentence_transformer.encode(train_sample['translated_message'].to_list()))
y_train = train_sample['label']

KeyboardInterrupt: 

In [51]:
X_test = np.array(sentence_transformer.encode(test_df['translated_message'].to_list()))
y_test = test_df['label']

In [52]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Test Acc: {test_acc}")

print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

Test Acc: 0.933
0.21818181818181817
0.3333333333333333
0.8908050945136007
