In [1]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



# **Transformer**

In [2]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [3]:
train_df = pd.read_csv('../data/initial_datasets/reddit/reddit_train.csv')
test_df = pd.read_csv('../data/initial_datasets/reddit/reddit_test.csv')

In [4]:
train_df

Unnamed: 0,text,label
0,That game hurt.,-1
1,Man I love reddit.,1
2,Right? Considering it’s such an important docu...,1
3,"He isn't as big, but he's still quite popular....",-1
4,That's crazy; I went to a super [RELIGION] hig...,1
...,...,...
40696,Oh man is this true. Coffee has a seriously ne...,-1
40697,"You’re good, no worries",1
40698,"one's a rapist, and the other's a stingy yank ...",-1
40699,This is great! Can anyone make a request with ...,1


In [5]:
test_df

Unnamed: 0,text,label
0,First is the worst,-1
1,Our education system has been a complete and u...,-1
2,"The fuck you call me!? A cunt!? Damn man, didn...",-1
3,It will probably take him some time to figure ...,1
4,Somebody is really insecure about their career...,-1
...,...,...
995,Be glad you don't know the answer.,1
996,I’m laughing more that I feel I should st this...,1
997,I wouldnt necessarily call you and addict but ...,-1
998,I'm genuinely interested in the responses to t...,1


# **Classifier**

In [6]:
train_df = train_df.sample(n=1000)

In [8]:
X_train = np.array(sentence_transformer.encode(train_df['text'].to_list()))
X_test = np.array(sentence_transformer.encode(test_df['text'].to_list()))

In [9]:
y_train = train_df['label']
y_test = test_df['label']

In [10]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.843
Test acc: 0.736


In [11]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8429906542056075
0.7147385103011094


In [None]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8264616322866873
