In [2]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



In [3]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [4]:
df = pd.read_csv('../data/initial_datasets/reddit_binary_labels.csv')

In [7]:
test_df = df.sample(n=1000)

In [8]:
test_df.to_csv('../data/initial_datasets/reddit_test.csv', index=False)

In [9]:
train_df = df.drop(test_df.index)

In [12]:
train_df.to_csv('../data/initial_datasets/reddit_train.csv', index=False)

In [13]:
df['label'].value_counts() / df.shape[0]

label
 1    0.616556
-1    0.383444
Name: count, dtype: float64

# **Classifier**

In [39]:
n = [10, 50, 100, 200]

In [40]:
for ni in n:
    train_sample = train_df.sample(n=ni)
    X_train = np.array(sentence_transformer.encode(train_sample['text'].to_list()))
    y_train = train_sample['label']

    X_test = np.array(sentence_transformer.encode(test_df['text'].to_list()))
    y_test = test_df['label']

    model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    print(ni)
    print(f"Test Acc: {test_acc}")

    y_prob = model.predict_proba(X_test)[:, 1]
    print(f"Roc Auc: {roc_auc_score(y_test, y_prob)}")

    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")

10
Test Acc: 0.593
Roc Auc: 0.35859327689948844
Precision: 0.6830065359477124
Recall: 0.6624405705229794
50
Test Acc: 0.697
Roc Auc: 0.770938287829788
Precision: 0.811787072243346
Recall: 0.6767036450079239
100
Test Acc: 0.682
Roc Auc: 0.7638776150043592
Precision: 0.8050682261208577
Recall: 0.6545166402535658
200
Test Acc: 0.693
Roc Auc: 0.7658145757368826
Precision: 0.78125
Recall: 0.7131537242472267


In [34]:
train_sample = train_df.sample(n=n)

In [35]:
train_sample['label'].value_counts()

label
 1    124
-1     76
Name: count, dtype: int64

In [36]:
X_train = np.array(sentence_transformer.encode(train_sample['text'].to_list()))
y_train = train_sample['label']

In [37]:
X_test = np.array(sentence_transformer.encode(test_df['text'].to_list()))
y_test = test_df['label']

In [None]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Test Acc: {test_acc}")

y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

Test Acc: 0.68
0.8031189083820662
0.652931854199683
0.7522880617078754
