In [2]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer



# **Transformer**

In [3]:
# Sentence-embedding model; used further down to encode the sentences of both datasets.
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [12]:
# gen_df: generated samples (per the path, the "control" Reddit generation run).
# orig_df: the original Reddit test split the generated samples are compared against.
# NOTE(review): both previews below show an "Unnamed: 0" column, which looks like a
# saved index -- consider read_csv(..., index_col=0) if nothing downstream relies on it.
gen_df = pd.read_csv('../data/generated/reddit/control/initial_gen.csv')
orig_df = pd.read_csv('../data/initial_datasets/reddit_test.csv')

In [13]:
# Preview the generated data (text in `sentences`, target in `labels`).
gen_df

Unnamed: 0,sentences,labels
0,Finally finished that game last night! So epic! 🎮,1
1,"Ugh, stuck in traffic AGAIN. Feels like I'm wa...",-1
2,[NAME]'s new album dropped today and it's fire...,1
3,Why is this app so glitchy? It's driving me nu...,-1
4,"Got a surprise gift from [NAME], totally made ...",1
...,...,...
995,"Honestly, this movie was a huge letdown 🤦‍♂️",-1
996,"Found the cutest puppy today, made my year! 🐶",1
997,Service at this restaurant was awful 👎,-1
998,This book is an absolute masterpiece 📚,1


In [14]:
# Preview the original data; at this point its columns are still named `text` / `label`.
orig_df

Unnamed: 0,text,label
0,First is the worst,-1
1,Our education system has been a complete and u...,-1
2,"The fuck you call me!? A cunt!? Damn man, didn...",-1
3,It will probably take him some time to figure ...,1
4,Somebody is really insecure about their career...,-1
...,...,...
995,Be glad you don't know the answer.,1
996,I’m laughing more that I feel I should st this...,1
997,I wouldnt necessarily call you and addict but ...,-1
998,I'm genuinely interested in the responses to t...,1


In [15]:
# Give the original data the same column names as gen_df (`sentences`, `labels`).
column_aliases = {'text': 'sentences', 'label': 'labels'}
orig_df = orig_df.rename(columns=column_aliases)

# **MMD**

In [16]:
# Embed every sentence of each dataset: X holds the original texts, Y the generated ones.
orig_sentences = orig_df['sentences'].to_list()
gen_sentences = gen_df['sentences'].to_list()

X = np.array(sentence_transformer.encode(orig_sentences))
Y = np.array(sentence_transformer.encode(gen_sentences))

In [17]:
def report_MMD(X, Y, normalize=False):
    """Evaluate the project's ``MMD`` helper on two sample sets with two kernels.

    Both inputs are converted with ``torch.tensor`` and handed to ``MMD``
    (imported from the local ``mmd`` module), first with the ``"rbf"``
    kernel and then with the ``"multiscale"`` kernel.

    Args:
        X: First sample set; anything ``torch.tensor`` accepts. Must expose
            ``.shape`` when ``normalize`` is true.
        Y: Second sample set, same requirements as ``X``.
        normalize: When true, each value is divided by
            ``sqrt(1 / X.shape[0] + 1 / Y.shape[0])``.

    Returns:
        A 2-tuple ``(rbf_value, multiscale_value)``.
    """
    x_tensor, y_tensor = torch.tensor(X), torch.tensor(Y)

    # Run both kernels first (rbf, then multiscale), then pull out the scalars.
    kernel_results = [MMD(x_tensor, y_tensor, kernel) for kernel in ("rbf", "multiscale")]
    rbf_value, multiscale_value = (result.item() for result in kernel_results)

    if not normalize:
        return rbf_value, multiscale_value

    # Same factor for both kernels, so compute it once.
    size_factor = np.sqrt((1.0 / X.shape[0]) + (1.0 / Y.shape[0]))
    return rbf_value / size_factor, multiscale_value / size_factor

In [18]:
# Sanity check on the embedding matrix of the original data (one row per encoded sentence).
X.shape

(1000, 768)

In [19]:
# Size-normalised (rbf, multiscale) MMD between original (X) and generated (Y) embeddings.
report_MMD(X, Y, normalize=True)

(0.878304784672583, 3.2406348060106684)

# **Wasserstein**

In [20]:
import numpy as np
import ot

def wasserstein(X, Y):
    """Return the exact optimal-transport cost between two uniformly weighted sample sets.

    Every row of ``X`` and of ``Y`` is one sample carrying equal mass. The
    pairwise cost matrix comes from ``ot.dist`` with its default metric
    (squared Euclidean per the POT documentation) and is rescaled by its
    largest entry before ``ot.emd2`` solves the transport problem, so the
    result is relative to the largest pairwise cost, not an absolute distance.

    Args:
        X: Array whose first axis indexes the samples of the first set.
        Y: Array whose first axis indexes the samples of the second set.

    Returns:
        The transport cost reported by ``ot.emd2``. It is ``0`` when every
        pairwise cost is zero (e.g. both sets are the same single point).

    Raises:
        ValueError: If either sample set is empty.
    """
    n_x, n_y = X.shape[0], Y.shape[0]
    if n_x == 0 or n_y == 0:
        # Previously this surfaced as an opaque "zero-size array" ValueError from M.max().
        raise ValueError("wasserstein() needs at least one sample in both X and Y")

    # Uniform weights: each sample carries 1/n of the total mass.
    a = np.ones((n_x,)) / n_x
    b = np.ones((n_y,)) / n_y

    M = ot.dist(X, Y)

    # An all-zero cost matrix (identical point clouds) would be turned into NaNs by
    # dividing by its max; the transport cost is simply 0 there, so skip the rescale.
    max_cost = M.max()
    if max_cost > 0:
        # Out-of-place divide so an integer-typed cost matrix does not fail.
        M = M / max_cost

    return ot.emd2(a, b, M)

In [21]:
# Max-normalised transport cost between original (X) and generated (Y) embeddings.
wasserstein(X, Y)

Python(94782) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0.5666999897956844

# **Synthetic Classifier**

In [22]:
# Training split = generated data, test split = original data.
# These are aliases of gen_df / orig_df, not copies.
train_df, test_df = gen_df, orig_df

X_train, y_train = train_df['sentences'], train_df['labels']
X_test, y_test = test_df['sentences'], test_df['labels']

In [23]:
# Preview the training frame (an alias of gen_df).
train_df

Unnamed: 0,sentences,labels
0,Finally finished that game last night! So epic! 🎮,1
1,"Ugh, stuck in traffic AGAIN. Feels like I'm wa...",-1
2,[NAME]'s new album dropped today and it's fire...,1
3,Why is this app so glitchy? It's driving me nu...,-1
4,"Got a surprise gift from [NAME], totally made ...",1
...,...,...
995,"Honestly, this movie was a huge letdown 🤦‍♂️",-1
996,"Found the cutest puppy today, made my year! 🐶",1
997,Service at this restaurant was awful 👎,-1
998,This book is an absolute masterpiece 📚,1
