In [1]:
import sys
sys.path.append('../utilities/')

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import torch
import json



In [3]:
df = pd.read_csv('../data/initial_datasets/reddit_binary_labels.csv')

In [4]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

In [6]:
df

Unnamed: 0,text,label
0,That game hurt.,-1
1,Man I love reddit.,1
2,Right? Considering it’s such an important docu...,1
3,"He isn't as big, but he's still quite popular....",-1
4,That's crazy; I went to a super [RELIGION] hig...,1
...,...,...
41696,"You’re good, no worries",1
41697,"one's a rapist, and the other's a stingy yank ...",-1
41698,Ugh i wish i had seen this earlier! Have fun a...,1
41699,This is great! Can anyone make a request with ...,1


In [81]:
y.shape

(41701,)

In [80]:
X.shape

(41701, 768)

In [5]:
sample_df = df.sample(n=1000)

In [8]:
transformer = SentenceTransformer('all-mpnet-base-v2')
X = np.array(transformer.encode(sample_df['text'].to_list()))
y = np.array(sample_df['label'].to_list())

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [16]:
model = svm.SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.88
Test acc: 0.7366666666666667


In [11]:
param_grid = {
    'C': [10**(i) for i in range(-2, 3)],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
}

In [19]:
model = svm.SVC(probability=True)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, verbose=0, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.7685714285714286
{'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [None]:
y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

In [25]:
best_model = grid_search.best_estimator_

In [26]:
y_pred = best_model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.88
Test acc: 0.7366666666666667


In [17]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.7540106951871658
0.8103448275862069


In [28]:
y_prob = best_model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8294563035942346


# **Control Classifier**

In [55]:
gen_df = pd.read_csv('../data/generated/reddit/privacy_protected/new_reddit_agg_stats_gen.csv')
orig_df = pd.read_csv('../data/initial_datasets/reddit_test.csv')
orig_df = orig_df.sample(n=1000)

In [7]:
orig_df

Unnamed: 0,text,label
750,An especially large comfortable shirty [NAME] ...,-1
733,State of [NAME] trying to be funny and spellin...,1
151,You can still joke but people will react negat...,1
773,"I will, thank you!",1
923,Look at her all grown up 😍beautiful.,1
...,...,...
207,no worries! expense double date ;p I hope you ...,1
187,Me too I hope all goes well for both of us,1
770,I like him too but when we DFAd him last year ...,1
782,It's one of my most hoped-for things. I think ...,1


In [56]:
gen_df['type'] = 'synthetic'
orig_df['type'] = 'original'

In [57]:
orig_df.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

## **Only Synth Classifier**

In [58]:
train_df = gen_df
test_df = orig_df

X_train = train_df['sentences']
y_train = train_df['labels']

X_test = test_df['sentences']
y_test = test_df['labels']

In [59]:
train_df['type'].value_counts()

type
synthetic    1000
Name: count, dtype: int64

In [60]:
test_df['type'].value_counts()

type
original    1000
Name: count, dtype: int64

In [61]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

In [62]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 1.0
Test acc: 0.677


In [63]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.868421052631579
0.5752773375594294


In [64]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8074055463217074


In [53]:
report_MMD(X_train, X_test)

(0.031891822814941406, 0.11844038963317871)

## **Only Synth + 100 Samples**

In [17]:
orig_df

Unnamed: 0,sentences,labels,type
750,An especially large comfortable shirty [NAME] ...,-1,original
733,State of [NAME] trying to be funny and spellin...,1,original
151,You can still joke but people will react negat...,1,original
773,"I will, thank you!",1,original
923,Look at her all grown up 😍beautiful.,1,original
...,...,...,...
207,no worries! expense double date ;p I hope you ...,1,original
187,Me too I hope all goes well for both of us,1,original
770,I like him too but when we DFAd him last year ...,1,original
782,It's one of my most hoped-for things. I think ...,1,original


In [65]:
orig_train_df = pd.read_csv('../data/initial_datasets/reddit_train.csv')
orig_train_df['type'] = 'original'
orig_train_df = orig_train_df.sample(n=100)

In [66]:
orig_train_df.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

In [18]:
gen_df

Unnamed: 0,sentences,labels,type
0,Finally finished that game last night! So epic! 🎮,1,synthetic
1,"Ugh, stuck in traffic AGAIN. Feels like I'm wa...",-1,synthetic
2,[NAME]'s new album dropped today and it's fire...,1,synthetic
3,Why is this app so glitchy? It's driving me nu...,-1,synthetic
4,"Got a surprise gift from [NAME], totally made ...",1,synthetic
...,...,...,...
995,"Honestly, this movie was a huge letdown 🤦‍♂️",-1,synthetic
996,"Found the cutest puppy today, made my year! 🐶",1,synthetic
997,Service at this restaurant was awful 👎,-1,synthetic
998,This book is an absolute masterpiece 📚,1,synthetic


In [67]:
gen_df['type'] = 'synthetic'
orig_df['type'] = 'original'

In [None]:
orig_sample = orig_df.sample(n=1000)

In [68]:
train_df = pd.concat([gen_df.sample(n=900), orig_train_df]).sample(frac=1).reset_index(drop=True)
test_df = orig_df.sample(frac=1).reset_index(drop=True)

X_train = train_df['sentences']
X_test = test_df['sentences']

y_train = train_df['labels']
y_test = test_df['labels']

In [69]:
train_df['type'].value_counts()

type
synthetic    900
original     100
Name: count, dtype: int64

In [70]:
test_df['type'].value_counts()

type
original    1000
Name: count, dtype: int64

In [28]:
test_df['labels'].value_counts()

labels
 1    631
-1    369
Name: count, dtype: int64

In [None]:
X_train.shuffle()

264      Just had the best sushi ever, highly recommend! 🍣
679              Woke up late and missed my train, again 😒
640            Just got into biking and it's amazing! 🚴‍♂️
69              These bugs in the game are driving me nuts
589            What a waste of time, totally disappointed.
                               ...                        
13854    I think my worry would be if another dog attac...
2988                              Now I can’t unhear it...
4012     She went from normal mom to ~~[NAME] amazed by...
29357    I started this morning barging seven minutes l...
9230     A complete lack of self-reflection is a prereq...
Name: sentences, Length: 1000, dtype: object

In [71]:
print(X_train.apply(type).value_counts())

sentences
<class 'str'>    1000
Name: count, dtype: int64


In [72]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

In [73]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.994
Test acc: 0.715


In [74]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8134057971014492
0.7115689381933439


In [75]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.7886500972775179


# **MMD**

In [51]:
def report_MMD(X, Y, normalize=False):
    
    tensorX = torch.tensor(X)
    tensorY = torch.tensor(Y)
    rbf_mmd = MMD(tensorX, tensorY, "rbf")
    scale_mmd = MMD(tensorX, tensorY, "multiscale")


    if normalize:
        return (rbf_mmd.item() / np.sqrt((1.0 / X.shape[0]) + (1.0 / Y.shape[0]))), (scale_mmd.item() / np.sqrt(1.0 / X.shape[0] + 1.0 / Y.shape[0]))
    else:
        return rbf_mmd.item(), scale_mmd.item()

In [19]:
orig_df

Unnamed: 0,sentences,labels
6729,The way that ladder is attached to the buildin...,-1
33148,The left solution is to make them all citizens...,-1
25660,Me too! They were half off at the grocery stor...,1
11460,She forced him to sing a valentine song by thr...,1
5842,It is actually out on the steam store right now.,1
...,...,...
34084,What absolute garbage,-1
28297,"Reminds me of that last scene in Coraline, whe...",1
804,"""If you don't support no go zones, you are rac...",-1
27159,"That is absurd, man. I'm sorry you don't belie...",-1


In [20]:
sample_df

Unnamed: 0,text,label
13205,I used it for a while. It made me way more soc...,1
23193,Now this is a Contrapoints video I want to see,1
33371,May [NAME] bless and keep the President... far...,1
38983,The main reason I left is a hidden mmr used to...,-1
29165,I'm glad you could relate and that you also fo...,1
...,...,...
22908,You can. I've used my Kroger-linked phone numb...,1
27778,I liked it better when [NAME] was half naked,-1
9024,[NAME] game sucks. He's good at comps but his ...,1
22259,>[NAME] One of the most horrifying stories I'v...,1


In [9]:
orig_dist = np.array(sentence_transformer.encode(orig_df['sentences'].to_list()))
sample_dist = np.array(sentence_transformer.encode(gen_df['sentences'].to_list()))

In [52]:
report_MMD(orig_dist, sample_dist, normalize=True)

NameError: name 'orig_dist' is not defined

In [12]:
gen_df = pd.read_csv('../data/generated/reddit/privacy_protected/new_reddit_agg_stats_gen.csv')
orig_df = pd.read_csv('../data/initial_datasets/reddit_binary_labels.csv')
orig_df = orig_df.sample(n=1000)

In [13]:
orig_df.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

In [14]:
orig_dist = np.array(sentence_transformer.encode(orig_df['sentences'].to_list()))
sample_dist = np.array(sentence_transformer.encode(gen_df['sentences'].to_list()))

In [10]:
import numpy as np
import ot

def wasserstein(X, Y):
    
    a = np.ones((X.shape[0],)) / X.shape[0]
    b = np.ones((Y.shape[0],)) / Y.shape[0]

    M = ot.dist(X, Y)
    M /= M.max()

    return ot.emd2(a, b, M)

In [15]:
wasserstein(orig_dist, sample_dist)

0.5408510427325971