In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import sys
sys.path.append('../utilities/')

In [3]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import torch
import re

# **Sentence Transfromer**

In [25]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Reading Data**

In [5]:
df = pd.read_csv('../data/generated/reddit/privacy_protected/reddit_agg_stats_gen.csv')

In [13]:
df

Unnamed: 0,sentences,labels
0,"I'm so happy for [NAME], they really deserve t...",1
1,Had the worst experience at the restaurant yes...,-1
2,Can't believe how awesome the concert was last...,1
3,"[NAME] really messed up this time, can't trust...",-1
4,"Just finished the book you recommended, it was...",1
...,...,...
495,Can't believe how bad the food was at that pla...,-1
496,"Got to see my favorite band live, still buzzing!",1
497,"I've had enough of the service there, it's alw...",-1
498,"Just finished the series and OMG, loved every ...",1


In [14]:
gen_df = df

In [99]:
orig_df = pd.read_csv('../data/initial_datasets/reddit_binary_labels.csv')
orig_df = orig_df.sample(n=1000)
orig_df.reset_index(drop=True, inplace=True)
orig_df.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

# **Classification**

## **Preparing Only Synthetic Model**

In [18]:
gen_df

Unnamed: 0,sentences,labels
0,"I'm so happy for [NAME], they really deserve t...",1
1,Had the worst experience at the restaurant yes...,-1
2,Can't believe how awesome the concert was last...,1
3,"[NAME] really messed up this time, can't trust...",-1
4,"Just finished the book you recommended, it was...",1
...,...,...
495,Can't believe how bad the food was at that pla...,-1
496,"Got to see my favorite band live, still buzzing!",1
497,"I've had enough of the service there, it's alw...",-1
498,"Just finished the series and OMG, loved every ...",1


In [23]:
orig_df

Unnamed: 0,text,label
0,"Uhhhh, existed... Ever been to Arkansas? It’s ...",-1
1,Honestly rams are my second NFC team. I wanted...,1
2,Ah. So cardboard. Haha,1
3,Oh to find someone who loves you enough to put...,1
4,I can’t wait,-1
...,...,...
995,"You have too many knives, I don't trust you. S...",-1
996,Be prepared for many old men to hit on you,-1
997,Stay strong.,1
998,"You know, girls really like it when you make t...",1


In [24]:
test_df = orig_df.sample(n=300)
train_df = gen_df.sample(n=700)

X_train = gen_df['sentences']
X_test = orig_df['text']

y_train = gen_df['labels']
y_test = orig_df['label']

In [28]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

## **Preparing Synthetic+100 Samples Model**

In [45]:
orig_df

Unnamed: 0,text,label
0,"Uhhhh, existed... Ever been to Arkansas? It’s ...",-1
1,Honestly rams are my second NFC team. I wanted...,1
2,Ah. So cardboard. Haha,1
3,Oh to find someone who loves you enough to put...,1
4,I can’t wait,-1
...,...,...
995,"You have too many knives, I don't trust you. S...",-1
996,Be prepared for many old men to hit on you,-1
997,Stay strong.,1
998,"You know, girls really like it when you make t...",1


In [46]:
gen_df

Unnamed: 0,sentences,labels
0,"I'm so happy for [NAME], they really deserve t...",1
1,Had the worst experience at the restaurant yes...,-1
2,Can't believe how awesome the concert was last...,1
3,"[NAME] really messed up this time, can't trust...",-1
4,"Just finished the book you recommended, it was...",1
...,...,...
495,Can't believe how bad the food was at that pla...,-1
496,"Got to see my favorite band live, still buzzing!",1
497,"I've had enough of the service there, it's alw...",-1
498,"Just finished the series and OMG, loved every ...",1


In [57]:
orig_sample = orig_df.sample(n=400)
orig_train = orig_sample.sample(n=100)
orig_test = orig_sample.drop(orig_train.index)

In [58]:
orig_train.reset_index(drop=True, inplace=True)
orig_test.reset_index(drop=True, inplace=True)

In [68]:
gen_train

Unnamed: 0,sentences,labels
0,Finally got to try that recipe and it turned o...,1
1,Had a terrible experience with the customer se...,-1
2,Why do they always mess up my order? Frustrati...,-1
3,"Ugh, my internet is down again. So frustrating!",-1
4,"Just finished the book, and I love it! Highly ...",1
...,...,...
595,"Not going back to that restaurant, food was aw...",-1
596,"This was really disappointing, expected more tbh",-1
597,I had the best time at the concert last night! 🎶,1
598,Had the best pizza ever last night! 🍕,1


In [69]:
gen_train = gen_df.sample(n=600).reset_index(drop=True)


In [75]:
orig_train.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

In [76]:
orig_test.rename(columns={'text': 'sentences', 'label': 'labels'}, inplace=True)

In [79]:
train_df = pd.concat([gen_train, orig_train], axis=0)
test_df = orig_test

In [81]:
X_train = train_df['sentences']
X_test = test_df['sentences']

y_train = train_df['labels']
y_test = test_df['labels']

In [84]:
X_train = np.array(sentence_transformer.encode(X_train.to_list()))
X_test = np.array(sentence_transformer.encode(X_test.to_list()))

## **Model Training**

In [93]:
model = svm.SVC(C=1, degree=2, gamma='scale', kernel='linear', probability=True)

In [94]:
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.9857142857142858
Test acc: 0.7266666666666667


In [95]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8451612903225807
0.6931216931216931


In [96]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.8033748033748034


In [89]:
param_grid = {
    'C': [10**(i) for i in range(-2, 3)],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
}

In [90]:
model = svm.SVC()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.9557142857142857
{'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [92]:
print(accuracy_score(grid_search.best_estimator_.predict(X_test), y_test))

0.7266666666666667


# **MMD Comparison**

In [97]:
gen_df

Unnamed: 0,sentences,labels
0,"I'm so happy for [NAME], they really deserve t...",1
1,Had the worst experience at the restaurant yes...,-1
2,Can't believe how awesome the concert was last...,1
3,"[NAME] really messed up this time, can't trust...",-1
4,"Just finished the book you recommended, it was...",1
...,...,...
495,Can't believe how bad the food was at that pla...,-1
496,"Got to see my favorite band live, still buzzing!",1
497,"I've had enough of the service there, it's alw...",-1
498,"Just finished the series and OMG, loved every ...",1


In [100]:
orig_df

Unnamed: 0,sentences,labels
0,That's really sad. I used to live not far from...,-1
1,Bless your heart.,1
2,I actually don’t mind Sudbury to Nipigon but w...,-1
3,Pfft. That's awful of him and her. They deserv...,-1
4,Came here to say this,1
...,...,...
995,That sounds really cool! I'll be there,1
996,I see you're a man of culture as well. But rea...,1
997,So bad its funny,1
998,"Lol I don’t disagree, but the perimeter defens...",1


In [106]:
gen_embeddings = np.array(sentence_transformer.encode(gen_df['sentences']))
orig_embeddings = np.array(sentence_transformer.encode(orig_df['sentences']))

In [108]:
def report_MMD(X, Y, normalize=False):
    
    tensorX = torch.tensor(X)
    tensorY = torch.tensor(Y)
    rbf_mmd = MMD(tensorX, tensorY, "rbf")
    scale_mmd = MMD(tensorX, tensorY, "multiscale")


    if normalize:
        return (rbf_mmd.item() / np.sqrt((1.0 / X.shape[0]) + (1.0 / Y.shape[0]))), (scale_mmd.item() / np.sqrt(1.0 / X.shape[0] + 1.0 / Y.shape[0]))
    else:
        return rbf_mmd.item(), scale_mmd.item()

In [110]:
report_MMD(gen_embeddings, orig_embeddings, normalize=True)

(0.5528775827278929, 2.071449005111452)