# Detecting Fraudulent SMS Messages with Federated Learning

## Installing Dependencies

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install sklearn
!pip install huggingface_hub
!pip install -U imbalanced-learn
!pip install numpy requests nlpaug
!pip install nltk>=3.4.5
!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece
!pip install flwr[simulation]

## Imports

In [None]:
import torch
import evaluate
import random
import os

import pyarrow as pa
import pandas as pd
import numpy as np
import flwr as fl

from collections import OrderedDict
from datasets import load_dataset, Dataset, DatasetDict
from math import floor
from scipy import sparse
from huggingface_hub import login

from google.colab import drive

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, DataCollatorForTokenClassification, DefaultDataCollator

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import PassiveAggressiveClassifier

from torch.utils.data import DataLoader

In [None]:
# Mount Google Drive at /content/drive; later cells save and load model weights under it.
from google.colab import drive
drive.mount('/content/drive')

## Downloading Dataset

In [None]:
# Downloading original public sms_spam dataset from huggingface and splitting.
# The split sms_spam dataset is what I uploaded as sms_spam in my private hub

# dataset = load_dataset("sms_spam")
# dataset = dataset["train"]
# dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label")

In [None]:
# Delete everything under /root/.cache/ (presumably to force a fresh dataset download below).
# NOTE(review): this wipes every cache in that directory, not only the dataset cache — confirm that is intended.
!rm -rf /root/.cache/

In [None]:
# Read the Hugging Face access token from the HF_TOKEN environment variable, or
# prompt for it, instead of committing it to the notebook. A token that has been
# stored in plain text in a shared notebook should be revoked and regenerated.
from getpass import getpass

access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Default: the original (imbalanced) train/test split. Uncomment one of the
# alternatives below to load a pre-balanced variant instead.
dataset = load_dataset("jyoung2247/sms_spam", use_auth_token=access_token)
# dataset = load_dataset("jyoung2247/sms_spam_augmented_synonyms", use_auth_token=access_token)
# #dataset = load_dataset("jyoung2247/sms_spam_augmented_context_insert", use_auth_token=access_token)
# dataset = load_dataset("jyoung2247/sms_spam_augmented_context_substitute", use_auth_token=access_token)
# #dataset = load_dataset("jyoung2247/sms_spam_undersampled_CNN", use_auth_token=access_token)
# dataset = load_dataset("jyoung2247/sms_spam_undersampled_random", use_auth_token=access_token)
# #dataset.push_to_hub("jyoung2247/sms_spam", private=True)
# dataset = load_dataset("jyoung2247/sms_spam_oversampled_SMOTE", use_auth_token=access_token)
# dataset

## Balancing Dataset

### Naive Random Oversampling

In [None]:

# Rebalance the train split with imbalanced-learn's RandomOverSampler.

# The sampler is fed a 2-D feature matrix, so the messages become a single column.
y_train = dataset['train']['label']
x_train = np.array(dataset['train']['sms'])[:, np.newaxis]

# Resample with a fixed seed, then flatten the message column back to 1-D.
ros = RandomOverSampler(random_state=0)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)
x_train_resampled = x_train_resampled.ravel()

# Build a Hugging Face dataset from the resampled columns and encode 'label'
# as a class column whose names are ham / spam.
dataset_train = Dataset.from_dict({'sms': x_train_resampled, 'label': y_train_resampled})
dataset_train = dataset_train.class_encode_column('label')
dataset_train.features['label'].names = ['ham', 'spam']

# Pair the resampled train split with the untouched test split; the resulting
# DatasetDict can be pushed to the huggingface hub.
resampled_dataset = DatasetDict({'test': dataset['test'], 'train': dataset_train})

### Naive Random Undersampling

In [None]:
#Create numpy arrays from dataset sms messages as x and labels as y.
#The messages are reshaped into a single-column 2-D array before being passed to the sampler.
x_train = dataset['train']['sms']
y_train = dataset['train']['label']
x_train = np.array(x_train)
x_train = x_train[:, np.newaxis]


#Create random UNDER sampler from imbalanced-learn (the variable is still called `ros`)

ros = RandomUnderSampler(random_state=0)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)
x_train_resampled = x_train_resampled.ravel()

#Create a dictionary of resampled x and y and build a huggingface dataset from it
dataset_train = {'sms': x_train_resampled, 'label': y_train_resampled}
dataset_train = Dataset.from_dict(dataset_train)

#Encode the dataset_train label column as a class column
dataset_train = dataset_train.class_encode_column('label')

#Name the encoded label classes ham and spam
dataset_train.features['label'].names = ['ham', 'spam']

#Create a DatasetDict from the resampled_dataset, which can be pushed to the huggingface hub
resampled_dataset = DatasetDict()
resampled_dataset['test'] = dataset['test']
resampled_dataset['train'] = dataset_train

### Undersampled CNN (Condensed Nearest Neighbour)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every message to 512 tokens.

    Args:
        examples: batch mapping passed by `Dataset.map(..., batched=True)`; its
            "sms" entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch.
    """
    # An explicit max_length (the same value the other tokenizer calls in this
    # notebook use) guarantees fixed-length rows, which the undersampling step
    # needs to build a rectangular numpy matrix from `input_ids`.
    # NOTE(review): without it the padded length depends on the tokenizer's
    # configured maximum, which may be unset for this checkpoint — confirm.
    return tokenizer(examples["sms"], padding="max_length", truncation=True, max_length=512)

tokenized_train = dataset['train'].map(tokenize_function, batched=True)



# Use the token-id rows as the feature matrix for the undersampler.
x_train = tokenized_train['input_ids']
y_train = tokenized_train['label']
x_train = np.array(x_train)

# Undersample with imbalanced-learn's CondensedNearestNeighbour (variable is still called `ros`).
ros = CondensedNearestNeighbour(n_neighbors=1)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)
# Turn the kept token-id rows back into text, dropping special tokens.
# NOTE(review): decoded text may not be character-identical to the original messages — confirm.
x_train_resampled = tokenizer.batch_decode(x_train_resampled, skip_special_tokens=True)

# Build a huggingface dataset from the resampled messages and labels.
dataset_train = {'sms': x_train_resampled, 'label': y_train_resampled}
dataset_train = Dataset.from_dict(dataset_train)

print(dataset_train)

# Encode the label column as a class column and name the classes ham / spam.
dataset_train = dataset_train.class_encode_column('label')

dataset_train.features['label'].names = ['ham', 'spam']

# Pair the undersampled train split with the untouched test split.
resampled_dataset = DatasetDict()
resampled_dataset['test'] = dataset['test']
resampled_dataset['train'] = dataset_train

In [None]:
#resampled_dataset.push_to_hub("jyoung2247/sms_spam_undersampled_CNN", private=True)

### Augmented Synonyms

In [None]:
# Substitute word by WordNet's synonym

# Select the spam messages of the train split here, so this cell does not rely
# on a variable that is only created further down the notebook.
train_spam_sms = [sms for sms, label in zip(dataset['train']['sms'], dataset['train']['label']) if label]

augmented_text = train_spam_sms.copy()

augmented_train_sms = []

# The augmenter is configured identically on every pass, so build it once.
aug = naw.SynonymAug(aug_src='wordnet')

#Run 6 iterations of augmenting synonyms to balance the dataset.
#Each pass augments the previous pass's output, not the original messages.
for i in range(6):
  augmented_text = aug.augment(augmented_text)
  print(augmented_text)
  augmented_train_sms += augmented_text

#Keep only as many generated spam entries as are needed for spam to equal ham.
#(This replaces a hard-coded "drop the last 325", which is only correct for one particular split.)
num_ham = len(dataset['train']['sms']) - len(train_spam_sms)
num_needed = max(num_ham - len(train_spam_sms), 0)
augmented_train_sms = augmented_train_sms[:num_needed]
print("Length of augmented spam: ", len(augmented_train_sms))

In [None]:
#Print total spam amount and total ham amount (non-spam)
#Requires `augmented_train_sms` (generated spam) and `train_spam_sms` (original train spam) to exist already.
total_spam_count = len(augmented_train_sms) + len(train_spam_sms)
# Ham = every train message that is not original spam.
total_ham_count = len(dataset['train']['sms']) - len(train_spam_sms)
total_sms_count = total_spam_count + total_ham_count
print("Total spam: ", total_spam_count)
print("Total ham: ", total_ham_count)
print("Total sms: ", total_sms_count)

In [None]:
# Set x_train equal to the sms messages in the dataset train split
x_train = dataset['train']['sms']
# Set y_train equal to the labels in the dataset train split
y_train = dataset['train']['label']

# Set augmented x_train equal to x_train + the generated spam
augmented_x_train = x_train + augmented_train_sms

# Set augmented y_train equal to y_train plus a list of 1's of equal size to the generated spam
augmented_y_train = y_train + [1] * (len(augmented_x_train) - len(y_train))

#Combine the augmented x_train and y_train into one two-column array so rows are shuffled together
combined_arr = np.array(augmented_x_train)
combined_arr = combined_arr[:, np.newaxis]
y_arr = np.array(augmented_y_train)
y_arr = y_arr[:, np.newaxis]

# NOTE(review): appending the integer label column to a string column should yield
# an all-string array, so labels come back out as strings — confirm.
combined_arr = np.append(combined_arr, y_arr, axis=1)

# Shuffles rows in place using numpy's global RNG; no seed is set in this cell.
np.random.shuffle(combined_arr)

#Separate the augmented x_train and y_train again after shuffling
augmented_x_train = combined_arr[:, 0]
augmented_y_train = combined_arr[:, 1]

#Create a dictionary of augmented x and y
dataset_train = {'sms': augmented_x_train, 'label': augmented_y_train}

#Create a huggingface dataset from the dictionary
dataset_train = Dataset.from_dict(dataset_train)

#Encode the dataset_train label column features as ham and spam
dataset_train = dataset_train.class_encode_column('label')

dataset_train.features['label'].names = ['ham', 'spam']

#Create a DatasetDict from the augmented train split and the untouched test split, which can be pushed to the huggingface hub
augmented_dataset = DatasetDict()
augmented_dataset['test'] = dataset['test']
augmented_dataset['train'] = dataset_train


### Augmented Context Substitute

In [None]:
# Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)

# Select the spam messages of the train split here, so this cell does not rely
# on a variable that is only created further down the notebook.
train_spam_sms = [sms for sms, label in zip(dataset['train']['sms'], dataset['train']['label']) if label]

augmented_text = train_spam_sms.copy()

augmented_train_sms = []

# Fall back to the CPU when no GPU is attached instead of failing on a hard-coded "cuda".
aug_device = "cuda" if torch.cuda.is_available() else "cpu"

# The augmenter is configured identically on every pass, so build (and load the model) once.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="substitute", device=aug_device)

#Run 6 iterations of contextual substitution to balance the dataset.
#Each pass augments the previous pass's output, not the original messages.
for i in range(6):
  augmented_text = aug.augment(augmented_text)
  print(augmented_text)
  augmented_train_sms += augmented_text

#Keep only as many generated spam entries as are needed for spam to equal ham.
#(The trimming step was described here but never applied, leaving extra spam in the set.)
num_ham = len(dataset['train']['sms']) - len(train_spam_sms)
num_needed = max(num_ham - len(train_spam_sms), 0)
augmented_train_sms = augmented_train_sms[:num_needed]
print("Length of augmented spam: ", len(augmented_train_sms))

In [None]:
#Print total spam amount and total ham amount (non-spam)
#Requires `augmented_train_sms` (generated spam) and `train_spam_sms` (original train spam) to exist already.
total_spam_count = len(augmented_train_sms) + len(train_spam_sms)
# Ham = every train message that is not original spam.
total_ham_count = len(dataset['train']['sms']) - len(train_spam_sms)
total_sms_count = total_spam_count + total_ham_count
print("Total spam: ", total_spam_count)
print("Total ham: ", total_ham_count)
print("Total sms: ", total_sms_count)

In [None]:
# Set x_train equal to the sms messages in the dataset train split
x_train = dataset['train']['sms']
# Set y_train equal to the labels in the dataset train split
y_train = dataset['train']['label']

# Set augmented x_train equal to x_train + the generated spam
augmented_x_train = x_train + augmented_train_sms

# Set augmented y_train equal to y_train plus a list of 1's of equal size to the generated spam
augmented_y_train = y_train + [1] * (len(augmented_x_train) - len(y_train))

#Combine the augmented x_train and y_train into one two-column array so rows are shuffled together
combined_arr = np.array(augmented_x_train)
combined_arr = combined_arr[:, np.newaxis]
y_arr = np.array(augmented_y_train)
y_arr = y_arr[:, np.newaxis]

# NOTE(review): appending the integer label column to a string column should yield
# an all-string array, so labels come back out as strings — confirm.
combined_arr = np.append(combined_arr, y_arr, axis=1)

# Shuffles rows in place using numpy's global RNG; no seed is set in this cell.
np.random.shuffle(combined_arr)

#Separate the augmented x_train and y_train again after shuffling
augmented_x_train = combined_arr[:, 0]
augmented_y_train = combined_arr[:, 1]

#Create a dictionary of augmented x and y
dataset_train = {'sms': augmented_x_train, 'label': augmented_y_train}

#Create a huggingface dataset from the dictionary
dataset_train = Dataset.from_dict(dataset_train)

#Encode the dataset_train label column features as ham and spam
dataset_train = dataset_train.class_encode_column('label')

dataset_train.features['label'].names = ['ham', 'spam']

#Create a DatasetDict from the augmented train split and the untouched test split, which can be pushed to the huggingface hub
augmented_dataset = DatasetDict()
augmented_dataset['test'] = dataset['test']
augmented_dataset['train'] = dataset_train

### Augmented Context Insert

In [None]:
# Insert word by contextual word embeddings (BERT, DistilBERT, RoBERTA or XLNet)

# Select the spam messages of the train split here, so this cell does not rely
# on a variable that is only created further down the notebook.
train_spam_sms = [sms for sms, label in zip(dataset['train']['sms'], dataset['train']['label']) if label]

augmented_text = train_spam_sms.copy()

augmented_train_sms = []

# Fall back to the CPU when no GPU is attached instead of failing on a hard-coded "cuda".
aug_device = "cuda" if torch.cuda.is_available() else "cpu"

# The augmenter is configured identically on every pass, so build (and load the model) once.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert", device=aug_device)

#Run 6 iterations of contextual insertion to balance the dataset.
#Each pass augments the previous pass's output, not the original messages.
for i in range(6):
  augmented_text = aug.augment(augmented_text)
  print(augmented_text)
  augmented_train_sms += augmented_text

#Keep only as many generated spam entries as are needed for spam to equal ham.
#(The trimming step was described here but never applied, leaving extra spam in the set.)
num_ham = len(dataset['train']['sms']) - len(train_spam_sms)
num_needed = max(num_ham - len(train_spam_sms), 0)
augmented_train_sms = augmented_train_sms[:num_needed]
print("Length of augmented spam: ", len(augmented_train_sms))

In [None]:
#Print total spam amount and total ham amount (non-spam)
#Requires `augmented_train_sms` (generated spam) and `train_spam_sms` (original train spam) to exist already.
total_spam_count = len(augmented_train_sms) + len(train_spam_sms)
# Ham = every train message that is not original spam.
total_ham_count = len(dataset['train']['sms']) - len(train_spam_sms)
total_sms_count = total_spam_count + total_ham_count
print("Total spam: ", total_spam_count)
print("Total ham: ", total_ham_count)
print("Total sms: ", total_sms_count)

In [None]:
# Set x_train equal to the sms messages in the dataset train split
x_train = dataset['train']['sms']
# Set y_train equal to the labels in the dataset train split
y_train = dataset['train']['label']

# Set augmented x_train equal to x_train + the generated spam
augmented_x_train = x_train + augmented_train_sms

# Set augmented y_train equal to y_train plus a list of 1's of equal size to the generated spam
augmented_y_train = y_train + [1] * (len(augmented_x_train) - len(y_train))

#Combine the augmented x_train and y_train into one two-column array so rows are shuffled together
combined_arr = np.array(augmented_x_train)
combined_arr = combined_arr[:, np.newaxis]
y_arr = np.array(augmented_y_train)
y_arr = y_arr[:, np.newaxis]

# NOTE(review): appending the integer label column to a string column should yield
# an all-string array, so labels come back out as strings — confirm.
combined_arr = np.append(combined_arr, y_arr, axis=1)

# Shuffles rows in place using numpy's global RNG; no seed is set in this cell.
np.random.shuffle(combined_arr)

#Separate the augmented x_train and y_train again after shuffling
augmented_x_train = combined_arr[:, 0]
augmented_y_train = combined_arr[:, 1]

#Create a dictionary of augmented x and y
dataset_train = {'sms': augmented_x_train, 'label': augmented_y_train}

#Create a huggingface dataset from the dictionary
dataset_train = Dataset.from_dict(dataset_train)

#Encode the dataset_train label column features as ham and spam
dataset_train = dataset_train.class_encode_column('label')

dataset_train.features['label'].names = ['ham', 'spam']

#Create a DatasetDict from the augmented train split and the untouched test split, which can be pushed to the huggingface hub
augmented_dataset = DatasetDict()
augmented_dataset['test'] = dataset['test']
augmented_dataset['train'] = dataset_train

### Oversampled SMOTE

In [None]:
# The textTransformer pipeline vectorizes text in two steps: binary token
# counts (CountVectorizer) followed by TF-IDF weighting (TfidfTransformer).

textTransformer = Pipeline([
  ('vect', CountVectorizer(binary=True)),
  ('tfidf', TfidfTransformer(use_idf=True))
])

# Sets up the SMOTE oversampler (minority class, 5 neighbours, fixed seed) and
# the classifier, which is a PassiveAggressiveClassifier rather than an SVC.

sm = SMOTE(sampling_strategy='minority', k_neighbors = 5, random_state = 0)

clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, warm_start=True)

# Transforms the String sms column of a pandas dataframe into a normalized, vectorized
# dense matrix representation

def transform(dataframe, fit=False):
  """Vectorize the 'sms' column of `dataframe` with the shared `textTransformer`.

  Args:
    dataframe: pandas DataFrame with an 'sms' column and a 'label' column.
    fit: when True the pipeline is fitted on this data before transforming it;
      when False the already-fitted pipeline is reused.

  Returns:
    A new DataFrame whose first column is 'label' and whose remaining columns
    are the dense vectorized features, one row per input row in the same order.
  """
  if fit:
    transformed = textTransformer.fit_transform(dataframe['sms'], dataframe['label'])
  else:
    transformed = textTransformer.transform(dataframe['sms'])
  toRet = pd.DataFrame(data=transformed.toarray())
  # Insert the labels by position. Inserting the Series itself aligns on index,
  # which fills labels with NaN whenever `dataframe` does not carry a default
  # 0..n-1 index (for example after filtering or shuffling).
  toRet.insert(0, 'label', dataframe['label'].to_numpy())
  return toRet

# Takes in a dataframe containing a dense matrix of vectorized SMS and their labels
# and oversamples the SPAM data using SMOTE

def oversample(dataframe):
  """Resample `dataframe` with the module-level SMOTE instance `sm`.

  Every column except 'label' is treated as a feature. Returns a new dense
  DataFrame with the resampled labels in the first column ('label') followed
  by the resampled feature columns.
  """
  feature_columns = dataframe.columns != "label"
  features = sparse.csr_matrix(dataframe.loc[:, feature_columns])
  X_res, y_res = sm.fit_resample(features, dataframe['label'])
  resampled = pd.DataFrame(data=sparse.csr_matrix.todense(X_res))
  resampled.insert(0, 'label', y_res)
  return resampled

# Takes in a dataframe containing a dense matrix of vectorized SMS and their labels
# and fits the SVC model onto the SMS and labels

def fitModel(model, dataframe):
  """Fit `model` on all non-label columns of `dataframe` (as a sparse matrix), with 'label' as target."""
  is_feature = dataframe.columns != "label"
  features = sparse.csr_matrix(dataframe.loc[:, is_feature])
  targets = dataframe['label']
  model.fit(features, targets)

def fitModelFed(model, dataframe):
  """Fit `model` on all non-label columns of `dataframe` (as a sparse matrix), with 'label' as target.

  Currently performs exactly the same steps as `fitModel`.
  """
  is_feature = dataframe.columns != "label"
  features = sparse.csr_matrix(dataframe.loc[:, is_feature])
  targets = dataframe['label']
  model.fit(features, targets)

# Takes in a dataframe and returns non-label data

def splitLabel(dataframe):
  """Return `dataframe` without its 'label' column (all rows, every other column)."""
  is_feature = dataframe.columns != "label"
  return dataframe.loc[:, is_feature]

In [None]:
# load_dataset allows us to load our pre-split dataset from huggingface

# Read the Hugging Face access token from the HF_TOKEN environment variable, or
# prompt for it, instead of committing it to the notebook. A token that has been
# stored in plain text in a shared notebook should be revoked and regenerated.
from getpass import getpass

access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
orig_ds = load_dataset("jyoung2247/sms_spam", use_auth_token=access_token)

# Sets up Jacob's mini-personal dataset: six hand-written messages and their
# labels (1 for the messages treated as spam, 0 otherwise), in the same
# 'sms' / 'label' column layout that transform() reads.

test_data2 = ["PayPal: 939618 is your security code. Don't share your code.", "Hello friend I am a prince", "Mom is calling you down to dinner", "My friend make you rich!", "hi", "asdasdxcadsadx"]
test_labels = [0, 1, 0, 1, 0, 1]

jacobDF = pd.DataFrame({'sms': test_data2, 'label': test_labels})

SMS_spam = ["FRM:www.GiftComps.store (Voucher#UFQKPX625) MSG:CVS Give-Away! $100 After 5min Survey.  (Voucher#UFQKPX625)",
            "Fr⁬⁮om: ͏⁫Am⁬a⁯z⁯on Ass⁫ista‍nce Message ID: 9086882 M⁭s‍g: We ha⁯ve dete‍cted a prob⁭lem wi⁬th y‌our acco⁪unt inform⁯ation. Ple⁮a͏se ve⁬ri⁭fy y⁫our info⁫r⁪mation cor⁭re⁭ctly. T͏h⁬is i⁪⁪s sim⁮⁭ple st‍e‍p to rec‌o⁮v⁬er yo⁭⁫ur acc‍⁬ount: 1. Si‌g⁪n in to yo‌u⁯r acc‌⁯ount. 2. Fol⁫͏low th͏e st‍e͏ps to s⁭ee y͏ou⁭r ca⁯‍se. 3. Reso⁬⁫lve t⁯he prob‌͏lem by comple⁯‌ting t‌h‍e in⁪st⁪r⁭uc⁬tion. Up⁪⁬date Infor⁫m‌ation H⁫e͏re : https://iuczbpe.shop/oRQZoUL Yo⁫⁮u ca‌n͏'t ac⁯ce‌ss yo⁪u⁭r acco͏u⁫nt un͏t⁬il t‍his proc⁫⁭ess com⁮p⁬lete.",
            "FRM:www.OprahGives.store (sms_ID=DYOWAPR453) MSG:Oprah's Biggest Give-Away Ever!  (sms_ID=DYOWAPR453)",
            "Catch loan quote fundsjoy.us..$ \"*^^\"",
            "FRM:www.ClaimRefund.info (sms_ID=IIBAONM948) MSG:Netflix is Reimbursing  $75.  Time Left to Claim (36) hr.  (sms_ID=IIBAONM948)",
            "FRM:www.Oprahs.diet|       (Voucher#DIHF514) MSG:We're Blasting 0prahs Brand New Essential Sweet Dietary Fruit Candy Give-Away  (Voucher#DIHF514)",
            "JPMORG-BANKING| Online access was limited to unusal activity. Please verify now at:Https://secu34jpmorg.com/?verify",
            "Hey Detimmeyon. After review fromCity, you have up to 3175 ready. yp0ydv.com/4c3fb3050d",
            "Cash Received : A payment is set to deposit in the next hour! Text with our special pin of : 0398",
            "With Autoinsurancemate check full coverage available at 28/month on switch.autoinsurancemate.com Reply STOP to opt out",
            "grlbooknow24.me %*&85",
            "(Alert) Antiviru's Is Not Act!v@ted!: kejh.info/91uRmU"]

SMS_verifications = ["Account: 743902 is your Samsung account verification code.", "Your publix.com verification code is: 4871", "196558 is your Yahoo verification code",
                     "Your Discord verification code is: 894635", "Your DoorDash verification code is 589421. Do not share this with anyone. We will never contact you to request this code", "PayPal: 539618 is your security code. Don't share your code"]

# One label per message, derived from the list lengths so the labels cannot
# drift out of sync when messages are added to or removed from the lists.
SMS_spam_true_labels = [1] * len(SMS_spam)

SMS_verifications_true_labels = [0] * len(SMS_verifications)

# Spam first, then verification messages.
jacob_msg_all = SMS_spam + SMS_verifications

In [None]:
# The fitted classifier requires the same number of features at predict time,
# so the textTransformer is fit_transform-ed on the training data first and the
# test data is then only transformed with that fitted pipeline.

train_data = transform(pd.DataFrame.from_dict(orig_ds['train']), fit=True)
test_data = transform(pd.DataFrame.from_dict(orig_ds['test']), fit=False)

# Vectorize the real-world messages with the same fitted pipeline; their labels
# are listed in the same order (spam first, then verification messages).
jacob_msg = textTransformer.transform(jacob_msg_all)
jacob_msg_labels = SMS_spam_true_labels + SMS_verifications_true_labels

# After transforming all the data, we use SMOTE to oversample the SPAM in the training
# data before fitting our PassiveAggressiveClassifier onto it

oversampled = oversample(train_data)

# # Converts the newly transformed and oversampling training and testing data into
# # a DatasetDict to save in huggingface

# dataToSendTrain = Dataset.from_pandas(oversampled)
# dataToSendTest = Dataset.from_pandas(test)

# resampled_dataset_SMOTE = DatasetDict({'train': dataToSendTrain, 'test':dataToSendTest})

# # Option for CSV

# test.to_csv('SMOTE_Testing.csv', index=False)

# resampled_dataset_SMOTE.push_to_hub("jyoung2247/sms_spam_oversampled_SMOTE", private=True)

## Centralized Model

### Centralized Model setup

In [None]:
#Check count of spam in dataset
test_labels = dataset['test']['label']
print("Total test size", len(test_labels))
# argwhere returns the positions of the non-zero labels, i.e. the spam rows.
test_spam_inds = np.argwhere(test_labels)
print("Count of spam in test set", test_spam_inds.shape[0])

train_labels = dataset['train']['label']
print("Total train size", len(train_labels))
train_spam_inds = np.argwhere(train_labels)
print("Count of spam in train set", train_spam_inds.shape[0])

# Collect the spam messages of the train split as a plain Python list.
# The data-augmentation cells read `train_spam_sms`.
train_spam_sms = dataset['train']['sms']
train_spam_sms = np.array(train_spam_sms)
train_spam_sms = train_spam_sms[train_spam_inds]
print(train_spam_sms.shape)
train_spam_sms = train_spam_sms.ravel()
train_spam_sms = train_spam_sms.tolist()

In [None]:
#Tokenize input for use with bert-tiny

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")


def tokenize_function(examples):
    """Tokenize the "sms" entries of a batch, padding/truncating each to 512 tokens."""
    return tokenizer(examples["sms"], padding="max_length", truncation=True, max_length=512)


# Apply the tokenizer to every split of the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
#Load metrics (F1, accuracy, precision, recall) from the `evaluate` library; used by compute_metrics

metric_f1 = evaluate.load("f1")
metric_acc = evaluate.load("accuracy")
metric_prec = evaluate.load("precision")
metric_rec = evaluate.load("recall")

In [None]:
#Function to compute metrics

def compute_metrics(eval_pred):
    """Compute F1, accuracy, precision and recall for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class of each row is
            the argmax over the last axis of the logits.

    Returns:
        A dict with keys "f1", "acc", "prec" and "rec"; each value is whatever
        the corresponding metric's ``compute`` call returns.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    shared = {"predictions": predictions, "references": labels}

    results = {}
    # NOTE(review): micro-averaged F1 on a single-label task equals accuracy;
    # confirm that "micro" (rather than the spam-class F1) is what is wanted here.
    results["f1"] = metric_f1.compute(average="micro", **shared)
    results["acc"] = metric_acc.compute(**shared)
    results["prec"] = metric_prec.compute(**shared)
    results["rec"] = metric_rec.compute(**shared)
    return results

In [None]:
#Set training arguments. Most are left as default: checkpoints/outputs go to
#"test_trainer", evaluation runs at the end of every epoch, training lasts 2 epochs.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=2)

### Train centralized model

In [None]:
#Use GPU if available and initialize pretrained model
#(bert-tiny with a 2-class sequence-classification head, moved to the chosen device)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2).to(device)

In [None]:
#Initialize huggingface trainer: trains on the tokenized train split, evaluates
#on the tokenized test split, and reports the metrics from compute_metrics

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_datasets['train'],
   eval_dataset=tokenized_datasets['test'],
   compute_metrics=compute_metrics,
)

In [None]:
#Train the model with the settings in training_args (2 epochs, evaluation after each epoch)
trainer.train()

In [None]:
#Mount google drive to save model or access model weights
#(force_remount=True re-mounts even if the drive was already mounted earlier in the notebook)
drive.mount('/content/drive/', force_remount=True)

In [None]:
#Save model weights to Google Drive
# NOTE(review): this writes to ".../model_base2" while the load cell reads ".../model_base" — confirm which directory is intended.
trainer.save_model("/content/drive/MyDrive/CS6220/model_base2")

### Load saved centralized model

In [None]:
#Load model with saved weights from Google Drive (replaces the `model` trained above)
# NOTE(review): this reads ".../model_base" while the save cell writes ".../model_base2" — confirm which directory is intended.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/CS6220/model_base")

In [None]:
#Create trainer object around the loaded model, with the same arguments,
#datasets and metrics as the training trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

### Evaluate centralized model

In [None]:
#Run to Evaluate model on the trainer's eval_dataset (the tokenized test split)

trainer.evaluate()

### Predict a single SMS label with model

In [None]:
SMS_to_predict = "testSMS"

# Tokenize the message the same way as the training data and move the tensors
# to the device the model lives on.
encoding = tokenizer(SMS_to_predict, padding="max_length", truncation=True, return_tensors='pt', max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# Inference only: disable gradient tracking so no autograd graph is kept in memory.
with torch.no_grad():
    outputs = trainer.model(**encoding)
logits = outputs.logits

# Softmax over the class axis (the last one), then drop the batch axis of the
# single message. Tying the axis to the classes keeps this correct regardless
# of how the logits are squeezed.
softmax = torch.nn.Softmax(dim=-1)
probs = softmax(logits).squeeze(0)

# Highest class probability and the index of that class.
predictionProb, predictionIndex = torch.max(probs, dim=0)

print(predictionProb)
print(predictionIndex)

#get label from prediction
labels = dataset["train"].features["label"].names
print(labels[predictionIndex.item()])
print(predictionProb.item())

### Generate confusion matrix on dataset test data

In [None]:
#Select part of the dataset to predict and evaluate accuracy. GPU memory is limited so we can't predict the entire test set at once

test_data = dataset["test"]
# Note: despite its name, `y_test` holds the message texts; `y_true` holds their labels.
y_test = test_data["sms"][0:1000]
y_true = test_data["label"][0:1000]
encoding = tokenizer(y_test, padding="max_length", truncation=True, return_tensors='pt', max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# Inference only: without no_grad() the forward pass keeps the whole autograd
# graph for every message in the batch, which wastes the limited GPU memory.
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
# Pull the raw class scores out of the model output and display their shape.
logits = outputs.logits
logits.shape

In [None]:
# Normalise over the class axis (dim=1) so that every row is one message's
# class distribution. Normalising over dim=0 instead would rescale each class
# column across the whole batch, which changes which class wins for a row.
# squeeze() is not applied: it would drop the batch axis for a batch of one.
softmax = torch.nn.Softmax(dim=1)
probs = softmax(logits)
predictionProbs, predictionIndicies = torch.max(probs, dim=1)
y_pred = predictionIndicies.tolist()

# Rows are true labels, columns are predictions, in the order [0, 1].
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

In [None]:
# Plot the confusion matrix, showing label 0 as 'Legitimate' and label 1 as 'Spam'.
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Legitimate', 'Spam']).plot()

### Generate confusion matrix on real world spam

In [None]:
#Evaluate model's performance on real world spam and verification text messages

SMS_spam = ["FRM:www.GiftComps.store (Voucher#UFQKPX625) MSG:CVS Give-Away! $100 After 5min Survey.  (Voucher#UFQKPX625)",
            "Fr⁬⁮om: ͏⁫Am⁬a⁯z⁯on Ass⁫ista‍nce Message ID: 9086882 M⁭s‍g: We ha⁯ve dete‍cted a prob⁭lem wi⁬th y‌our acco⁪unt inform⁯ation. Ple⁮a͏se ve⁬ri⁭fy y⁫our info⁫r⁪mation cor⁭re⁭ctly. T͏h⁬is i⁪⁪s sim⁮⁭ple st‍e‍p to rec‌o⁮v⁬er yo⁭⁫ur acc‍⁬ount: 1. Si‌g⁪n in to yo‌u⁯r acc‌⁯ount. 2. Fol⁫͏low th͏e st‍e͏ps to s⁭ee y͏ou⁭r ca⁯‍se. 3. Reso⁬⁫lve t⁯he prob‌͏lem by comple⁯‌ting t‌h‍e in⁪st⁪r⁭uc⁬tion. Up⁪⁬date Infor⁫m‌ation H⁫e͏re : https://iuczbpe.shop/oRQZoUL Yo⁫⁮u ca‌n͏'t ac⁯ce‌ss yo⁪u⁭r acco͏u⁫nt un͏t⁬il t‍his proc⁫⁭ess com⁮p⁬lete.",
            "FRM:www.OprahGives.store (sms_ID=DYOWAPR453) MSG:Oprah's Biggest Give-Away Ever!  (sms_ID=DYOWAPR453)",
            "Catch loan quote fundsjoy.us..$ \"*^^\"",
            "FRM:www.ClaimRefund.info (sms_ID=IIBAONM948) MSG:Netflix is Reimbursing  $75.  Time Left to Claim (36) hr.  (sms_ID=IIBAONM948)",
            "FRM:www.Oprahs.diet|       (Voucher#DIHF514) MSG:We're Blasting 0prahs Brand New Essential Sweet Dietary Fruit Candy Give-Away  (Voucher#DIHF514)",
            "JPMORG-BANKING| Online access was limited to unusal activity. Please verify now at:Https://secu34jpmorg.com/?verify",
            "Hey Detimmeyon. After review fromCity, you have up to 3175 ready. yp0ydv.com/4c3fb3050d",
            "Cash Received : A payment is set to deposit in the next hour! Text with our special pin of : 0398",
            "With Autoinsurancemate check full coverage available at 28/month on switch.autoinsurancemate.com Reply STOP to opt out",
            "grlbooknow24.me %*&85",
            "(Alert) Antiviru's Is Not Act!v@ted!: kejh.info/91uRmU"]


# One spam label per message; derived from the list so the two cannot drift apart.
SMS_spam_true_labels = [1] * len(SMS_spam)

encoding = tokenizer(SMS_spam, padding="max_length", truncation=True, return_tensors='pt', max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# Inference only: disable gradient tracking so no autograd graph is kept in memory.
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
# Pull the raw class scores out of the model output; the true labels are all spam.
logits = outputs.logits
logits.shape
y_true = SMS_spam_true_labels

In [None]:
# Normalise over the class axis (dim=1) so that every row is one message's
# class distribution. Normalising over dim=0 instead would rescale each class
# column across the whole batch, which changes which class wins for a row.
# squeeze() is not applied: it would drop the batch axis for a batch of one.
softmax = torch.nn.Softmax(dim=1)
probs = softmax(logits)
predictionProbs, predictionIndicies = torch.max(probs, dim=1)
y_pred = predictionIndicies.tolist()

# Rows are true labels, columns are predictions, in the order [0, 1].
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

In [None]:
# Plot the confusion matrix, showing label 0 as 'Legitimate' and label 1 as 'Spam'.
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Legitimate', 'Spam']).plot()

### Generate confusion matrix on real world verification sms messages

In [None]:
#Evaluate model's performance on real world verification text messages


SMS_verifications = ["Account: 743902 is your Samsung account verification code.", "Your publix.com verification code is: 4871", "196558 is your Yahoo verification code",
                     "Your Discord verification code is: 894635", "Your DoorDash verification code is 589421. Do not share this with anyone. We will never contact you to request this code", "PayPal: 539618 is your security code. Don't share your code"]

# One legitimate (0) label per message; derived from the list so the two cannot drift apart.
SMS_verifications_true_labels = [0] * len(SMS_verifications)

# max_length=512 matches every other tokenizer call in this notebook and
# guarantees equal-length rows, which return_tensors='pt' needs to build one
# batch tensor from messages of different lengths.
encoding = tokenizer(SMS_verifications, padding="max_length", truncation=True, return_tensors='pt', max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# Inference only: disable gradient tracking so no autograd graph is kept in memory.
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
# Pull the raw class scores out of the model output; the true labels are all legitimate.
logits = outputs.logits
logits.shape
y_true = SMS_verifications_true_labels

In [None]:
# Name the softmax axis explicitly: relying on the implicit choice is
# deprecated, and dim=1 (the class axis) is what is wanted for (batch, classes)
# logits. squeeze() is not applied: it would drop the batch axis for a batch of one.
softmax = torch.nn.Softmax(dim=1)
probs = softmax(logits)
predictionProbs, predictionIndicies = torch.max(probs, dim=1)
y_pred = predictionIndicies.tolist()

# Rows are true labels, columns are predictions, in the order [0, 1].
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

In [None]:
# Plot the confusion matrix, showing label 0 as 'Legitimate' and label 1 as 'Spam'.
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Legitimate', 'Spam']).plot()

In [None]:
#Sources: https://huggingface.co/docs/transformers/training
#https://discuss.huggingface.co/t/how-to-save-my-model-to-use-it-later/20568
#https://colab.research.google.com/drive/1U7SX7jNYsNQG5BY1xEQQHu48Pn6Vgnyt?usp=sharing

### SMOTE Centralized Model

In [None]:
# Fit PassiveAggressiveClassifier onto the SMOTE-oversampled, vectorized training data

fitModel(clf, oversampled)

In [None]:

# `test` was never defined, and `test_data` is reassigned to the raw Hugging Face
# test split by the confusion-matrix cells above, so rebuild the vectorized
# test frame here with the already-fitted textTransformer.
test = transform(pd.DataFrame.from_dict(orig_ds['test']), fit=False)
test_features = splitLabel(test)

# Generate Confusion Matrix from testing data

predictions1 = clf.predict(test_features)
cm1 = confusion_matrix(test['label'], predictions1)
ConfusionMatrixDisplay(confusion_matrix=cm1, display_labels=['Legitimate', 'Spam']).plot()

# Generate score (mean accuracy) and macro-averaged precision / recall / F1 from testing data

scores = precision_recall_fscore_support(test['label'], predictions1, average='macro')
print(clf.score(test_features, test['label']))
print(scores)

In [None]:
# Transform jacob's dataset with the already-fitted textTransformer (fit=False)

jacob = transform(pd.DataFrame.from_dict(jacobDF), fit=False)

# Generate score from jacob's data
# (the value is not the last expression of the cell, so it is not displayed)

clf.score(splitLabel(jacob), jacob['label'])

# Generate Confusion Matrix from jacob's data


predictions2 = clf.predict(splitLabel(jacob))
cm2 = confusion_matrix(jacob['label'], predictions2)
ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=['Legitimate', 'Spam']).plot()

In [None]:
# jacobscores = precision_recall_fscore_support(jacob['label'], predictions2, average='macro')
# print(clf.score(splitLabel(jacob), jacob['label']))
# print(jacobscores)

# predictions3 = clf.predict(jacob_msg)
# cm3 = confusion_matrix(splitLabel(jacob), predictions2)
# ConfusionMatrixDisplay(confusion_matrix=cm3, display_labels=['Legitimate', 'Spam']).plot()

In [None]:
# jacobscores2 = precision_recall_fscore_support(jacob_msg_labels, predictions3, average='macro')
# print(clf.score(jacob_msg, jacob_msg_labels))
# print(jacobscores2)

# predictions3 = clf.predict(jacob_msg)
# cm3 = confusion_matrix(jacob_msg_labels, predictions3)
# ConfusionMatrixDisplay(confusion_matrix=cm3, display_labels=['Legitimate', 'Spam']).plot()


## Federated Learning Models

### Data Partitioning

In [None]:
# Names of the balanced dataset variants (augmented, undersampled, oversampled).
# NOTE(review): presumably these are the Hugging Face hub repository names used by the federated experiments — confirm against the code that reads this list.
dataset_names = [
  'sms_spam_augmented_synonyms',
  'sms_spam_augmented_context_insert',
  'sms_spam_augmented_context_substitute',
  'sms_spam_undersampled_CNN',
  'sms_spam_undersampled_random',
  'sms_spam_naive_oversampled',    
  'sms_spam_oversampled_SMOTE',
]

def get_splits(train_labels, percent_spam):
  """Compute how many legitimate and spam samples each client receives.

  The number of clients is ``len(percent_spam)`` (previously fixed at 5).

  Args:
    train_labels: sequence of labels; only its length is used, and the set is
      assumed to be balanced (half of it spam).
    percent_spam: one spam fraction per client. The legit counts are the spam
      counts in reverse client order, so the fractions are expected to be
      mirror-symmetric and to average 0.5.

  Returns:
    A list of ``(legit_count, spam_count)`` tuples, one per client.

  Raises:
    ValueError: if ``percent_spam`` is empty.
    AssertionError: if the counts do not add up to a balanced split of every
      sample.
  """
  train_labels = np.asarray(train_labels).astype("int")
  num_clients = len(percent_spam)
  if num_clients == 0:
    raise ValueError("percent_spam must contain at least one client")

  client_size = len(train_labels) // num_clients
  spam_counts = [floor(percent * client_size) for percent in percent_spam]
  # Client i gets as many legit samples as client (n - 1 - i) gets spam.
  legit_counts = spam_counts[::-1]

  # Samples lost to the floor/integer division above are handed out round-robin.
  leftovers = int(len(train_labels) / 2 - sum(spam_counts))
  for i in range(leftovers):
    client = i % num_clients
    mirror = num_clients - 1 - client
    spam_percentage = percent_spam[client]
    if spam_percentage == 1:
      # Spam-only client: its extra sample is spam, the mirrored client takes the legit one.
      spam_counts[client] += 1
      legit_counts[mirror] += 1
    elif spam_percentage == 0:
      # Legit-only client: its extra sample is legit, the mirrored client takes the spam one.
      legit_counts[client] += 1
      spam_counts[mirror] += 1
    else:
      legit_counts[client] += 1
      spam_counts[client] += 1

  assert(sum(spam_counts) == sum(legit_counts))
  assert(sum(spam_counts) + sum(legit_counts) == len(train_labels)), f'Difference is {len(train_labels) - sum(spam_counts) - sum(legit_counts)}'
  return list(zip(legit_counts, spam_counts))


def split(train_labels, split_vals):
  """Partition sample indices among clients according to per-client class counts.

  Args:
    train_labels: array-like of 0/1 labels (0 = legitimate, 1 = spam).
    split_vals: iterable of ``(legit_count, spam_count)`` tuples, one per client.

  Returns:
    A list with one shuffled index array per client. The arrays are disjoint:
    no sample is handed to more than one client.

  Raises:
    ValueError: if more legitimate or spam samples are requested in total than
      the labels contain.
  """
  train_labels = np.asarray(train_labels)
  split_vals = list(split_vals)
  legit_indices = np.argwhere(1 - train_labels).flatten()
  spam_indices = np.argwhere(train_labels).flatten()

  legit_needed = sum(legit_count for legit_count, _ in split_vals)
  spam_needed = sum(spam_count for _, spam_count in split_vals)
  if legit_needed > len(legit_indices) or spam_needed > len(spam_indices):
    raise ValueError(
        f"Requested {legit_needed} legit / {spam_needed} spam samples but only "
        f"{len(legit_indices)} / {len(spam_indices)} are available")

  # Shuffle each class once and hand out consecutive slices. Drawing a fresh
  # random sample from the full pool for every client would let the same
  # message land on several clients and leave other messages unused.
  legit_pool = np.random.permutation(legit_indices)
  spam_pool = np.random.permutation(spam_indices)

  indices = []
  legit_start = 0
  spam_start = 0
  for legit_count, spam_count in split_vals:
    legit = legit_pool[legit_start:legit_start + legit_count]
    spam = spam_pool[spam_start:spam_start + spam_count]
    legit_start += legit_count
    spam_start += spam_count
    combined = np.hstack((legit, spam))
    # Mix the two classes within the client's own share.
    np.random.shuffle(combined)
    indices.append(combined)
  return indices



def generate_splits(trainingData):
    """Build the table that assigns training rows to clients for each experiment.

    Args:
        trainingData: either a pandas DataFrame with a ``label`` column, or a
            mapping whose ``'train'`` entry exposes a ``label`` column.

    Returns:
        A DataFrame with one row per assigned sample and the columns
        ``indices`` (row position in the training data), ``experiment``
        (0, 1 or 2) and ``client_id`` (0-4).

    Notes:
        ``get_splits`` sizes each class to half of the data, so the input is
        expected to be class-balanced.
        NOTE(review): confirm behaviour on the un-resampled dataset.
    """
    if not isinstance(trainingData, pd.DataFrame):
        trainingData = trainingData['train']
    train_labels = np.asarray(trainingData['label'])
    print("Num Spam: ", len(train_labels[train_labels == 1]), "Num Legitimate: ", len(train_labels[train_labels == 0]))

    # Edit these arrays to change the per-client spam fraction of each
    # experiment. Each array must average out to 0.5 or get_splits will not
    # produce a valid partition.
    # To hand-craft a split instead, do something similar to the example
    # below: a list of tuples, one per client, (NUM_LEGITIMATE, NUM_SPAM):
    #   [(2250, 250), (400, 100), (248, 248), (481, 0), (482, 0)]
    spam_distributions = [
        [.5, .5, .5, .5, .5],
        [0, 0, .5, 1, 1],
        [0.15, 0.3, .5, .7, .85],
    ]
    split_vals = [get_splits(train_labels, dist) for dist in spam_distributions]
    print(split_vals[1])

    # One list of per-client index arrays per experiment, drawn in order.
    total_data = [split(train_labels, vals) for vals in split_vals]

    dataframe_info = {
        'indices': [],
        'experiment': [],
        'client_id': [],
    }

    for experiment_no, experiment_data in enumerate(total_data):
        for client_id, curr_data in enumerate(experiment_data):
            dataframe_info['indices'] += curr_data.astype('int').tolist()
            dataframe_info['experiment'] += [int(experiment_no)] * len(curr_data)
            dataframe_info['client_id'] += [int(client_id)] * len(curr_data)

    # Sanity output: all three columns must have the same length.
    for arr in dataframe_info.values():
        print(len(arr))

    return pd.DataFrame.from_dict(dataframe_info)
  

### Federated Learning Models

In [None]:
def train(net, trainloader, epochs, device='cpu'):
    """Train ``net`` in place with AdamW (lr=5e-5).

    Args:
        net: model whose forward pass accepts the batch as keyword arguments
            and returns an object exposing ``.loss``.
        trainloader: iterable of dict batches mapping names to tensors.
        epochs: number of full passes over ``trainloader``.
        device: device every batch tensor is moved to before the forward pass.
    """
    optimizer = torch.optim.AdamW(net.parameters(), lr=5e-5)
    net.train()
    for _epoch in range(epochs):
        for raw_batch in trainloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            net(**inputs).loss.backward()
            optimizer.step()
            # Clear gradients only after the step, as the original loop does.
            optimizer.zero_grad()


def test(net, testloader, device='cpu'):
    """Evaluate ``net`` on ``testloader``.

    Args:
        net: model whose forward pass accepts the batch as keyword arguments
            and returns an object exposing ``.loss`` and ``.logits``.
        testloader: iterable of dict batches; each batch must contain a
            ``'labels'`` tensor.
        device: device every batch tensor is moved to.

    Returns:
        ``(loss, metrics)`` where ``loss`` is the per-example average loss and
        ``metrics`` has the keys ``'f1'``, ``'acc'``, ``'prec'`` and ``'rec'``.

    Notes:
        Each batch loss is weighted by its batch size before averaging. The
        previous version summed per-batch losses and divided by the dataset
        size, which under-reported the loss by roughly the batch size when
        ``outputs.loss`` is a batch mean.
    """
    scorers = {
        name: evaluate.load(name)
        for name in ("f1", "accuracy", "precision", "recall")
    }

    total_loss = 0.0
    num_examples = 0

    net.eval()
    for batch in testloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = net(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)

        batch_size = batch['labels'].size(0)
        total_loss += outputs.loss.item() * batch_size
        num_examples += batch_size

        for scorer in scorers.values():
            scorer.add_batch(predictions=predictions, references=batch['labels'])

    # Guard against an empty loader instead of dividing by zero.
    loss = total_loss / num_examples if num_examples else 0.0

    metrics = {
        'f1': scorers['f1'].compute()['f1'],
        'acc': scorers['accuracy'].compute()['accuracy'],
        'prec': scorers['precision'].compute()['precision'],
        'rec': scorers['recall'].compute()['recall'],
    }

    return loss, metrics

# Base dataset; load_data() overwrites its 'train' entry on every call and
# uses its 'test' entry for evaluation.
# NOTE(review): access_token is not defined in this cell — presumably set
# earlier in the notebook; confirm.
raw_datasets = load_dataset("jyoung2247/sms_spam", use_auth_token=access_token)


# Notebook-wide settings read by load_data(); the experiment cells below
# reassign them before each run.
# mod_dataset: the dataset variant whose 'train' split the clients train on.
mod_dataset = None
# experiment: -1 -> random 20% sample per client; otherwise the experiment
# number used to filter the index table `df`.
experiment = -1

# Index table returned by generate_splits() (columns: indices, experiment, client_id).
df = None


def load_data(cid, batch_size=32):
    """Build the train and test DataLoaders for one simulated client.

    Reads the notebook globals ``experiment``, ``mod_dataset``, ``df`` and
    ``raw_datasets``, and overwrites ``raw_datasets['train']`` as a side effect.

    * ``experiment != -1``: the client trains on the rows of
      ``mod_dataset['train']`` that ``df`` lists for (``experiment``, ``cid``).
    * ``experiment == -1``: ``cid`` is ignored and the client trains on a
      random sample of one fifth of ``mod_dataset['train']``.

    Args:
        cid: client id (anything ``int()`` accepts).
        batch_size: batch size for both loaders.

    Returns:
        ``(trainloader, testloader)``.
    """
    use_partition = experiment != -1

    if use_partition:
        cid = int(cid)
        training = pd.DataFrame.from_dict(mod_dataset['train'])
        client_rows = df.loc[(df['experiment'] == experiment) & (df['client_id'] == cid)]
        data = training.iloc[client_rows['indices']]
        # from_pandas keeps the non-default index as an extra column; drop it.
        raw_datasets['train'] = Dataset.from_pandas(data).remove_columns(['__index_level_0__'])
    else:
        raw_datasets['train'] = mod_dataset['train']
        n = len(raw_datasets['train'])
        population_0 = random.sample(list(range(n)), n // 5)

    tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

    def tokenize_function(examples):
        examples['label'] = [int(x) for x in examples['label']]
        return tokenizer(examples["sms"], padding='max_length', truncation=True, max_length=512)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    if not use_partition:
        tokenized_datasets["train"] = tokenized_datasets["train"].select(population_0)

    # Print the (legit, spam) counts of this client's training data.
    st1 = sum(tokenized_datasets['train']['label'])
    print(len(tokenized_datasets['train']) - st1, st1)

    data_collator = DefaultDataCollator()
    shared_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)
    trainloader = DataLoader(tokenized_datasets["train"], shuffle=True, **shared_kwargs)
    testloader = DataLoader(tokenized_datasets["test"], **shared_kwargs)
    return trainloader, testloader


class FlowerClient(fl.client.NumPyClient):
    """Flower client that fine-tunes ``prajjwal1/bert-tiny`` on one client's data.

    Args:
        cid: client id, forwarded to ``load_data``.
        checkpoint_name: folder name (under the Drive checkpoint directory)
            that ``fit`` saves the locally trained model to.

    Notes:
        ``fit`` saves the client's *local* model after each round, and every
        client created with the same ``checkpoint_name`` writes to the same
        folder, so the checkpoint holds whichever client saved last.
    """

    def __init__(self, cid, checkpoint_name='fed_boi') -> None:
        super().__init__()
        self.cid = cid
        # Never hard-code the Hugging Face token: read it from the environment.
        # None is fine for public models.
        access_token = os.environ.get("HF_TOKEN")
        net = AutoModelForSequenceClassification.from_pretrained(
            "prajjwal1/bert-tiny",
            num_labels=2,
            use_auth_token=access_token,
        )
        # Fall back to CPU so the client can still be built without a GPU.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.trainloader, self.testloader = load_data(cid, 8)
        self.net = net.to(self.device)
        self.checkpoint_name = checkpoint_name

    def get_parameters(self, config):
        """Return the model's state-dict values as a list of NumPy arrays."""
        return [val.cpu().numpy() for _, val in self.net.state_dict().items()]

    def set_parameters(self, parameters):
        """Load ``parameters`` (same order as ``state_dict()``) into the model."""
        params_dict = zip(self.net.state_dict().keys(), parameters)
        # torch.tensor keeps each array's dtype (torch.Tensor would force float32).
        state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict})
        self.net.load_state_dict(state_dict, strict=True)

    def fit(self, parameters, config):
        """Train one local epoch from ``parameters``.

        Returns ``(updated_parameters, num_training_examples, {})``.
        """
        self.set_parameters(parameters)
        print("Training Started...")
        train(self.net, self.trainloader, epochs=1, device=self.device)
        print("Training Finished.")
        self.net.save_pretrained(f"/content/drive/MyDrive/CS6220/{self.checkpoint_name}")
        # Flower weights the aggregation by number of examples, not batches.
        return self.get_parameters(config={}), len(self.trainloader.dataset), {}

    def evaluate(self, parameters, config):
        """Evaluate ``parameters`` on the test loader.

        Returns ``(loss, num_test_examples, metrics)``.
        """
        self.set_parameters(parameters)
        loss, metrics = test(self.net, self.testloader, device=self.device)
        return float(loss), len(self.testloader.dataset), metrics


  
def fedmlExp(checkpoint_name = 'fed_boi', num_rounds=4):
    """Run a 5-client Flower FedAvg simulation with ``FlowerClient``.

    Args:
        checkpoint_name: forwarded to every ``FlowerClient``; names the folder
            the clients save their model to.
        num_rounds: number of federated rounds (default 4, as before).

    The dataset and partition used are taken from the notebook globals that
    ``load_data`` reads, so set those before calling.
    """
    pool_size = 5

    def get_evaluate_fn():
        # Named evaluate_global rather than eval to avoid shadowing the builtin.
        def evaluate_global(server_round, parameters, config):
            # A fresh client supplies the model and test loader;
            # FlowerClient.evaluate loads `parameters` itself.
            model = FlowerClient(0)
            loss, _, metrics = model.evaluate(parameters, {})
            return loss, metrics

        return evaluate_global

    def fit_config(server_round):
        return {}

    strategy = fl.server.strategy.FedAvg(
        fraction_fit=0.1,
        fraction_evaluate=0.1,
        min_fit_clients=pool_size,
        min_evaluate_clients=1,
        min_available_clients=pool_size,  # All clients should be available
        on_fit_config_fn=fit_config,
        evaluate_fn=get_evaluate_fn(),  # centralised evaluation of global model
    )

    def client_fn(cid):
        return FlowerClient(cid, checkpoint_name)

    client_resources = {
       "num_cpus": 1,
       "num_gpus": 0.2,
    }

    # start simulation
    fl.simulation.start_simulation(
        client_fn=client_fn,
        num_clients=pool_size,
        config=fl.server.ServerConfig(num_rounds=num_rounds),
        client_resources = client_resources,
        strategy=strategy,
    )


In [None]:
# SMOTE


# Scratch directory; in this cell it is only referenced by commented-out code.
os.makedirs('/content/params', exist_ok=True)
# Index table for the SMOTE data, read by smoteFL() through the global `df`.
# NOTE(review): `oversampled` is not defined in this cell — presumably the
# SMOTE-oversampled DataFrame built earlier in the notebook; confirm.
df = generate_splits(oversampled)

def smoteFL():
    """Run a 5-client, 3-round Flower FedAvg simulation with a linear classifier.

    Everything is read from notebook globals rather than arguments:
    ``oversampled`` and ``df`` (partitioned training rows), ``splits`` (random
    chunks), ``test_data``, ``splitLabel`` and ``experiment``.

    * ``experiment > -1``: client ``cid`` trains on the rows of ``oversampled``
      that ``df`` lists for (``experiment``, ``cid``). Experiments:
      0 - [.5, .5, .5, .5, .5], 1 - [0, 0, .5, 1, 1],
      2 - [.15, .30, .50, .70, .85].
    * otherwise: client ``cid`` trains on ``splits[cid]``.

    Returns None.
    """

    def getData(cid):
        """Return (X_train, y_train) for ``cid`` under the current experiment."""
        cid = int(cid)
        mask = (df['experiment'] == experiment) & (df['client_id'] == cid)
        data = oversampled.iloc[df.loc[mask, 'indices']]
        # splitLabel is defined elsewhere; presumably it returns the features
        # without the label — TODO confirm.
        X_train = splitLabel(data)
        y_train = data['label']
        return X_train, y_train

    def getDataRandom(cid):
        """Return (X_train, y_train) from the pre-shuffled chunk ``splits[cid]``."""
        cid = int(cid)
        data = splits[cid]
        X_train = splitLabel(data)
        y_train = data['label']
        return X_train, y_train

    X_test = splitLabel(test_data)
    y_test = test_data['label']

    # One estimator object is shared by every client created in this call.
    classifier = PassiveAggressiveClassifier(max_iter=1000, random_state=0, warm_start=True)

    class SmoteClient(fl.client.NumPyClient):
        """Flower client wrapping the shared PassiveAggressiveClassifier."""

        def __init__(self, cid) -> None:
            super().__init__()
            self.cid = cid

        def get_parameters(self, config):
            """Return [coef_, intercept_, classes_], or [] before the first fit."""
            if hasattr(classifier, 'coef_'):
                return [
                    classifier.coef_,
                    classifier.intercept_,
                    classifier.classes_,
                ]
            return []

        def set_parameters(self, parameters):
            """Load [coef_, intercept_, classes_]; an empty list is ignored."""
            if len(parameters) != 0:
                classifier.coef_ = parameters[0]
                classifier.intercept_ = parameters[1]
                classifier.classes_ = parameters[2]

        def fit(self, parameters, config):
            """Fit on this client's data; returns (parameters, num_examples, {})."""
            # Round 1 starts from the unfitted estimator.
            if config['rnd'] > 1:
                self.set_parameters(parameters)
            print("Training Started...")
            if experiment > -1:
                X_train, y_train = getData(self.cid)
            else:
                X_train, y_train = getDataRandom(self.cid)
            classifier.fit(X_train, y_train)
            print("Training Finished.")
            # Count rows via the labels so this does not depend on the feature
            # matrix supporting len().
            return self.get_parameters(config), len(y_train), {}

        def evaluate(self, parameters, config):
            """Score on the test data.

            Returns (accuracy, num_examples, metrics). Note that accuracy is
            placed in the slot Flower treats as the loss.
            """
            self.set_parameters(parameters)
            accuracy = classifier.score(X_test, y_test)
            y_pred = classifier.predict(X_test)
            p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
            return accuracy, len(y_test), {'acc': accuracy, 'prec': p, 'rec': r, 'f1': f}

    pool_size = 5

    def get_evaluate_fn():
        # Named evaluate_global rather than eval to avoid shadowing the builtin.
        def evaluate_global(server_round, parameters, config):
            # Up to and including round 1 this returns a placeholder result.
            if server_round > 1:
                model = SmoteClient(0)
                accuracy, _, metric = model.evaluate(parameters, {})
                return accuracy, metric
            return 0.0, {}
        return evaluate_global

    def fit_config(server_round):
        return {'rnd': server_round}

    strategy = fl.server.strategy.FedAvg(
        fraction_fit=0.1,
        min_fit_clients=pool_size,
        min_evaluate_clients=1,
        min_available_clients=pool_size,  # All clients should be available
        on_fit_config_fn=fit_config,
        evaluate_fn=get_evaluate_fn(),  # centralised evaluation of global model
    )

    def client_fn(cid):
        return SmoteClient(cid)

    client_resources = {
       "num_cpus": 1,
       "num_gpus": 0.0,
    }

    # start simulation
    fl.simulation.start_simulation(
        client_fn=client_fn,
        num_clients=pool_size,
        config=fl.server.ServerConfig(num_rounds=3),
        client_resources = client_resources,
        strategy=strategy,
    )




### Federated Learning Models - Unbalanced

In [None]:
# Point the shared globals at the un-resampled dataset and rebuild the
# per-client index table for it.
# NOTE(review): get_splits sizes each class to half of the data; confirm that
# holds for this dataset.
mod_dataset = load_dataset(f"jyoung2247/sms_spam", use_auth_token=access_token)
df = generate_splits(mod_dataset)
# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_0_0')


In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_0_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_0_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp("fed_boi_0_3")

### Federated Learning Models - Naive Random Oversampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
# NOTE(review): dataset_names lists 'sms_spam_naive_oversampled', not
# "sms_spam_random_oversampled" — confirm which name is correct.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_random_oversampled", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_1_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_1_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_1_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp('fed_boi_1_3')

### Federated Learning Models - Synonym Oversampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_augmented_synonyms", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_2_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_2_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_2_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp('fed_boi_2_3')

### Federated Learning Models - Context Substitute Oversampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_augmented_context_substitute", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_3_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_3_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_3_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp('fed_boi_3_3')

### Federated Learning Models - Context Insert Oversampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_augmented_context_insert", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_4_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_4_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
# (Was -1, which re-ran the random-sample setup under the "_4_2" checkpoint name.)
experiment = 1
fedmlExp('fed_boi_4_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
# (Was -1, which re-ran the random-sample setup under the "_4_3" checkpoint name.)
experiment = 2
fedmlExp('fed_boi_4_3')

### Federated Learning Models - SMOTE Oversampling

In [None]:
# Shuffle the SMOTE frame and cut it into 5 chunks; smoteFL() reads
# splits[cid] when experiment == -1. No random_state is set, so the chunks
# differ between runs.
oversampledShuffle = oversampled.sample(frac=1)
splits = np.array_split(oversampledShuffle, 5)

In [None]:
# The in-memory `oversampled` frame is used instead of the Hub copy below.
# mod_dataset = load_dataset(f"jyoung2247/sms_spam_oversampled_SMOTE", use_auth_token=access_token)
mod_dataset = oversampled
df = generate_splits(mod_dataset)

# experiment = -1: smoteFL trains each client on its random chunk splits[cid].
experiment = -1
smoteFL()

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
smoteFL()

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
smoteFL()

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
smoteFL()

### Federated Learning Models - Naive Random Undersampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_undersampled_random", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_5_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_5_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_5_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp('fed_boi_5_3')

### Federated Learning Models - CNN Undersampling

In [None]:
# Point the shared globals at this dataset variant and rebuild the index table.
mod_dataset = load_dataset(f"jyoung2247/sms_spam_undersampled_CNN", use_auth_token=access_token)
df = generate_splits(mod_dataset)

# experiment = -1: load_data gives each client a random fifth of the train split.
experiment = -1
fedmlExp('fed_boi_6_0')

In [None]:
# experiment = 0: clients use the [.5, .5, .5, .5, .5] spam-fraction partition.
experiment = 0
fedmlExp('fed_boi_6_1')

In [None]:
# experiment = 1: clients use the [0, 0, .5, 1, 1] spam-fraction partition.
experiment = 1
fedmlExp('fed_boi_6_2')

In [None]:
# experiment = 2: clients use the [0.15, 0.3, .5, .7, .85] spam-fraction partition.
experiment = 2
fedmlExp('fed_boi_6_3')

In [None]:
# Remove the scratch directory created by os.makedirs('/content/params', ...) in the SMOTE cell.
!rm -rf /content/params

### Predict a single SMS label with Federated model

In [None]:
# Load a checkpoint saved by a federated run and classify one SMS with it.
# NOTE(review): training_args, tokenized_datasets, compute_metrics, tokenizer
# and dataset are not defined in this cell — presumably left over from earlier
# cells; confirm they exist on a fresh Restart & Run All.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/CS6220/fed_boi_0_0")

# In this cell the Trainer is only used to reach the model and its device.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

SMS_to_predict = "testSMS"

encoding = tokenizer(SMS_to_predict, padding="max_length", truncation=True, return_tensors='pt', max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# Inference only: force eval mode and skip building the autograd graph.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)
logits = outputs.logits

# logits has shape (1, num_labels); squeeze to a vector before the softmax.
softmax = torch.nn.Softmax(dim=0)
probs = softmax(logits.squeeze())

predictionProb, predictionIndex = torch.max(probs, axis=0)

print(predictionProb)
print(predictionIndex)

#get label from prediction
labels = dataset["train"].features["label"].names
print(labels[predictionIndex.item()])
print(predictionProb.item())