In [None]:
!pip install transformers

import torch
from transformers import RobertaModel, RobertaTokenizer

import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
import pandas as pd
import numpy as np
import math
from tqdm import tqdm

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 13.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 8.1 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacrem

# Create RoBERTa tokenizer and model

In [None]:
# Pretrained 'roberta-base' tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Matching pretrained encoder. output_hidden_states=True makes the model
# return the hidden states of all layers, which sen_2_embed reads.
model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Function to convert texts into a DataFrame
1. Convert sentence to embedding
2. Create dataFrame where each row is sentence embedding

Inputs:
1. texts: A list of sentences, one string per message
##### ex: ["The dog", "A cat"]

2. labels: A list of labels, one per sentence, where 1 = spam and 0 = ham (pass None to omit the target column)
##### ex: [1, 0]

3. model: RoBERTa pretrained model

In [None]:
def sen_2_embed(text, sent_num, model):
  """Embed one text as a single mean-pooled RoBERTa vector.

  Args:
    text: The sentence/message to embed (a string).
    sent_num: Unused; kept so existing callers keep working. It used to be
      broadcast into a per-token tensor and passed as the model's second
      positional argument, which RobertaModel treats as the attention mask
      rather than as segment ids.
    model: A RobertaModel compatible with the module-level `tokenizer`.

  Returns:
    A numpy array of shape (1, hidden_size): the average over tokens of the
    second-to-last layer's hidden states.
  """
  # Let the tokenizer build the model inputs. It adds RoBERTa's own <s>/</s>
  # special tokens (so an empty string still yields a non-empty sequence),
  # truncates to the tokenizer's configured maximum length, and returns an
  # all-ones attention mask for this single, unpadded sequence.
  encoded = tokenizer(text, truncation=True, return_tensors='pt')

  # Put model into evaluation mode
  model.eval()

  # Get the hidden states of every layer without tracking gradients.
  # The flags are passed explicitly so this does not depend on how the model
  # was configured when it was loaded.
  with torch.no_grad():
    outputs = model(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        output_hidden_states=True,
        return_dict=True,
    )
  hidden_states = outputs.hidden_states

  # Take the second to last hidden layer; [0] selects the only batch item
  token_vecs = hidden_states[-2][0]

  # Want a single vector for the entire sentence.
  # Simple approach: average the second to last hidden layer over all tokens.
  sentence_embedding = torch.mean(token_vecs, dim=0)

  # Add a leading axis so callers can stack results row by row
  return sentence_embedding.unsqueeze(0).numpy()


def texts_2_df(texts, labels, model):
  """Embed every text and collect the vectors into a DataFrame.

  Args:
    texts: Sequence of strings; each one is embedded with sen_2_embed.
    labels: Optional sequence (list, numpy array, Series, ...) holding one
      label per text, or None when no 'target' column is wanted.
    model: Model forwarded unchanged to sen_2_embed.

  Returns:
    DataFrame with one row per text and one column per embedding dimension,
    plus a trailing 'target' column when labels is given. An empty `texts`
    yields an empty DataFrame.

  Raises:
    ValueError: If labels is given but its length differs from len(texts).
  """
  # `labels is not None` instead of truthiness: `if labels:` raises for numpy
  # arrays and pandas Series.
  if labels is not None and len(labels) != len(texts):
    raise ValueError(
        f"Got {len(labels)} labels for {len(texts)} texts; lengths must match")

  # Every text is an independent single-segment input, so the same segment id
  # (1) is used for all of them rather than a running index.
  rows = [sen_2_embed(text, 1, model) for text in texts]

  # Stack once at the end; concatenating inside the loop re-copies all
  # previously collected rows on every iteration.
  if rows:
    df = pd.DataFrame(np.concatenate(rows, axis=0))
  else:
    df = pd.DataFrame()

  # If training, add labels column to end of dataframe. np.asarray makes the
  # assignment positional even if labels carries its own pandas index.
  if labels is not None:
    df['target'] = np.asarray(labels)
  return df
    

# Download Spam dataset

In [None]:
!pip install datasets
import datasets
dataset = datasets.load_dataset('sms_spam')

Collecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[?25l[K     |█                               | 10 kB 18.3 MB/s eta 0:00:01[K     |██                              | 20 kB 9.0 MB/s eta 0:00:01[K     |██▉                             | 30 kB 8.9 MB/s eta 0:00:01[K     |███▉                            | 40 kB 7.6 MB/s eta 0:00:01[K     |████▉                           | 51 kB 4.0 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 4.7 MB/s eta 0:00:01[K     |██████▊                         | 71 kB 5.0 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 4.2 MB/s eta 0:00:01[K     |████████▋                       | 92 kB 4.7 MB/s eta 0:00:01[K     |█████████▋                      | 102 kB 5.1 MB/s eta 0:00:01[K     |██████████▌                     | 112 kB 5.1 MB/s eta 0:00:01[K     |███████████▌                    | 122 kB 5.1 MB/s eta 0:00:01[K     |████████████▌                   | 133 kB 5.1 MB/s eta 0:00:01[K

Downloading builder script:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/901 [00:00<?, ?B/s]

Downloading and preparing dataset sms_spam/plain_text (download: 198.65 KiB, generated: 509.53 KiB, post-processed: Unknown size, total: 708.17 KiB) to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c...


Downloading data:   0%|          | 0.00/203k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

Dataset sms_spam downloaded and prepared to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

# Create:
*   X_train 
*   y_train
*   X_test
*   y_test

In [None]:
# The first 70% of the examples (rounded up) form the training portion.
train_len = math.ceil(0.7 * len(dataset['train']))

# Slice the rows once per column instead of indexing row by row.
train_reviews = dataset['train'][:train_len]['sms']
train_labels = dataset['train'][:train_len]['label']

In [None]:
# Embed the training messages; the labels become the 'target' column.
train_df = texts_2_df(texts=train_reviews, labels=train_labels, model=model)

In [None]:
# Separate the label column from the embedding columns.
y_train = train_df['target']
X_train = train_df.drop(columns=['target'])

In [None]:
# Everything after the training portion is held out for testing.
tot_len = len(dataset['train'])
test_reviews = dataset['train'][train_len:tot_len]['sms']
test_labels = dataset['train'][train_len:tot_len]['label']

In [None]:
# Embed the held-out messages; no labels are passed, so no 'target' column.
test_df = texts_2_df(texts=test_reviews, labels=None, model=model)

In [None]:
# test_df holds only embedding columns (it was built without labels), so it
# is used directly as the test feature matrix.
X_test = test_df

In [None]:
def get_accuracy(test_labels, y_test):
  """Return the fraction of positions at which the two label sequences agree.

  Args:
    test_labels: Sequence of reference labels.
    y_test: Sequence of predicted labels, the same length as test_labels.

  Returns:
    Number of matching positions divided by the number of labels.

  Raises:
    ValueError: If the sequences differ in length or are empty.
  """
  total_count = len(test_labels)
  # A longer y_test used to be accepted silently, scoring only its prefix.
  if total_count != len(y_test):
    raise ValueError(
        f"Length mismatch: {total_count} labels vs {len(y_test)} predictions")
  if total_count == 0:
    raise ValueError("Cannot compute accuracy for an empty label sequence")

  # zip pairs the values by position, so a pandas Series with a non-default
  # index is compared positionally rather than looked up by index label.
  correct_count = sum(
      1 for expected, predicted in zip(test_labels, y_test)
      if expected == predicted)
  return correct_count / total_count

# Naive Bayes

In [None]:
from sklearn import naive_bayes
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Gaussian Naive Bayes over the embedding columns.
model_naive_bayes = naive_bayes.GaussianNB()
# Left as the cell's last expression so the fitted estimator is displayed.
model_naive_bayes.fit(X_train, y_train)

GaussianNB()

In [None]:
# Predict one label per row of X_test with the fitted Naive Bayes model.
y_test_naive_bayes = model_naive_bayes.predict(X_test)

In [None]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = get_accuracy(test_labels, y_test_naive_bayes)
print(f"Accuracy of Naive Bayes on test set: {accuracy}")

Accuracy of Naive Bayes on test set: 0.9868421052631579


In [None]:
# Precision / recall / F-score for label 1 (spam), plus overall accuracy.
precision, recall, fscore, support = score(
    test_labels, y_test_naive_bayes, average='binary', pos_label=1
)
print('Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'.format(
    *(round(value, 3) for value in (
        precision,
        recall,
        fscore,
        (np.asarray(test_labels) == y_test_naive_bayes).mean(),
    ))
))

Precision : 0.99 / Recall : 0.912 / fscore : 0.95 / Accuracy: 0.987


# Random Forest

In [None]:
# Fix the seed: a random forest draws bootstrap samples and feature subsets
# at random, so without random_state every run gives different scores.
model_random_forest = RandomForestClassifier(random_state=42)
# Left as the cell's last expression so the fitted estimator is displayed.
model_random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
# Predict one label per row of X_test with the fitted random forest.
y_test_random_forest = model_random_forest.predict(X_test)

In [None]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = get_accuracy(test_labels, y_test_random_forest)
print(f"Accuracy of Random Forest on test set: {accuracy}")

Accuracy of Random Forest on test set: 0.9832535885167464


In [None]:
# Precision / recall / F-score for label 1 (spam), plus overall accuracy.
precision, recall, fscore, support = score(
    test_labels, y_test_random_forest, average='binary', pos_label=1
)
print('Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'.format(
    *(round(value, 3) for value in (
        precision,
        recall,
        fscore,
        (np.asarray(test_labels) == y_test_random_forest).mean(),
    ))
))

Precision : 1.0 / Recall : 0.877 / fscore : 0.935 / Accuracy: 0.983


# Logistic Regression

In [None]:
from sklearn import linear_model

In [None]:
# The recorded run with default settings stopped with "TOTAL NO. of
# ITERATIONS REACHED LIMIT", i.e. the solver had not converged. Raise the
# iteration cap so the fit can run to convergence.
model_logistic_regression = linear_model.LogisticRegression(max_iter=1000)
# Left as the cell's last expression so the fitted estimator is displayed.
model_logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
# Predict one label per row of X_test with the fitted logistic regression.
y_test_logistic_regression = model_logistic_regression.predict(X_test)

In [None]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = get_accuracy(test_labels, y_test_logistic_regression)
print(f"Accuracy of Logistic Regression on test set: {accuracy}")

Accuracy of Logistic Regression on test set: 0.993421052631579


In [None]:
# Precision / recall / F-score for label 1 (spam), plus overall accuracy.
precision, recall, fscore, support = score(
    test_labels, y_test_logistic_regression, average='binary', pos_label=1
)
print('Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'.format(
    *(round(value, 3) for value in (
        precision,
        recall,
        fscore,
        (np.asarray(test_labels) == y_test_logistic_regression).mean(),
    ))
))

Precision : 0.995 / Recall : 0.956 / fscore : 0.975 / Accuracy: 0.993


# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Fix the seed: the MLP's weight initialisation and batch shuffling are
# random, so without random_state every run gives different scores.
model_neural_network = MLPClassifier(random_state=42)
# Left as the cell's last expression so the fitted estimator is displayed.
model_neural_network.fit(X_train, y_train)

MLPClassifier()

In [None]:
# Predict one label per row of X_test with the fitted neural network.
y_test_neural_network = model_neural_network.predict(X_test)

In [None]:
# Share of held-out messages whose predicted label equals the true label.
accuracy = get_accuracy(test_labels, y_test_neural_network)
print(f"Accuracy of neural network on test set: {accuracy}")

Accuracy of neural network on test set: 0.9952153110047847


In [None]:
# Precision / recall / F-score for label 1 (spam), plus overall accuracy.
precision, recall, fscore, support = score(
    test_labels, y_test_neural_network, average='binary', pos_label=1
)
print('Precision : {} / Recall : {} / fscore : {} / Accuracy: {}'.format(
    *(round(value, 3) for value in (
        precision,
        recall,
        fscore,
        (np.asarray(test_labels) == y_test_neural_network).mean(),
    ))
))

Precision : 1.0 / Recall : 0.965 / fscore : 0.982 / Accuracy: 0.995


# Our Spam Messages

In [None]:
# Hand-picked messages used as a small extra evaluation set.
# NOTE(review): this rebinds X_test, which the earlier cells use for the
# held-out test features; re-running those prediction cells after this one
# would predict on these 15 messages instead.
X_test = ["Nah I don't think he goes to usf, he lives around here though",
          "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
          "WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.",
          "K:)k.are you in college?",
          "Oops, I'll let you know when my roommate's done with the work",
          "I'm back, lemme know when you're ready",
          "Thanks for your subscription to Ringtone UK your mobile will be charged å£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged",
          "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
          "URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18",
          "Congratulations ur awarded 500 of CD vouchers or 125gift guaranteed & Free entry 2 100 wkly draw txt MUSIC to 87066 TnCs www.Ldew.com1win150ppmx3age16", 
          "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123456to claim now.",
          "Your IRS tax refund is pending acceptance. Must accept within 24 hours:http://bit.ly/sdfsdf.",
          "Amazon is sending you a refund of $32.64. Please reply with your bank account and routing number to receive your refund.",
          "Wells Fargo Bank: Your account is temporarily locked. Please log in at http://goo.gl/2a234 to secure your account.",
          "Apple Notification. Your Apple iCloud ID expires today. Log in to prevent deletion http://apple.id/user-auth/ online"]

# Expected label for each message above, in the same order (1 = spam, 0 = ham).
y_spam = [0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1]
# Replace the raw strings with their embedding DataFrame (no 'target' column).
X_test = texts_2_df(X_test, None, model)

In [None]:
# The order here matches the numbered menu printed by the interactive cell,
# which indexes this list with (choice - 1).
models = [model_naive_bayes, model_logistic_regression, model_random_forest, model_neural_network]
# Score every fitted classifier on the hand-labelled messages.
for m in models:
  y = m.predict(X_test)
  accuracy = get_accuracy(y_spam, y)
  print(f"Accuracy : {accuracy} for {m}")

Accuracy : 0.8666666666666667 for GaussianNB()
Accuracy : 1.0 for LogisticRegression()
Accuracy : 0.7333333333333333 for RandomForestClassifier()
Accuracy : 1.0 for MLPClassifier()


# Determine if spam

In [None]:
# Maps the classifiers' numeric predictions to readable names.
ind_to_label = {0: "Ham", 1: "Spam"}
print("Which classifier do you want to use?")
print("1 : Naive Bayes \n2 : Logistic Regression \n3 : Random Forest \n4 : Neural Network")
ind = input()

# Validate the menu choice explicitly. Without this, 0 or a negative number
# would silently wrap around and pick a classifier from the end of `models`,
# and other bad input would surface as an unexplained ValueError/IndexError.
try:
  choice = int(ind)
except ValueError:
  choice = 0
if not 1 <= choice <= len(models):
  raise ValueError(
      f"Classifier choice must be a number from 1 to {len(models)}, got {ind!r}")
classifier = models[choice - 1]

# input() already returns a string; embed it as a one-row feature frame.
message = input("Enter message to check if it is spam: ")
X_test = texts_2_df([message], None, model)
y = classifier.predict(X_test)
print(ind_to_label[y[0]])

Which classifier do you want to use?
1 : Naive Bayes 
2 : Logistic Regression 
3 : Random Forest 
4 : Neural Network
