In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from wordcloud import WordCloud, STOPWORDS
import sys, os


# Load the training CSV into a dataframe.
# The path is relative, so the file is resolved against the notebook's working directory.
df1 = pd.read_csv('train_cleaned-2.csv')

# Helper that turns the numeric label into a readable category name
def categorize_label(label):
    """Return 'spam' when ``label`` equals 1, otherwise 'not spam'."""
    return 'spam' if label == 1 else 'not spam'
    
# Prepare the frame in one pass:
#   * replace the numeric label with its readable category name,
#   * drop the unused subject column,
#   * add a numeric `spam` target (spam=1, not spam=0),
#   * rename the message column to `Text`,
#   * discard rows that still contain a missing value in any column.
# The former mid-cell `df1.head()` and `df1.update(df1['Text'])` had no effect
# (the first result was never displayed, the second wrote a column onto itself)
# and were removed.
df1 = (
    df1
    .assign(label=df1['label'].apply(categorize_label))
    .drop(columns='cleaned-subject')
    .assign(spam=lambda frame: frame['label'].eq('spam').astype('int64'))
    .rename(columns={'cleaned-text': 'Text'})
    .dropna()
)

# Hold out 25% of the messages for evaluation. The seed is fixed so that a
# "Restart & Run All" reproduces the same split and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    df1.Text, df1.spam, test_size=0.25, random_state=42
)

# Learn the vocabulary on the training split only and encode it as word counts.
countvector = CountVectorizer()
x_train_ct = countvector.fit_transform(X_train.values)

# Dense view of the count matrix, shown as the cell output only.
# NOTE(review): this materialises the full matrix in memory; consider
# displaying `x_train_ct.shape` instead if the corpus is large.
x_train_ct.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [2]:
# Fit a Multinomial Naive Bayes classifier on the word-count training matrix
# (`fit` returns the estimator itself, so construction and fitting are chained).
modelMB = MultinomialNB().fit(x_train_ct, y_train)

# Sanity check: a casual message that is expected to come back as not spam (0)
email_notspam = ['hey wanna get some coffee?']
email_notspam_count = countvector.transform(email_notspam)
modelMB.predict(email_notspam_count)

array([0], dtype=int64)

In [3]:
# Sanity check: a message packed with promotional keywords, expected to come back as spam (1).
# The encoded vector is kept in `email_spam_count` because later cells reuse it.
email_spam = ['reward click claim prize free money coupon discount bonus offer only for you 100 Earn back order now limited time offer 50 off today only']
email_spam_count = countvector.transform(email_spam)
modelMB.predict(email_spam_count)

array([1], dtype=int64)

In [4]:
# Encode the held-out messages once with the vocabulary learned on the training split;
# the same matrix is reused by every model below.
x_test_count = countvector.transform(X_test)

# --- Multinomial Naive Bayes (fitted in an earlier cell) ---
naive_bayes_accuracy = modelMB.score(x_test_count, y_test)

# --- Random Forest classifier ---
# Seeded so the forest (and its accuracy) is reproducible between runs.
rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
rfc.fit(x_train_ct, y_train)
random_forest_accuracy = accuracy_score(y_test, rfc.predict(x_test_count))

# --- Support Vector Machine ---
svm_model = SVC()
svm_model.fit(x_train_ct, y_train)
# Predict once and reuse the result; SVC prediction is the slowest step of this cell.
svm_predictions = svm_model.predict(x_test_count)
svm_accuracy = accuracy_score(y_test, svm_predictions)

# `y_pred` ends up holding the SVM predictions, exactly as it did before
# (the intermediate Naive Bayes / Random Forest values were overwritten unused).
y_pred = svm_predictions

In [5]:
# Hard-voting ensemble over the three classical models: each estimator casts one
# vote per message and the majority class wins.
# VotingClassifier.fit trains its own clones of the estimators, so the
# earlier individual fits are not reused here.
voting_cf = VotingClassifier(
    estimators=[('nb', modelMB), ('rf', rfc), ('svm', svm_model)],
    voting='hard',
)
voting_cf.fit(x_train_ct, y_train)

# One prediction pass only: the previous discarded `voting_cf.score(...)` call ran
# the whole ensemble over the test set a second time for no result.
voting_cf_accuracy = accuracy_score(y_test, voting_cf.predict(x_test_count))

In [6]:
# Display the hard-voting ensemble's accuracy on the held-out test split
voting_cf_accuracy

0.9395429859616764

In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from wordcloud import WordCloud, STOPWORDS
import sys, os


# Load the training CSV into a dataframe.
# The path is relative, so the file is resolved against the notebook's working directory.
df1 = pd.read_csv('train_cleaned-2.csv')

# Helper that turns the numeric label into a readable category name
def categorize_label(label):
    """Map a label to its category: 'spam' for a value equal to 1, 'not spam' for anything else."""
    is_spam = (label == 1)
    return 'spam' if is_spam else 'not spam'
    
# Build the modelling frame as a single method chain:
#   * swap the numeric label for its readable category name,
#   * drop the unused subject column,
#   * add a numeric `spam` target (spam=1, not spam=0),
#   * rename the message column to `Text`,
#   * discard rows that still contain a missing value in any column.
# The mid-cell `df1.head()` and the self-`update` of `Text` produced no output
# and no change, so they are not part of the chain.
df1 = (
    df1
    .assign(label=df1['label'].apply(categorize_label))
    .drop(columns='cleaned-subject')
    .assign(spam=lambda frame: frame['label'].eq('spam').astype('int64'))
    .rename(columns={'cleaned-text': 'Text'})
    .dropna()
)

In [23]:
import torch 
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Per-model accuracies, used as the weight of each model's vote.
# NOTE(review): these are hard-coded; confirm they match the latest evaluation.
# NOTE(review): `svm_accuracy` reuses the name of the value computed when the
# SVM is evaluated earlier in the notebook and overwrites it.
bert_accuracy = 0.97
naive_accuracy = 0.93
svm_accuracy = 0.89
random_accuracy = 0.94

def voting_system(bert_label, naive_label, svm_label, random_label):
    """Combine four binary predictions into one label by accuracy-weighted majority.

    Each argument is a model's prediction (1 = spam, 0 = not spam); it may be
    a plain number or a single-element array. A model's vote counts with a
    weight equal to its accuracy constant defined above.

    Returns 1 when the spam votes carry at least half of the total weight,
    otherwise 0.

    The previous threshold, ``0.5 ** (4 ** 0.5) == 0.25``, was lower than every
    individual weight, so a single spam vote always decided the outcome. The
    threshold is now half of the summed weights, which makes this a real
    weighted majority.
    """
    total_weight = bert_accuracy + naive_accuracy + svm_accuracy + random_accuracy
    vote_threshold = total_weight / 2

    weighted_vote = (bert_label * bert_accuracy +
                     naive_label * naive_accuracy +
                     svm_label * svm_accuracy +
                     random_label * random_accuracy)

    return 1 if weighted_vote >= vote_threshold else 0

# Run BERT on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT model: base architecture with a 2-class head, then the fine-tuned weights.
# `map_location` lets a checkpoint that was saved on a GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('bert_model.pt', map_location=device))
model.to(device)
model.eval()

# Tokenizer that produces the word-piece ids BERT expects.
# NOTE(review): assumes the checkpoint was fine-tuned with the
# 'bert-base-uncased' vocabulary; confirm against the training notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load training data (seeded so the split, and the fitted models, are reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df1.Text, df1.spam, test_size=0.25, random_state=42
)

# Initialize and fit CountVectorizer on training data.
# The matrix is kept sparse; the former discarded `toarray()` call only cost memory.
countvector = CountVectorizer()
x_train_ct = countvector.fit_transform(X_train.values)

# Pre-test spam
email_spam = ['reward click claim prize free money coupon discount bonus offer only for you 100 Earn back order now limited time offer 50 off today only']
email_spam_count = countvector.transform(email_spam)

# Classical models, all trained on the same word-count matrix
modelMB = MultinomialNB()
modelMB.fit(x_train_ct, y_train)

svm_model = SVC()
svm_model.fit(x_train_ct, y_train)

rfc = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
rfc.fit(x_train_ct, y_train)

# `predict` returns one label per input row; there is a single message here,
# so take element 0 and pass plain ints to the voting function.
naive_label = int(modelMB.predict(email_spam_count)[0])
svm_label = int(svm_model.predict(email_spam_count)[0])
random_forest_label = int(rfc.predict(email_spam_count)[0])

# Encode the raw text for BERT with its own tokenizer. Word counts from
# CountVectorizer are not token ids: feeding them in as `input_ids` made BERT
# read the counts as vocabulary indices and produced a meaningless prediction.
max_seq_length = 512
encoded = tokenizer(
    email_spam,
    padding=True,
    truncation=True,
    max_length=max_seq_length,
    return_tensors='pt',
)
input_ids = encoded['input_ids'].to(device)
attention_mask = encoded['attention_mask'].to(device)

# Make prediction with BERT (no gradients needed for inference)
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    bert_label = torch.argmax(outputs.logits, dim=1).item()

# Combine predictions using the voting system
final_label = voting_system(bert_label, naive_label, svm_label, random_forest_label)

print(final_label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1
