In [None]:
# Load packages
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.utils import shuffle
# from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import pandas as pd

from nltk.stem.porter import PorterStemmer
import string
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation/future warnings raised by the libraries used
# below — consider narrowing the filter.
warnings.filterwarnings("ignore")

import nltk
# Fetch the NLTK 'stopwords' corpus, which text_preprocess reads through
# stopwords.words('english').
nltk.download('stopwords')

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words; filled on first use and reused by every later call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_preprocess(mess):
    """Tokenise a message into lowercase alphabetic words without stop words.

    Steps: strip punctuation characters, lowercase, split on whitespace, then
    drop English stop words and any token that is not purely alphabetic.

    Parameters
    ----------
    mess : str
        Raw message text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell) is treated as an empty message.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Missing / non-text values carry no tokens.
    if not isinstance(mess, str):
        return []

    # Remove punctuation characters and normalise case.
    nopunc = mess.translate(_PUNCTUATION_TABLE).lower()

    # The stop-word collection is looked up once per call (and loaded only once
    # overall) instead of being re-read for every single word.
    stop_words = _english_stopwords()

    # Keep alphabetic tokens that are not stop words.
    return [word for word in nopunc.split()
            if word not in stop_words and word.isalpha()]

# Load the labelled messages; the file's two columns are renamed to label / message.
msgs_all = pd.read_csv('super23.csv',encoding='latin-1')
msgs_all.columns = ["label","message"]

# Recode labels 1 -> 1 and 0 -> -1. Any other label value becomes NaN.
msgs_all['label'] = msgs_all['label'].map({1:1,0:-1})

# Clean every message and join its tokens back into one space-separated string.
# This is done element-wise with apply: Series.agg with a callable only worked
# per element through a deprecated fallback, and would otherwise collapse the
# whole column into a single string.
msgs_all["message"] = msgs_all["message"].apply(
    lambda text: ' '.join(text_preprocess(text))
)
print (msgs_all.head(3))

In [None]:
import time

# Training pool: index labels 37816..65736 (inclusive), restricted to rows whose
# label equals 1. Other ranges noted by the author: 0:42627, 10658:53286, 0:52812
msgs_spam = msgs_all.loc[37816:65736, :]
msgs_spam = msgs_spam.loc[msgs_spam['label'].eq(1)]
print(msgs_spam.shape)

# Plain Python lists of the training texts and their labels.
train_text = msgs_spam['message'].to_list()
train_labels = msgs_spam['label'].to_list()

# The timed region covers building the estimators and fitting them.
start = time.time()

# TF-IDF features over single words (ngram_range=(1, 1)).
vectorizer = TfidfVectorizer(
    encoding="latin-1",
    strip_accents="unicode",
    ngram_range=(1, 1),
)

# Vectorizer and one-class SVM chained into a single estimator.
# Pipeline guide: https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
clf = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', OneClassSVM(kernel="rbf", nu=0.2, gamma=0.1)),
])

# Fit on the selected rows; the labels are passed through to fit() unchanged.
clf.fit(train_text, train_labels)

stop = time.time()
print(f"Trg time: {stop - start}s")

In [None]:
# Read the CSV again so that `msgs_all` holds the rows as stored in the file,
# with its columns renamed to label / message.
msgs_all = (
    pd.read_csv('super23.csv', encoding='latin-1')
    .set_axis(["label", "message"], axis=1)
)
msgs_all.head(3)

In [None]:
import time

# Evaluation rows: index labels 0..37815 (inclusive). Other ranges noted by the
# author: 42627:53286, 0:10657, 52812:53286
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of msgs_all.
msgs_test = msgs_all.loc[0:37815, :].copy()

# Clean each message and join its tokens back into one string. Done element-wise
# with apply; Series.agg with a callable only worked per element through a
# deprecated fallback.
msgs_test["message"] = msgs_test["message"].apply(
    lambda text: ' '.join(text_preprocess(text))
)

# The model predicts +1 / -1, so a 0 label has to be recoded to -1 before it is
# compared with the predictions; otherwise no 0-labelled row can ever count as
# correct. replace() leaves labels that are already 1 / -1 untouched.
msgs_test["label"] = msgs_test["label"].replace({0: -1})

test_text = msgs_test['message'].tolist()
test_labels = msgs_test['label'].tolist()

# Timestamp taken before prediction (the elapsed time is not reported here).
start = time.time()

# Score the evaluation rows with the fitted pipeline.
preds_test = clf.predict(test_text)

# Rows / columns are ordered [-1, 1].
results = confusion_matrix(test_labels, preds_test, labels=[-1, 1])
print('Confusion Matrix :', results)
print('Accuracy Score :',accuracy_score(test_labels, preds_test))