In [1]:
from src import svm as svm
from src import util as util
import numpy as np
import matplotlib.pyplot as plt
import collections
import csv

In [2]:
def load_spam_dataset_csv(csv_path):
    """Load the spam dataset from a TSV file

    Args:
         csv_path: Path to TSV file containing dataset.

    Returns:
        messages: A list of string values containing the text of each message.
        labels: The binary labels (0 or 1) for each message. A 1 indicates spam.
    """

    messages = []
    labels = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for message, label in reader:
            messages.append(message)
            labels.append(1 if label == '1' else 0)

    return messages, np.array(labels)

In [3]:
train_messages, train_labels = util.load_spam_dataset('data/ds6_train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/ds6_test.tsv')
train2_messages, train2_labels = load_spam_dataset_csv('data/emails.csv')
val_messages, val_labels = util.load_spam_dataset('data/ds6_val.tsv')

In [4]:
def getwords(message):
    return message.lower().split()

In [5]:
def h(theta,x):
    return 1 / (1 + np.exp(-np.dot(x, theta)))

In [6]:
def Create_dictionary(messages):
    words = [word for message in messages for word in getwords(message)]
    words_count = collections.Counter(words)
    #word_dic = {(word, count) for word,count in words_count.items() if count>=5}
    freq_word = [word for word,count in words_count.items() if count>=10 and word !='subject:']
    return {word: count for count, word in enumerate(freq_word)}

In [7]:
dicWords = Create_dictionary(train_messages)

In [8]:
def Transform_text(messages, word_dictionary):
    m = len(messages)
    n = len(word_dictionary)
    matrix = np.zeros((m,n), dtype=int)

    word_counts = [collections.Counter(getwords(message)) for message in messages]
    for i in range(m):
        for word, count in word_counts[i].items():
            if word in word_dictionary:
                matrix[i][word_dictionary[word]] += count

    return matrix


In [9]:
train_matrix = Transform_text(train_messages,dicWords)

In [34]:
test_matrix = Transform_text(test_messages,dicWords)

In [35]:
train_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 877))

In [36]:
X_with_bias = np.c_[np.ones(train_matrix.shape[0]),train_matrix]

In [81]:
class LogisticRegressionWithGD:
    def __init__(self,alpha=0.1,iteration=1000):
        self.alpha = alpha
        self.iteration=iteration
        self.theta=None
    def h(self,theta, x):
        return 1 / (1 + np.exp(-np.dot(x, theta)))
    def fit(self,x,y):
        n,m = x.shape
        X_with_bias = np.c_[np.ones(n),x]
        self.theta = np.zeros(m+1)
        #Gradient
        for i in range(self.iteration):
            linearmodel = np.dot(X_with_bias.T, (self.h(self.theta, X_with_bias)-y))/m
            self.theta = self.theta - self.alpha*linearmodel
        
        
    def predict(self,x):
        X_with_bias = np.c_[np.ones(x.shape[0]),x]
        y_predicted = 1/(1+np.exp(-X_with_bias.dot(self.theta)))
        y_result = [1 if i>0.5 else 0 for i in y_predicted]
        return y_result

In [142]:
LR = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR.fit(train_matrix,train_labels)
LR_predictions = LR.predict(train_matrix)

In [143]:
LR_accuracy = np.mean(LR_predictions == train_labels)

In [144]:
print('SimpLe Logistic Regression had an accuracy of {} on the testing set'.format(LR_accuracy))

SimpLe Logistic Regression had an accuracy of 0.949293246578416 on the testing set


In [152]:
simpletesting=[
'this product is good check this url'
]

In [153]:
testing1_matrix = Transform_text(simpletesting,dicWords)

In [154]:

LR_predictions2 = LR.predict(testing1_matrix)
LR_predictions2

[0]

use Large dataset

In [155]:
vocabulary = Create_dictionary(train2_messages)

In [156]:
train2_matrix = Transform_text(train2_messages,vocabulary)

In [160]:
LR2 = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR2.fit(train2_matrix,train2_labels)

In [161]:
LR_predictions2 = LR.predict(train2_matrix)
LR_accuracy2 = np.mean(LR_predictions == train2_labels)

In [162]:
print('SimpLe Logistic Regression with large dataset had an accuracy of {} on the testing set'.format(LR_accuracy2))

SimpLe Logistic Regression with large dataset had an accuracy of 0.9712343096234309 on the testing set


In [187]:
simpletesting2=[
'Claim your free money'
]


In [188]:
testing2 = Transform_text(simpletesting2,vocabulary)

In [189]:
resulttesting2 = LR.predict(testing2)
resulttesting2

[1]