In [106]:
from src import util as util
import numpy as np
import matplotlib.pyplot as plt
import collections
import csv

In [107]:
def load_spam_dataset_csv(csv_path):

    messages = []
    labels = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for message, label in reader:
            messages.append(message)
            labels.append(1 if label == '1' else 0)

    return messages, np.array(labels)

In [108]:
train_messages, train_labels = util.load_spam_dataset('data/train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/test.tsv')
train2_messages, train2_labels = load_spam_dataset_csv('data/emails_new_train.csv')
val_messages, val_labels = util.load_spam_dataset('data/val.tsv')

In [109]:
def getwords(message):
    return message.lower().split()

In [110]:
def h(theta,x):
    return 1 / (1 + np.exp(-np.dot(x, theta)))

In [111]:
def Create_dictionary(messages):
    words = [word for message in messages for word in getwords(message)]
    words_count = collections.Counter(words)
    #word_dic = {(word, count) for word,count in words_count.items() if count>=5}
    freq_word = [word for word,count in words_count.items() if count>=10 and word !='subject:']
    return {word: count for count, word in enumerate(freq_word)}

In [112]:
dicWords = Create_dictionary(train_messages)

In [113]:
# All Import Statements Defined Here
# Note: Do not add to this list.
# ----------------

import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from platform import python_version
assert int(python_version().split(".")[1]) >= 5, "Please upgrade your Python version following the instructions in \
    the README.txt file found in the same directory as this notebook. Your Python version is " + python_version()

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

import nltk
nltk.download('reuters') #to specify download location, optionally add the argument: download_dir='/specify/desired/path/'
from nltk.corpus import reuters

import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)
# ----------------

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [12]:
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): sorted list of distinct words across the corpus
            n_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    n_corpus_words = -1
    
    ### SOLUTION BEGIN
#     for docs in corpus:
#         for word in docs:
#             corpus_words.append(word)
    corpus_words = sorted(list(set([word for docs in corpus for word in docs])))
    n_corpus_words = len(corpus_words)
    ### SOLUTION END

    return corpus_words, n_corpus_words

In [13]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
    
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.
              
              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".
    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): 
                Co-occurence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, n_words = distinct_words(corpus)
    M = None
    word2ind = {}
    
    ### SOLUTION BEGIN
    M = np.zeros(shape=(n_words, n_words))
    word2ind = {w:i for i, w in enumerate(words)}
    for doc in corpus:
        doc_size = len(doc)
        for i, word in enumerate(doc):
            center = word2ind[word]
            start = 0 if i <= window_size else i - window_size
            end = doc_size if doc_size - i <= window_size + 1 else i + window_size + 1
            context = [word for cont in (doc[start:i], doc[i+1:end]) for word in cont]
            for w in context:
                out = word2ind[w]
                M[center][out] += 1
    ### SOLUTION END

    return M, word2ind

In [14]:
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    
        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensinal word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """    
    n_iters = 10     # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    
    ### SOLUTION BEGIN
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    ### SOLUTION END

    print("Done.")
    return M_reduced

In [15]:
def load_embedding_model():
    """ Load GloVe Vectors
        Return:
            wv_from_bin: All 400000 embeddings, each lengh 200
    """
    import gensim.downloader as api
    wv_from_bin = api.load("glove-twitter-100")
    print("Loaded vocab size %i" % len(list(wv_from_bin.index_to_key)))
    return wv_from_bin

In [16]:
wv_from_bin = load_embedding_model()

Loaded vocab size 1193514


In [17]:
len(wv_from_bin)

1193514

In [29]:
def get_matrix_of_vectors(wv_from_bin,bodies):
    """ Put the GloVe vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 400000 GloVe vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 200) containing the vectors
            word2ind: dictionary mapping each word to its row number in M
    """
    all_body_embeddings = []
    word_count=[]
    for body in bodies:
        body_embeddings = []
        words = body.split()  # Split body into words
        
        for word in words:
            try:
                # Get word vector (e.g., length 5)
                word_vec = wv_from_bin.get_vector(word)
                body_embeddings.append(word_vec.tolist())  # Convert numpy array to list
            except KeyError:
                # Skip words not in vocabulary
                continue
        
        if body_embeddings:  # Only add if we found embeddings
            all_body_embeddings.append(body_embeddings)
            word_count = len(body_embeddings)
         # Pad sequences to make them uniform (for true 3D array)
    max_words = max(word_count) if word_count else 0
    embedding_dim = wv_from_bin.vector_size
    
    # Create padded 3D array
    M = np.zeros((len(all_body_embeddings), max_words, embedding_dim))
    for i, body_emb in enumerate(all_body_embeddings):
        M[i, :word_count[i]] = body_emb  # Fill available words
    
            
    return M,word_count

In [101]:
def get_matrix_of_vectors(wv_from_bin,bodies):
    """ Put the GloVe vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 400000 GloVe vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 200) containing the vectors
            word2ind: dictionary mapping each word to its row number in M
    """
    all_body_embeddings = []
    word_count=[]
    for body in bodies:
        body_embeddings = []
        words = body.split()  # Split body into words
        
        for word in words:
            try:
                # Get word vector (e.g., length 5)
                word_vec = wv_from_bin.get_vector(word)
                body_embeddings.append(word_vec.tolist())  # Convert numpy array to list
            except KeyError:
                body_embeddings.append(np.zeros(100))
                continue
        
        if body_embeddings:  # Only add if we found embeddings
            all_body_embeddings.append(body_embeddings)
            word_count = len(body_embeddings)
         # Pad sequences to make them uniform (for true 3D array)
    max_words = max(word_count) if word_count else 0
    embedding_dim = wv_from_bin.vector_size
    
    # Create padded 3D array
    M = np.zeros((len(all_body_embeddings), max_words, embedding_dim))
    for i, body_emb in enumerate(all_body_embeddings):
        M[i, :word_count[i]] = body_emb  # Fill available words
    
            
    return M,word_count

In [None]:
from sklearn.decomposition import PCA
# 1. Get the matrix and counts
M, word_counts = get_matrix_of_vectors(wv_from_bin, train_messages)

# 2. Compute average embeddings
embedding_dim = M.shape[2]
avg_embeddings = np.zeros((len(train_messages), embedding_dim))

for i, (total_words, valid_words) in enumerate(word_counts):
    if valid_words > 0:
        avg_embeddings[i] = np.mean(M[i][:valid_words], axis=0)

# 3. Now apply PCA
pca = PCA(n_components=20)
reduced_embeddings = pca.fit_transform(avg_embeddings)

In [100]:
M.shape

(4457, 171, 100)

In [99]:
#for large data set
# 1. Get the matrix and counts
M_2, word_counts_2 = get_matrix_of_vectors(wv_from_bin, train2_messages)

# 2. Compute average embeddings
embedding_dim_2 = M.shape[2]
avg_embeddings_2 = np.zeros((len(train2_messages), embedding_dim_2))

for i, (total_words, valid_words) in enumerate(word_counts):
    if valid_words > 0:
        avg_embeddings[i] = np.mean(M[i][:valid_words], axis=0)

# 3. Now apply PCA
pca2 = PCA(n_components=20)
reduced_embeddings_2 = pca2.fit_transform(avg_embeddings)

MemoryError: Unable to allocate 72.0 GiB for an array with shape (10907, 8862, 100) and data type float64

In [57]:
def Transform_text(messages, word_dictionary):
    m = len(messages)
    n = len(word_dictionary)
    matrix = np.zeros((m,n), dtype=int)

    word_counts = [collections.Counter(getwords(message)) for message in messages]
    for i in range(m):
        for word, count in word_counts[i].items():
            if word in word_dictionary:
                matrix[i][word_dictionary[word]] += count

    return matrix


In [58]:
train_matrix = Transform_text(train_messages,dicWords)

In [59]:
train_matrix.shape

(4457, 877)

In [85]:
X_with_bias = np.c_[np.ones(reduced_embeddings.shape[0]),reduced_embeddings]

In [86]:
class LogisticRegressionWithGD:
    def __init__(self,alpha=0.1,iteration=1000):
        self.alpha = alpha
        self.iteration=iteration
        self.theta=None
    def h(self,theta, x):
        return 1 / (1 + np.exp(-np.dot(x, theta)))
    def fit(self,x,y):
        lambda_ = 1
        n,m = x.shape
        X_with_bias = np.c_[np.ones(n),x]
        self.theta = np.zeros(m+1)
        #Gradient
        for i in range(self.iteration):
            #With regularization
            linearmodel = (np.dot(X_with_bias.T, (self.h(self.theta, X_with_bias)-y))/m)
            L1_reg = np.sign(self.theta)*lambda_

            L2_reg = lambda_*self.theta
            linearmodel += L2_reg
            self.theta = self.theta - self.alpha*linearmodel
        
        
    def predict(self,x):
        X_with_bias = np.c_[np.ones(x.shape[0]),x]
        y_predicted = 1/(1+np.exp(-X_with_bias.dot(self.theta)))
        y_result = [1 if i>0.5 else 0 for i in y_predicted]
        return y_result

In [87]:
LR = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR.fit(reduced_embeddings,train_labels)
LR_predictions = LR.predict(reduced_embeddings)

In [88]:
LR_accuracy = np.mean(LR_predictions == train_labels)

In [93]:
print('SimpLe Logistic Regression had an accuracy of {} on the testing set'.format(LR_accuracy))

SimpLe Logistic Regression had an accuracy of 0.8759255104330267 on the testing set


In [94]:
simpletesting=[
'buy on site this product now',
    'Get new phone free Click this URL',
"Don't forget to Subscribe our page in www.hh.com",
"Urgent your Account has been compromised. Verify your details now",
    "Making $5000 a week working from home. No Skills required",
"Your Package delivery failed. Click here to reschedule",
    "Act now this offer expires in 24 hours",
    "Get rich quick with this one simple trick",
    "Warning: your computer is as risk. Download this antivirus now "
]

In [95]:
testing1_matrix = Transform_text(simpletesting,dicWords)

In [96]:
testing1_matrix.shape

(9, 877)

use Large dataset

In [22]:
vocabulary = Create_dictionary(train2_messages)

In [23]:
train2_labels

array([0, 1, 1, ..., 0, 0, 1])

In [24]:
test2_matrix = Transform_text(test_messages,vocabulary)

In [25]:
train2_matrix = Transform_text(train2_messages,vocabulary)

In [26]:
LR2 = LogisticRegressionWithGD(alpha=0.01,iteration=1000)
LR2.fit(train2_matrix,train2_labels)

In [None]:
test2_matrix.shape

In [48]:
LR_predictions2 = LR2.predict(test2_matrix)
LR_accuracy2 = np.mean(LR_predictions2 == test_labels)

In [92]:
print('SimpLe Logistic Regression with large dataset had an accuracy of {} on the testing set'.format(LR_accuracy2))

NameError: name 'LR_accuracy2' is not defined

In [91]:
resulttesting2 = LR2.predict(testing2)
resulttesting2

NameError: name 'LR2' is not defined