In [17]:
from bs4 import BeautifulSoup
from collections import Counter
import os
import sys
import urlextract
import re
import nltk
import numpy as np

In [18]:
def remove_urls(text):
    url_extractor = urlextract.URLExtract()
    urls = []
    raw_urls = []
    try:
        raw_urls = url_extractor.find_urls(text)
    except:
        dummy = 3
    for url in raw_urls:
        urls.append(str(url))
    urls.sort(key=lambda url: len(url), reverse=True)
    for url in urls:
        text = text.replace(url, " URL ")
    return text

def remove_nums(text):
    text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text)
    return text

def remove_punctuation(text):
    text = re.sub(r'\W+', ' ', text, flags=re.M)
    return text

def stemm_words(text):
    stemmer = nltk.PorterStemmer()
    for word in text.split():
        changed = stemmer.stem(word)
        if(changed != word):
            text = text.replace(word, changed)
    return text

def create_vocabulary(data):
    vocabulary = []
    for idx, sample in enumerate(data):
        for word in sample.split():
            vocabulary.append(word)
    vocabulary = set(vocabulary)
    return vocabulary

def transform_to_vector(text, vocabulary):
    counter = Counter()
    vector = []
    
    for word in vocabulary:
        counter[word] = 0
    
    for word in text:
        counter[word] += 1

    for key, value in counter.items():
        vector.append(value)
        if(len(vector) == len(vocabulary)):
            break
    vector = np.asarray(vector)
    return vector

def samples_to_vectors(texts, vocabulary):
    vectors = []
    for text in texts:
        vectors.append(transform_to_vector(text, vocabulary))

    return vectors

def get_samples_from_file(path, limit = 9999999):
    data = []
    idx = 0
    for file in os.listdir(path):
        current = os.path.join(path, file)
        soap = BeautifulSoup(open(current, "rb"))
        text = soap.get_text()
        text = remove_urls(text)
        text = remove_nums(text)
        text = remove_punctuation(text)
        text = text.lower()
        text = stemm_words(text)
        data.append(text)
        idx = idx + 1
        if(idx >= limit):
            break 
    return data

In [27]:
spam_data = get_samples_from_file('rsc/spam')
eham_data = get_samples_from_file('rsc/easy_ham')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [28]:
spam_size = len(spam_data)
eham_size = len(eham_data)

all_data = spam_data.copy()
all_data.extend(eham_data)

print(len(all_data))
print(len(spam_data))
print(len(eham_data))

3052
501
2551


In [29]:
vocabulary = create_vocabulary(all_data)
print(len(vocabulary))

88395


In [30]:
X = samples_to_vectors(all_data, vocabulary)
y = np.array([0] * spam_size + [1] * eham_size)

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
score = cross_val_score(log_clf, np.asarray(X_train), y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................




[CV] ....................... , score=0.9201474201474201, total=  21.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.5s remaining:    0.0s


[CV] ....................... , score=0.9324324324324325, total=  22.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   43.7s remaining:    0.0s


[CV] ....................... , score=0.9384993849938499, total=  27.5s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


0.9303597458579008

In [33]:
from sklearn.metrics import precision_score, recall_score

log_clf.fit(np.asarray(X_train), y_train)

y_pred = log_clf.predict(np.asarray(X_test))

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))



Precision: 92.83%
Recall: 96.38%
