# Logistic Regression Model for Spam Detection

In [167]:
import spacy
from spacy.lang.en import English
from scipy import sparse
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import os

### Defining Methods

In [168]:
def read_examples(filename, sep = ' '):
    """Read a labelled corpus stored as one '<label><sep><text>' example per line.

    Args:
        filename: Path of a UTF-8 encoded text file.
        sep: Delimiter between the label and the text. Only its first
            occurrence splits the line, so the text may contain it.
            Defaults to a single space.

    Returns:
        A pair (X, Y) of parallel lists in file order: X holds the texts,
        Y holds the labels.

    Raises:
        ValueError: If a non-blank line, after trailing whitespace is
            stripped, does not contain `sep` (a label with no text).
    """
    X = []
    Y = []
    with open(filename, mode = 'r', encoding = 'utf-8') as file:
        for line_no, line in enumerate(file, start = 1):
            stripped = line.rstrip()
            if not stripped:
                # Blank lines (typically a trailing empty line at the end of
                # the file) carry no example; skip them instead of failing
                # on the unpacking below.
                continue
            label, found, text = stripped.partition(sep)
            if not found:
                # Same exception type the bare unpacking used to raise, but
                # the message now points at the offending line.
                raise ValueError('%s:%d: expected "<label>%s<text>", got %r'
                                 % (filename, line_no, sep, stripped))
            X.append(text)
            Y.append(label)
    return X, Y

In [169]:
def read_examples2(filename, sep = '\t'):
    """Read a labelled corpus stored as one '<label><sep><text>' example per line.

    Identical in purpose to a space-delimited reader, but the default
    delimiter is a tab.

    Args:
        filename: Path of a UTF-8 encoded text file.
        sep: Delimiter between the label and the text. Only its first
            occurrence splits the line, so the text may contain it.
            Defaults to a tab character.

    Returns:
        A pair (X, Y) of parallel lists in file order: X holds the texts,
        Y holds the labels.

    Raises:
        ValueError: If a non-blank line, after trailing whitespace is
            stripped, does not contain `sep` (a label with no text).
    """
    X = []
    Y = []
    with open(filename, mode = 'r', encoding = 'utf-8') as file:
        for line_no, line in enumerate(file, start = 1):
            stripped = line.rstrip()
            if not stripped:
                # Blank lines (typically a trailing empty line at the end of
                # the file) carry no example; skip them instead of failing
                # on the unpacking below.
                continue
            label, found, text = stripped.partition(sep)
            if not found:
                # Same exception type the bare unpacking used to raise, but
                # the message now points at the offending line.
                raise ValueError('%s:%d: expected "<label>%s<text>", got %r'
                                 % (filename, line_no, sep, stripped))
            X.append(text)
            Y.append(label)
    return X, Y

In [170]:
# Build the English language object once at module level so every call to
# spacy_tokenizer reuses the same tokenizer instance.
spacy_nlp = English()

def spacy_tokenizer(text):
    """Tokenize `text` with spaCy's English tokenizer.

    Args:
        text: The raw string to split.

    Returns:
        A list with the `.text` of each token, in order.
    """
    return [tok.text for tok in spacy_nlp.tokenizer(text)]

In [171]:
def features_to_ids(examples, feature_vocab):
    """Convert feature dictionaries into a sparse example-by-feature matrix.

    Args:
        examples: Sequence of dicts mapping feature name -> value, one dict
            per example.
        feature_vocab: Mapping of feature name -> column index. Its length
            fixes the number of columns.

    Returns:
        A scipy `lil_matrix` of shape (len(examples), len(feature_vocab)).
        Row i holds the values of example i; features missing from
        `feature_vocab` are dropped.
    """
    shape = (len(examples), len(feature_vocab))
    matrix = sparse.lil_matrix(shape)

    for row, feature_values in enumerate(examples):
        for name, value in feature_values.items():
            if name not in feature_vocab:
                # Not part of the vocabulary, so there is no column for it.
                continue
            matrix[row, feature_vocab[name]] = value

    return matrix

In [172]:
def docs2features(trainX, feature_functions, tokenizer):
    """Turn raw documents into one feature dictionary per document.

    Each document is tokenized once; every function in `feature_functions`
    is then applied to the tokens and its result is folded into the
    document's dictionary through `add_features`.
    NOTE(review): `add_features` is not defined in the visible part of this
    notebook -- confirm where it comes from.

    Args:
        trainX: Iterable of raw documents.
        feature_functions: Callables that take the token list and return
            features for it.
        tokenizer: Callable that maps a document to its tokens.

    Returns:
        A list of feature dictionaries, in the same order as `trainX`.
    """
    feature_dicts = []

    for position, document in enumerate(trainX, start = 1):
        doc_tokens = tokenizer(document)

        merged = {}
        for extract in feature_functions:
            add_features(merged, extract(doc_tokens))
        feature_dicts.append(merged)

        # Progress line after every 100th document.
        if position % 100 == 0:
            print('Processed %d examples into features' % position)

    return feature_dicts

In [173]:
# Evaluation pipeline for the Logistic Regression classifier.

def train_and_test(trainX, trainY, devX, devY, feature_functions, tokenizer):
    """Fit a logistic-regression classifier and print its accuracy on held-out data.

    The vocabulary is built from the training features only, so features
    seen solely in the held-out documents are dropped when those documents
    are converted to a matrix.
    NOTE(review): `create_vocab` is not defined in the visible part of this
    notebook; it must return something `features_to_ids` can use as a
    feature -> column-index mapping -- confirm.

    Args:
        trainX: Raw training documents.
        trainY: Labels for `trainX`.
        devX: Raw held-out documents.
        devY: Labels for `devX`.
        feature_functions: Feature extractors passed on to `docs2features`.
        tokenizer: Tokenizer passed on to `docs2features`.

    Returns:
        None. The vocabulary size and the held-out accuracy are printed.
    """
    # Featurize the training split and derive the vocabulary from it.
    train_feature_dicts = docs2features(trainX, feature_functions, tokenizer)
    vocab = create_vocab(train_feature_dicts)
    print('Size: %d' % len(vocab))

    train_matrix = features_to_ids(train_feature_dicts, vocab)

    # L2-regularised logistic regression fitted with the lbfgs solver.
    classifier = LogisticRegression(
        penalty = 'l2',
        C = 1.0,
        solver = 'lbfgs',
        max_iter = 1000,
    )
    classifier.fit(train_matrix, trainY)

    # Featurize the held-out split against the training vocabulary.
    dev_feature_dicts = docs2features(devX, feature_functions, tokenizer)
    dev_matrix = features_to_ids(dev_feature_dicts, vocab)

    held_out_accuracy = classifier.score(dev_matrix, devY)
    print('Accuracy: %.3f' % held_out_accuracy)

In [174]:
def word_features(tokens):
    """Count token occurrences as bag-of-words features.

    Args:
        tokens: Iterable of tokens.

    Returns:
        A dict mapping 'WORD_<token>' to the number of times that token
        occurs in `tokens`.
    """
    counts = {}
    for token in tokens:
        key = 'WORD_%s' % token
        counts[key] = counts.get(key, 0) + 1
    return counts

### Model for Instagram Comments

In [175]:
# Load the Instagram comment corpus. Each split is read with read_examples,
# which splits every line on its first space into (label, text).
# NOTE(review): label_counts is not defined in the visible part of this
# notebook; the saved output suggests it returns a label -> count dict.
datapath = "Datasets/instagram/"

# Training split.
train_file = os.path.join(datapath, 'train.txt')
trainX, trainY = read_examples(train_file)
print("Training examples:", label_counts(trainY))

# Development split, used for evaluation by train_and_test.
dev_file = os.path.join(datapath, 'dev.txt')
devX, devY = read_examples(dev_file)
print("Development examples:", label_counts(devY))


# Test split. Loaded and counted here, but no visible cell evaluates on it.
test_file = os.path.join(datapath, 'test.txt')
testX, testY = read_examples(test_file)
print("Test examples:", label_counts(testY))

Training examples: {'spam': 120, 'ham': 124}
Development examples: {'spam': 12, 'ham': 11}
Test examples: {'spam': 11, 'ham': 11}


In [113]:
# Specify features to use: word-count features only.
features = [word_features]

# Evaluate LR model: train on the training split, print accuracy on the
# development split. trainX/trainY/devX/devY are notebook globals, so this
# uses whichever dataset cell was executed most recently.
train_and_test(trainX, trainY, devX, devY, features, spacy_tokenizer)

Processed 100 examples into features
Processed 200 examples into features
Size: 1302
Accuracy: 0.783


### Model for Emails

In [114]:
# Load the email corpus. Each split is read with read_examples2, which
# splits every line on its first tab into (label, text).
# This rebinds the trainX/trainY/devX/devY/testX/testY globals, replacing
# whatever dataset was loaded before.
# NOTE(review): label_counts is not defined in the visible part of this
# notebook; the saved output suggests it returns a label -> count dict.
datapath = "Datasets/email/"

# Training split.
train_file = os.path.join(datapath, 'train.txt')
trainX, trainY = read_examples2(train_file)
print("Training examples:", label_counts(trainY))

# Development split, used for evaluation by train_and_test.
dev_file = os.path.join(datapath, 'dev.txt')
devX, devY = read_examples2(dev_file)
print("Development examples:", label_counts(devY))


# Test split. Loaded and counted here, but no visible cell evaluates on it.
test_file = os.path.join(datapath, 'test.txt')
testX, testY = read_examples2(test_file)
print("Test examples:", label_counts(testY))

Training examples: {'ham': 2385, 'spam': 476}
Development examples: {'ham': 49, 'spam': 10}
Test examples: {'spam': 19, 'ham': 68}


In [115]:
# Specify features to use: word-count features only.
features = [word_features]

# Evaluate LR model: train on the training split, print accuracy on the
# development split. trainX/trainY/devX/devY are notebook globals, so this
# uses whichever dataset cell was executed most recently.
train_and_test(trainX, trainY, devX, devY, features, spacy_tokenizer)

Processed 100 examples into features
Processed 200 examples into features
Processed 300 examples into features
Processed 400 examples into features
Processed 500 examples into features
Processed 600 examples into features
Processed 700 examples into features
Processed 800 examples into features
Processed 900 examples into features
Processed 1000 examples into features
Processed 1100 examples into features
Processed 1200 examples into features
Processed 1300 examples into features
Processed 1400 examples into features
Processed 1500 examples into features
Processed 1600 examples into features
Processed 1700 examples into features
Processed 1800 examples into features
Processed 1900 examples into features
Processed 2000 examples into features
Processed 2100 examples into features
Processed 2200 examples into features
Processed 2300 examples into features
Processed 2400 examples into features
Processed 2500 examples into features
Processed 2600 examples into features
Processed 2700 exampl

### Model for SMS 

In [165]:
# Load the SMS corpus. Each split is read with read_examples2, which splits
# every line on its first tab into (label, text).
# This rebinds the trainX/trainY/devX/devY/testX/testY globals, replacing
# whatever dataset was loaded before.
# NOTE(review): label_counts is not defined in the visible part of this
# notebook; the saved output suggests it returns a label -> count dict.
datapath = "Datasets/sms/"

# Training split.
train_file = os.path.join(datapath, 'train.txt')
trainX, trainY = read_examples2(train_file)
print("Training examples:", label_counts(trainY))

# Development split, used for evaluation by train_and_test.
# NOTE(review): this reads 'dev2.txt', whereas the other datasets read
# 'dev.txt' -- confirm this is the intended file.
dev_file = os.path.join(datapath, 'dev2.txt')
devX, devY = read_examples2(dev_file)
print("Development examples:", label_counts(devY))

# Test split. Loaded and counted here, but no visible cell evaluates on it.
test_file = os.path.join(datapath, 'test.txt')
testX, testY = read_examples2(test_file)
print("Test examples:", label_counts(testY))

Training examples: {'ham': 3891, 'spam': 609}
Development examples: {'spam': 37, 'ham': 221}
Test examples: {'spam': 27, 'ham': 169}


In [166]:
# Specify features to use: word-count features only.
features = [word_features]

# Evaluate LR model: train on the training split, print accuracy on the
# development split. trainX/trainY/devX/devY are notebook globals, so this
# uses whichever dataset cell was executed most recently.
train_and_test(trainX, trainY, devX, devY, features, spacy_tokenizer)

Processed 100 examples into features
Processed 200 examples into features
Processed 300 examples into features
Processed 400 examples into features
Processed 500 examples into features
Processed 600 examples into features
Processed 700 examples into features
Processed 800 examples into features
Processed 900 examples into features
Processed 1000 examples into features
Processed 1100 examples into features
Processed 1200 examples into features
Processed 1300 examples into features
Processed 1400 examples into features
Processed 1500 examples into features
Processed 1600 examples into features
Processed 1700 examples into features
Processed 1800 examples into features
Processed 1900 examples into features
Processed 2000 examples into features
Processed 2100 examples into features
Processed 2200 examples into features
Processed 2300 examples into features
Processed 2400 examples into features
Processed 2500 examples into features
Processed 2600 examples into features
Processed 2700 exampl