In [157]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score

import time

In [158]:
start_overall_time = time.time()

In [159]:
def get_data():
    file_name = './SpamDetectionData.txt'
    rawdata = open(file_name, 'r')
    lines = rawdata.readlines()
    lines = lines[1:] #get rid of "header"
    spam_train = lines[0:1000]
    ham_train = lines[1002:2002]
    test_mix = lines[2004:]
    return (spam_train, ham_train, test_mix)


In [160]:
spam_train, ham_train, test_mix = get_data()

precent = 0.0035 #1.0 #0.0035
count = int(len(ham_train) * precent)
train_data = spam_train + ham_train[0:count]

In [161]:
def create_dataframe(input_array):    
    spam_indcator = 'Spam,<p>'
    message_class = np.array([1 if spam_indcator in item else 0 for item in input_array])
    data = pd.DataFrame()
    data['class'] = message_class
    data['message'] = input_array
    return data

In [162]:
df_train = create_dataframe(train_data)
df_test = create_dataframe(test_mix)

In [163]:
words_to_remove = ['Ham,<p>', 'Spam,<p>', '<p>', '</p>', '\n']

In [164]:
def remove_words(input_line, key_words=words_to_remove):
    temp = input_line
    for word in key_words:
        temp = temp.replace(word, '')
    return temp

In [165]:
def remove_words_and_shuffle(input_dataframe, input_random_state=7):
    input_dataframe['message'] = input_dataframe['message'].apply(remove_words)
    messages, classes = shuffle(input_dataframe['message'], input_dataframe['class'], random_state=input_random_state)
    df_return = pd.DataFrame()
    df_return['class'] = classes
    df_return['message'] = messages
    return df_return 

In [166]:
df_train_cleaned = remove_words_and_shuffle(df_train)
df_test_cleaned = remove_words_and_shuffle(df_test)

In [167]:
X_train_raw = df_train_cleaned['message']
y_train = df_train_cleaned['class']

X_test_raw = df_test_cleaned['message']
y_test = df_test_cleaned['class']

In [168]:
def output_accuracy(actual_y, predicted_y, model_name, train_time, predict_time):
    print('Model Name: ' + model_name)
    print('Train time: ', round(train_time, 2))
    print('Predict time: ', round(predict_time, 2))
    print('Model Accuracy: {:.4f}'.format(accuracy_score(actual_y, predicted_y)))
    print('Model Precision: {:.4f}'.format(precision_score(actual_y, predicted_y)))
    print('')
    print(classification_report(actual_y, predicted_y, digits=4))
    print("======================================================")

In [169]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):

    return_trained_models = {}
    
    #return_vectorizer = FeatureUnion([('count_vect', CountVectorizer()), ('tfidf_vect', TfidfVectorizer())])
    return_vectorizer = FeatureUnion([('tfidf_vect', TfidfVectorizer())])
    
    X_train = return_vectorizer.fit_transform(X_train_input_raw)
    X_test = return_vectorizer.transform(X_test_input_raw)
    
    for key in models_dict:
        model_name = key
        model = models_dict[key]
        t1 = time.time()
        model.fit(X_train, y_train_input)
        t2 = time.time()
        predicted_y = model.predict(X_test)
        t3 = time.time()
        
        output_accuracy(y_test_input, predicted_y, model_name, t2 - t1, t3 - t2)        
        return_trained_models[model_name] = model
        
    return (return_trained_models, return_vectorizer)

In [170]:
def create_models():
    models = {}
    models['LinearSVC'] = LinearSVC()
    models['LogisticRegression'] = LogisticRegression()
    models['RandomForestClassifier'] = RandomForestClassifier()
    models['DecisionTreeClassifier'] = DecisionTreeClassifier()
    return models

In [171]:
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)

Model Name: LinearSVC
Train time:  0.01
Predict time:  0.0
Model Accuracy: 1.0000
Model Precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: LogisticRegression
Train time:  0.01
Predict time:  0.0
Model Accuracy: 0.4300
Model Precision: 0.4300

             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000        57
          1     0.4300    1.0000    0.6014        43

avg / total     0.1849    0.4300    0.2586       100

Model Name: DecisionTreeClassifier
Train time:  0.02
Predict time:  0.0
Model Accuracy: 0.9800
Model Precision: 0.9556

             precision    recall  f1-score   support

          0     1.0000    0.9649    0.9821        57
          1     0.9556    1.0000    0.9773        43

avg / total     0.9809    0.9800    0.9800       100

Model Name: 

  'precision', 'predicted', average, warn_for)


In [172]:
stop_overall_time = time.time()

In [173]:
print("Overall time: ", round(stop_overall_time - start_overall_time, 2))

Overall time:  0.62


In [174]:
row = df_test[1:2]
test_msg = row['message']
test_class = row['class']

In [175]:
test_msg

1    Again on quaff nothing. It explore stood usby ...
Name: message, dtype: object

In [176]:
#test_msg = ['this is my test message']
transformed_test_msg = fitted_vectorizer.transform(test_msg)
prediction = trained_models['DecisionTreeClassifier'].predict(transformed_test_msg)

In [177]:
prediction

array([0])

In [178]:
test_class

1    0
Name: class, dtype: int64

In [179]:
# import out grid search module
from sklearn.model_selection import GridSearchCV


def get_best_model_and_accuracy(model, params, X, y):
    grid = GridSearchCV(model, # the model to grid search
                        params, # the parameter set to try 
                        error_score=0.) # if a parameter set raises an error, continue and set the performance as a big, fat 0
    grid.fit(X, y) # fit the model and parameters
    # our classical metric for performance
    print("Best Accuracy: {}".format(grid.best_score_))
    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

In [180]:
j = [0, 1,2,3,4,5]
j[:3]

[0, 1, 2]

In [181]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        #index.append(filename)

    return DataFrame(rows) #, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('./emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('./emails/ham', 'ham'))


In [182]:
spam = data[data['class'] == 'spam']['message']
ham = data[data['class'] == 'ham']['message']

In [183]:
#spam_msg = spam['message']

In [184]:
xxx = ['this is my test','my test','hey there']
transformed_test_xxxx = fitted_vectorizer.transform(xxx)
prediction_xxx = trained_models['DecisionTreeClassifier'].predict(transformed_test_xxxx)

In [185]:
prediction_xxx

array([1, 1, 1])

In [186]:
#prediction_spam

In [187]:
transformed_test_ham = fitted_vectorizer.transform(ham)
prediction_ham = trained_models['DecisionTreeClassifier'].predict(transformed_test_ham)

In [188]:
len(prediction_ham)

2500

In [189]:
transformed_test_spam = fitted_vectorizer.transform(ham)
prediction_spam = trained_models['DecisionTreeClassifier'].predict(transformed_test_spam)

In [190]:
ham[0:5]

0        Date:        Wed, 21 Aug 2002 10:54:46 -05...
1    Martin A posted:\n\nTassos Papadopoulos, the G...
2    Man Threatens Explosion In Moscow \n\n\n\nThur...
3    Klez: The Virus That Won't Die\n\n \n\nAlready...
4    >  in adding cream to spaghetti carbonara, whi...
Name: message, dtype: object

In [191]:
ham_zeros = np.zeros(len(prediction_ham))

In [192]:
ham_zeros

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [193]:
spam_ones = np.ones(len(prediction_spam))

In [194]:
spam_ones

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [195]:
all_actual_results = []
all_actual_results.extend(ham_zeros)
all_actual_results.extend(spam_ones) 

all_predict_results = []
all_predict_results.extend(prediction_spam)
all_predict_results.extend(prediction_ham) 

In [196]:
len(all_actual_results)

5000

In [197]:
output_accuracy(all_actual_results, all_predict_results, 'my name', time.time() - time.time(), time.time() - time.time())

Model Name: my name
Train time:  0.0
Predict time:  0.0
Model Accuracy: 0.5000
Model Precision: 0.5000

             precision    recall  f1-score   support

        0.0     0.5000    0.0012    0.0024      2500
        1.0     0.5000    0.9988    0.6664      2500

avg / total     0.5000    0.5000    0.3344      5000



In [198]:
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)

Model Name: LinearSVC
Train time:  0.0
Predict time:  0.0
Model Accuracy: 1.0000
Model Precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: LogisticRegression
Train time:  0.01
Predict time:  0.0
Model Accuracy: 0.4300
Model Precision: 0.4300

             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000        57
          1     0.4300    1.0000    0.6014        43

avg / total     0.1849    0.4300    0.2586       100

Model Name: DecisionTreeClassifier
Train time:  0.02
Predict time:  0.0
Model Accuracy: 0.9200
Model Precision: 0.8431

             precision    recall  f1-score   support

          0     1.0000    0.8596    0.9245        57
          1     0.8431    1.0000    0.9149        43

avg / total     0.9325    0.9200    0.9204       100

Model Name: R

  'precision', 'predicted', average, warn_for)


In [202]:
samples = ['free viagra','this is my test message']
transformed_samples = fitted_vectorizer.transform(samples)
trained_models['DecisionTreeClassifier'].predict(transformed_samples)

array([1, 1])