In [30]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

# Import LogisticRegression from the public package path; the private
# `sklearn.linear_model.logistic` module is deprecated and no longer
# importable in recent scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score

import time

In [31]:
# Wall-clock timestamp taken before the data is loaded; it is subtracted from
# stop_overall_time when the overall run time is printed.
start_overall_time = time.time()

In [32]:
def get_data(file_name='./SpamDetectionData.txt'):
    """Read the spam-detection data file and split its lines into three lists.

    The first line of the file is discarded as a header. Of the remaining
    lines, ``[0:1000]`` are returned as the spam training lines,
    ``[1002:2002]`` as the ham training lines and ``[2004:]`` as the mixed
    test lines. The lines at positions 1000-1001 and 2002-2003 are skipped;
    presumably they are separators between the sections -- TODO confirm
    against the data file.

    Parameters
    ----------
    file_name : str, optional
        Path of the data file. Defaults to ``'./SpamDetectionData.txt'``,
        the path that was previously hard-coded.

    Returns
    -------
    tuple of (list of str, list of str, list of str)
        ``(spam_train, ham_train, test_mix)``. Each string is one raw line
        as returned by ``readlines()``, so trailing newlines are kept.
    """
    # The context manager closes the file even if reading raises.
    with open(file_name, 'r') as rawdata:
        lines = rawdata.readlines()
    lines = lines[1:]  # get rid of "header"
    spam_train = lines[0:1000]
    ham_train = lines[1002:2002]
    test_mix = lines[2004:]
    return (spam_train, ham_train, test_mix)


In [33]:
# Load the three raw line lists returned by get_data().
spam_train, ham_train, test_mix = get_data()

# Fraction of the ham training lines to keep; 1.0 keeps all of them
# (0.0035 is an alternative value left in the trailing comment).
precent = 1.0 #0.0035
count = int(len(ham_train) * precent)
# Training lines = every spam line followed by the first `count` ham lines.
train_data = spam_train + ham_train[0:count]

In [34]:
def create_dataframe(input_array):
    """Build a two-column DataFrame of labels and raw messages.

    Parameters
    ----------
    input_array : sequence of str
        Raw message lines.

    Returns
    -------
    pandas.DataFrame
        Column ``'class'`` is 1 where the line contains the substring
        ``'Spam,<p>'`` (anywhere in the line) and 0 otherwise; column
        ``'message'`` holds the unmodified input lines.
    """
    marker = 'Spam,<p>'
    # A substring test, not a prefix test: any line containing the marker counts as spam.
    labels = np.array([int(marker in text) for text in input_array])
    frame = pd.DataFrame()
    frame['class'] = labels
    frame['message'] = input_array
    return frame

In [35]:
# Build labelled frames ('class', 'message') for the training lines and the mixed test lines.
df_train = create_dataframe(train_data)
df_test = create_dataframe(test_mix)

In [36]:
# Preview the first five rows of the test frame.
df_test.head(5)

Unnamed: 0,class,message
0,0,"Ham,<p>Bust by this expressing at stepped and...."
1,0,"Ham,<p>Again on quaff nothing. It explore stoo..."
2,0,"Ham,<p>Tell floor perched. Doubting curious of..."
3,0,"Ham,<p>Angels nameless caught thrilled mefille..."
4,1,"Spam,<p>So his chaste my. Mote way fabled as o..."


In [37]:
# Substrings that remove_words() strips from a message by default: the
# 'Ham,<p>' / 'Spam,<p>' label prefixes, any remaining <p> / </p> tags, and
# newline characters. remove_words() applies the entries in list order, so the
# label prefixes are removed before the bare '<p>' entry can break them up.
words_to_remove = ['Ham,<p>', 'Spam,<p>', '<p>', '</p>', '\n']

In [38]:
def remove_words(input_line, key_words=words_to_remove):
    """Return ``input_line`` with every occurrence of each key word deleted.

    Parameters
    ----------
    input_line : str
        Text to clean.
    key_words : iterable of str, optional
        Substrings to delete, applied one after another in iteration order.
        Defaults to the module-level ``words_to_remove`` list.

    Returns
    -------
    str
        The cleaned text.
    """
    # Rebinding the parameter only changes the local name; the caller's string is untouched.
    for token in key_words:
        input_line = input_line.replace(token, '')
    return input_line

In [39]:
def remove_words_and_shuffle(input_dataframe, input_random_state=7):
    """Clean the 'message' column and return a shuffled copy of the frame.

    Side effect: the 'message' column of ``input_dataframe`` itself is
    overwritten with the cleaned text, so the caller's frame is modified.

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        Frame with 'message' and 'class' columns.
    input_random_state : int, optional
        Passed to ``shuffle`` as ``random_state``. Defaults to 7.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'class' and 'message' in shuffled row order.
    """
    cleaned_messages = input_dataframe['message'].apply(remove_words)
    # Written back onto the caller's frame (in-place side effect, see docstring).
    input_dataframe['message'] = cleaned_messages

    # Both columns are shuffled in one call, with the given random_state.
    shuffled_messages, shuffled_classes = shuffle(
        input_dataframe['message'],
        input_dataframe['class'],
        random_state=input_random_state,
    )

    shuffled_frame = pd.DataFrame()
    shuffled_frame['class'] = shuffled_classes
    shuffled_frame['message'] = shuffled_messages
    return shuffled_frame

In [40]:
# Clean and shuffle both frames. Note that remove_words_and_shuffle() also
# overwrites the 'message' column of df_train / df_test in place.
df_train_cleaned = remove_words_and_shuffle(df_train)
df_test_cleaned = remove_words_and_shuffle(df_test)

In [41]:
# Split the cleaned training frame into raw message text (X) and labels (y).
X_train_raw = df_train_cleaned['message']
y_train = df_train_cleaned['class']

# Same split for the cleaned test frame.
X_test_raw = df_test_cleaned['message']
y_test = df_test_cleaned['class']

In [42]:
def output_accuracy(actual_y, predicted_y, model_name, train_time, predict_time):
    """Print a timing and quality report for one model.

    Prints the model name, the train and predict times rounded to two
    decimals, the accuracy and precision formatted to four decimals, the
    full classification report, and a separator line. Returns None.

    Parameters
    ----------
    actual_y : array-like
        True labels.
    predicted_y : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the report header.
    train_time : float
        Value printed as the train time.
    predict_time : float
        Value printed as the predict time.
    """
    print('Model Name: ' + model_name)
    print('Train time: ', round(train_time, 2))
    print('Predict time: ', round(predict_time, 2))

    # Each metric is computed immediately before it is printed.
    accuracy = accuracy_score(actual_y, predicted_y)
    print('Model Accuracy: {:.4f}'.format(accuracy))
    precision = precision_score(actual_y, predicted_y)
    print('Model Precision: {:.4f}'.format(precision))

    print('')
    report = classification_report(actual_y, predicted_y, digits=4)
    print(report)
    print("=========================================================================")

In [43]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):
    """Vectorize the text, then fit, time and report every model in ``models_dict``.

    One FeatureUnion of a CountVectorizer and a TfidfVectorizer is fitted on
    the raw training text and used to transform both the training and the
    test text. Each model is fitted on the training features and used to
    predict the test features; its report is printed via ``output_accuracy``.

    Parameters
    ----------
    X_train_input_raw, X_test_input_raw : iterable of str
        Raw training / test messages.
    y_train_input, y_test_input : array-like
        Training / test labels.
    models_dict : dict
        Maps a model name to an unfitted estimator. The estimators are
        fitted in place.

    Returns
    -------
    tuple
        ``(trained_models, vectorizer)``: a dict mapping each name to its
        fitted estimator, and the fitted FeatureUnion.
    """
    feature_union = FeatureUnion([
        ('count_vect', CountVectorizer()),
        ('tfidf_vect', TfidfVectorizer()),
    ])

    # The union is fitted on the training text only; the test text is transformed with it.
    train_features = feature_union.fit_transform(X_train_input_raw)
    test_features = feature_union.transform(X_test_input_raw)

    fitted_models = {}
    for name, estimator in models_dict.items():
        fit_start = time.time()
        estimator.fit(train_features, y_train_input)
        fit_end = time.time()
        predictions = estimator.predict(test_features)
        predict_end = time.time()

        output_accuracy(y_test_input, predictions, name, fit_end - fit_start, predict_end - fit_end)
        fitted_models[name] = estimator

    return (fitted_models, feature_union)

In [44]:
def create_models():
    """Return a dict mapping a model name to a new, unfitted estimator.

    Every estimator is constructed with its default arguments. The keys are
    the estimator class names.
    """
    return {
        'LinearSVC': LinearSVC(),
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
    }

In [45]:
# Build the unfitted estimators, then fit and evaluate each of them. The fitted
# models and the fitted vectorizer are kept for the ad-hoc predictions in later cells.
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)

Model Name: LogisticRegression
Train time:  0.09
Predict time:  0.0
Model Accuracy: 1.0000
Model Precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: LinearSVC
Train time:  0.03
Predict time:  0.0
Model Accuracy: 1.0000
Model Precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: RandomForestClassifier
Train time:  0.06
Predict time:  0.0
Model Accuracy: 1.0000
Model Precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: 

In [46]:
# Wall-clock timestamp taken after all models have been trained and evaluated.
stop_overall_time = time.time()

In [47]:
# Elapsed seconds between the start and stop timestamps, rounded to two decimals.
print("Overall time: ", round(stop_overall_time - start_overall_time, 2))

Overall time:  2.01


In [48]:
# Take the row at position 1 of df_test as a one-row slice, so both columns
# stay pandas Series. df_test['message'] was already cleaned in place by the
# earlier remove_words_and_shuffle(df_test) call.
row = df_test[1:2]
test_msg = row['message']
test_class = row['class']

In [49]:
# Show the selected message.
test_msg

1    Again on quaff nothing. It explore stood usby ...
Name: message, dtype: object

In [50]:
# Vectorize the selected message with the already-fitted vectorizer and
# classify it with the fitted decision tree.
# Alternative hand-written input, left disabled:
#test_msg = ['this is my test message']
transformed_test_msg = fitted_vectorizer.transform(test_msg)
prediction = trained_models['DecisionTreeClassifier'].predict(transformed_test_msg)

In [51]:
# Predicted class for the selected message.
prediction

array([0])

In [52]:
# Actual label of the selected row, for comparison with the prediction.
test_class

1    0
Name: class, dtype: int64

In [53]:
# import our grid search module
from sklearn.model_selection import GridSearchCV


def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``params`` for ``model`` on ``(X, y)`` and print a summary.

    Prints the best cross-validated score (labelled "Best Accuracy"), the
    parameter set that achieved it, and the fit and score times averaged
    over all parameter sets, rounded to three decimals. No ``scoring``
    argument is passed, so the score is whatever the estimator's default
    scorer reports. Despite the name, nothing is returned.

    Parameters
    ----------
    model : estimator
        The model to grid search.
    params : dict or list of dict
        The parameter grid to try.
    X, y : array-like
        Features and targets passed to ``GridSearchCV.fit``.
    """
    # error_score=0. : a parameter set that raises during fitting is scored 0
    # instead of aborting the whole search.
    search = GridSearchCV(model, params, error_score=0.)
    search.fit(X, y)

    print("Best Accuracy: {}".format(search.best_score_))
    print("Best Parameters: {}".format(search.best_params_))

    # The timings are the mean over parameter sets of each set's mean time, in seconds.
    timings = search.cv_results_
    mean_fit_seconds = round(timings['mean_fit_time'].mean(), 3)
    print("Average Time to Fit (s): {}".format(mean_fit_seconds))
    # The score time indicates how quickly the model predicts on held-out data.
    mean_score_seconds = round(timings['mean_score_time'].mean(), 3)
    print("Average Time to Score (s): {}".format(mean_score_seconds))

In [54]:
# Scratch check of list slicing: j[:3] returns the first three elements.
j = [0, 1,2,3,4,5]
j[:3]

[0, 1, 2]

In [55]:
# Preview df_test again. Its 'message' column was overwritten in place by
# remove_words_and_shuffle(df_test), so the label prefixes and tags are gone here.
df_test.head(5)

Unnamed: 0,class,message
0,0,Bust by this expressing at stepped and. My my ...
1,0,Again on quaff nothing. It explore stood usby ...
2,0,Tell floor perched. Doubting curious of only b...
3,0,Angels nameless caught thrilled mefilled. Till...
4,1,So his chaste my. Mote way fabled as of aye fr...


In [68]:
# Show the first raw spam training line; the list returned by get_data() still
# holds the uncleaned text.
spam_train[0]

'Spam,<p>But could then once pomp to nor that glee glorious of deigned. The vexed times childe none native. To he vast now in to sore nor flow and most fabled. The few tis to loved vexed and all yet yea childe. Fulness consecrate of it before his a a a that.</p><p>Mirthful and and pangs wrong. Objects isle with partings ancient made was are. Childe and gild of all had to and ofttimes made soon from to long youth way condole sore.</p>\n'

In [70]:
# Classify two hand-written snippets with the fitted decision tree.
# The spam snippet is a phrase that occurs in spam_train[0]; the ham snippet is
# presumably copied from a ham sample -- its source is not shown in this notebook.
ham = 'door beguiling cushions did. Evermore from raven from is beak shall name'
spam = 'The vexed times childe none native'
test_messages = [spam, ham]
# Reuse the vectorizer fitted on the training text, then predict both snippets at once.
transformed_test_messages = fitted_vectorizer.transform(test_messages)
trained_models['DecisionTreeClassifier'].predict(transformed_test_messages)

array([1, 0])