In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score

from sklearn.pipeline import FeatureUnion


In [2]:
def get_data(file_name='./SpamDetectionData.txt'):
    """Read the spam-detection data file and split it into three lists of lines.

    The first line of the file is discarded as a header. Indexing the
    remaining lines from 0, the function returns:

    * lines 0-999 as the spam training set,
    * lines 1002-2001 as the ham training set,
    * lines 2004 onward as the mixed test set.

    The two lines skipped after each training section (1000-1001 and
    2002-2003) are not returned; presumably they are separator lines --
    confirm against the data file.

    Parameters
    ----------
    file_name : str, optional
        Path of the data file to read. Defaults to './SpamDetectionData.txt'.

    Returns
    -------
    tuple
        ``(spam_train, ham_train, test_mix)``, each a list of raw lines as
        produced by ``readlines()`` (line endings are kept).
    """
    # A context manager guarantees the file handle is closed, even if reading fails.
    with open(file_name, 'r') as rawdata:
        lines = rawdata.readlines()
    lines = lines[1:]  # get rid of "header"
    spam_train = lines[0:1000]
    ham_train = lines[1002:2002]
    test_mix = lines[2004:]
    return (spam_train, ham_train, test_mix)


In [3]:
# Load the three raw line lists: spam training, ham training and the mixed test set.
spam_train, ham_train, test_mix = get_data()


In [4]:
# 20% of the ham training set size, truncated to an integer.
# NOTE(review): `count` is not referenced by any other visible cell -- confirm it is still needed.
count = int(len(ham_train) * 0.2)

In [5]:
# Training pool: every spam line followed by only the first 4 ham lines.
# NOTE(review): the slice bound is the literal 4, not `count`; confirm whether
# ham_train[0:count] was intended or the heavy class imbalance is deliberate.
train_data = spam_train + ham_train[0:4]


In [6]:
def create_dataframe(input_array):
    """Build a DataFrame with 'class' and 'message' columns from raw lines.

    A line is labelled 1 when it contains the marker 'Spam,<p>' anywhere in
    its text and 0 otherwise. The line itself is stored unchanged in the
    'message' column.
    """
    spam_marker = 'Spam,<p>'
    labels = np.array([int(spam_marker in line) for line in input_array])
    return pd.DataFrame(
        {'class': labels, 'message': input_array},
        columns=['class', 'message'],
    )

In [7]:
# Label the training lines (1 = spam, 0 = otherwise) and put them in a DataFrame.
df_train = create_dataframe(train_data)


In [8]:
# Label the mixed test lines the same way as the training lines.
df_test = create_dataframe(test_mix)


In [9]:
# Substrings stripped from every message by remove_words: the class prefixes,
# the paragraph tags and the newline character.
words_to_remove = ['Ham,<p>', 'Spam,<p>', '<p>', '</p>', '\n']


In [10]:
def remove_words(input_line, key_words=words_to_remove):
    """Return input_line with every occurrence of each key word deleted.

    The key words are removed one after another, in list order, by plain
    substring replacement.
    """
    cleaned = input_line
    for marker in key_words:
        cleaned = cleaned.replace(marker, '')
    return cleaned


In [11]:
def remove_words_and_shuffle(input_dataframe):
    """Clean the 'message' column and return a shuffled copy of the frame.

    Side effect: the 'message' column of input_dataframe is overwritten in
    place with the output of remove_words, so the caller's frame is cleaned
    as well.

    The returned DataFrame holds the 'class' and 'message' columns in a
    shuffled row order (fixed by random_state=7) and keeps the original index
    labels of the rows.
    """
    # Clean in place first; the shuffle below reads the cleaned column.
    input_dataframe['message'] = input_dataframe['message'].apply(remove_words)

    shuffled_messages, shuffled_classes = shuffle(
        input_dataframe['message'],
        input_dataframe['class'],
        random_state=7,
    )

    # Pass the shuffled index explicitly so the row order is preserved as-is.
    return pd.DataFrame(
        {'class': shuffled_classes, 'message': shuffled_messages},
        index=shuffled_classes.index,
        columns=['class', 'message'],
    )


In [12]:
# Strip markup from the training messages and shuffle the rows.
# Note: this call also overwrites df_train['message'] with the cleaned text.
df_train_cleaned = remove_words_and_shuffle(df_train)


In [13]:
# Strip markup from the test messages and shuffle the rows.
# Note: this call also overwrites df_test['message'] with the cleaned text.
df_test_cleaned = remove_words_and_shuffle(df_test)


In [14]:
# Training inputs (cleaned message text, not yet vectorized) and their labels.
X_train_raw = df_train_cleaned['message']
y_train = df_train_cleaned['class']

In [15]:
# Test inputs (cleaned message text, not yet vectorized) and their labels.
X_test_raw = df_test_cleaned['message']
y_test = df_test_cleaned['class']

In [16]:
def output_accuracy(actual_y, predicted_y, model_name):
    """Print a short evaluation summary for one model.

    Writes the model name, the accuracy and precision scores (four decimal
    places), a blank line, the full classification report and a separator
    line to standard output. Returns nothing.
    """
    print("Model Name: " + model_name)

    # Each headline metric is computed and printed in turn.
    headline_metrics = (
        ("Test accuracy: {:.4f}", accuracy_score),
        ("Test precision: {:.4f}", precision_score),
    )
    for template, metric in headline_metrics:
        print(template.format(metric(actual_y, predicted_y)))

    print("")
    print(classification_report(actual_y, predicted_y, digits=4))
    print("=========================================================================")

In [17]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):
    """Vectorize the raw text, then fit and evaluate every model in models_dict.

    The text features are the union of a TfidfVectorizer and a
    CountVectorizer(min_df=.05, max_df=.8). The union is fitted on the
    training text only and then applied to the test text. Each model is
    fitted in place on the training features, used to predict the test
    features, and reported through output_accuracy.

    Returns a tuple (trained_models, vectorizer): a dict mapping each model
    name to its fitted model, and the fitted FeatureUnion.
    """
    return_vectorizer = FeatureUnion([
        ('tfidf_vect', TfidfVectorizer()),
        ('count_vect', CountVectorizer(min_df=.05, max_df=.8)),
    ])

    X_train = return_vectorizer.fit_transform(X_train_input_raw)
    X_test = return_vectorizer.transform(X_test_input_raw)

    return_trained_models = {}
    for model_name, model in models_dict.items():
        model.fit(X_train, y_train_input)
        output_accuracy(y_test_input, model.predict(X_test), model_name)
        return_trained_models[model_name] = model

    return (return_trained_models, return_vectorizer)

In [18]:
def create_models(random_state=7):
    """Create the unfitted classifiers compared in this notebook.

    Parameters
    ----------
    random_state : int or None, optional
        Seed passed to every estimator so that repeated runs give the same
        fitted models and the same printed scores. Pass None to get
        scikit-learn's unseeded behaviour.

    Returns
    -------
    dict
        Maps a display name to an unfitted estimator, in the order
        LinearSVC, LogisticRegression, RandomForestClassifier,
        DecisionTreeClassifier.
    """
    # Every estimator here accepts random_state; seeding all of them keeps the
    # comparison reproducible across kernel restarts.
    return {
        'LinearSVC': LinearSVC(random_state=random_state),
        'LogisticRegression': LogisticRegression(random_state=random_state),
        'RandomForestClassifier': RandomForestClassifier(random_state=random_state),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=random_state),
    }

In [19]:
# Build the classifiers, then fit and evaluate each one; keep the fitted
# models and the fitted vectorizer for the single-message check further down.
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)

Model Name: LinearSVC
Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: RandomForestClassifier
Test accuracy: 0.9600
Test precision: 0.9149

             precision    recall  f1-score   support

          0     1.0000    0.9298    0.9636        57
          1     0.9149    1.0000    0.9556        43

avg / total     0.9634    0.9600    0.9602       100

Model Name: LogisticRegression
Test accuracy: 0.9900
Test precision: 0.9773

             precision    recall  f1-score   support

          0     1.0000    0.9825    0.9912        57
          1     0.9773    1.0000    0.9885        43

avg / total     0.9902    0.9900    0.9900       100

Model Name: DecisionTreeClassifier
Test accuracy: 0.9300
Test precision: 0.8600

             precision    recall  f1-score   sup

In [20]:
# Take the row at position 1 of the unshuffled test frame as a one-row sample.
# df_test['message'] already holds cleaned text here, because
# remove_words_and_shuffle overwrote that column when df_test was passed to it.
row = df_test[1:2]
test_msg = row['message']
test_class = row['class']

In [21]:
# Show the sample message that will be classified.
test_msg

1    Again on quaff nothing. It explore stood usby ...
Name: message, dtype: object

In [22]:
# Disabled alternative: classify an ad-hoc message instead of the sampled row.
#test_msg = ['this is my test message']
# Reuse the vectorizer fitted during training, then classify with the decision tree.
transformed_test_msg = fitted_vectorizer.transform(test_msg)
prediction = trained_models['DecisionTreeClassifier'].predict(transformed_test_msg)

In [23]:
# Predicted class for the sample (1 = spam, 0 = ham).
prediction

array([0])

In [24]:
# Actual class of the sample, for comparison with the prediction.
test_class

1    0
Name: class, dtype: int64

In [25]:
# import our grid search module
from sklearn.model_selection import GridSearchCV


def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search `params` for `model` on (X, y) and print a short summary.

    Prints the best cross-validated score, the parameter combination that
    achieved it, and the fit and score times averaged over all parameter
    combinations. No `scoring` argument is passed to GridSearchCV, so the
    value labelled "Best Accuracy" is whatever score the search uses by
    default (presumably the estimator's own `score` -- confirm for
    non-classifier models).

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid handed to GridSearchCV.
    X, y : data the search is fitted on.
    """
    grid = GridSearchCV(model, # the model to grid search
                        params, # the parameter set to try 
                        error_score=0.) # if a parameter set raises an error, continue and set the performance as a big, fat 0
    grid.fit(X, y) # fit the model and parameters
    # our classical metric for performance
    print("Best Accuracy: {}".format(grid.best_score_))
    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))