In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score


In [2]:
def get_data(file_name='./SpamDetectionData.txt'):
    """Read the data file and split its lines into three sections.

    Layout assumed by the slices below (after the first line is dropped):
    lines 0-999 are returned as ``spam_train``, lines 1002-2001 as
    ``ham_train`` and everything from line 2004 on as ``test_mix``. The two
    lines skipped after each training section are presumably separators or
    section headers -- TODO confirm against the data file.

    Args:
        file_name: Path of the text file to read. Defaults to the path that
            used to be hard-coded, so a bare ``get_data()`` call is unchanged.

    Returns:
        Tuple ``(spam_train, ham_train, test_mix)`` of lists of raw lines
        (trailing newlines included).
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open.
    with open(file_name, 'r') as rawdata:
        lines = rawdata.readlines()
    lines = lines[1:]  # get rid of "header"
    spam_train = lines[0:1000]
    ham_train = lines[1002:2002]
    test_mix = lines[2004:]
    return (spam_train, ham_train, test_mix)


In [3]:
# Load the three sections of the data file: the two training lists and the mixed test list.
spam_train, ham_train, test_mix = get_data()


In [4]:
# Sanity check: number of lines in the test section.
len(test_mix)


100

In [5]:
# Concatenate both training lists (spam lines first, then ham lines).
# Labels are derived later from each line's own text, not from list position.
train_data = spam_train + ham_train


In [6]:
def create_dataframe(input_array):
    """Build a two-column frame (``class``, ``message``) from raw lines.

    A line is labelled 1 when it contains the marker ``'Spam,<p>'`` and 0
    otherwise. The ``message`` column holds the lines exactly as passed in.
    """
    spam_marker = 'Spam,<p>'
    labels = np.array([int(spam_marker in line) for line in input_array])
    # Start from the label column, then attach the untouched raw lines.
    frame = pd.DataFrame({'class': labels})
    return frame.assign(message=input_array)

In [7]:
# Label the training lines and put them in a DataFrame (columns: class, message).
df_train = create_dataframe(train_data)


In [8]:
# Same labelling for the test lines.
df_test = create_dataframe(test_mix)


In [9]:
# Substrings stripped from every message by remove_words. The label prefixes
# ('Ham,<p>', 'Spam,<p>') come before the bare '<p>' tag: remove_words applies
# the replacements in list order, so the prefixes must still be intact when
# their turn comes.
words_to_remove = ['Ham,<p>', 'Spam,<p>', '<p>', '</p>', '\n']


In [10]:
def remove_words(input_line, key_words=words_to_remove):
    """Return ``input_line`` with every occurrence of each ``key_words`` entry deleted.

    Entries are removed one after another in the order they appear in
    ``key_words``; each removal operates on the result of the previous one.
    """
    cleaned = input_line
    for fragment in key_words:
        cleaned = cleaned.replace(fragment, '')
    return cleaned


In [11]:
def remove_words_and_shuffle(input_dataframe):
    """Return a cleaned, row-shuffled copy of ``input_dataframe``.

    Each entry of the ``message`` column is passed through ``remove_words``,
    then messages and classes are shuffled together with a fixed seed
    (``random_state=7``) so the order is the same on every run.

    Args:
        input_dataframe: Frame with ``class`` and ``message`` columns.
            It is not modified.

    Returns:
        New DataFrame with columns ``class`` and ``message`` in shuffled row
        order; the index carries the original row labels.
    """
    # Clean into a new Series instead of assigning back into the argument,
    # so the caller's frame keeps its raw messages.
    cleaned_messages = input_dataframe['message'].apply(remove_words)
    messages, classes = shuffle(cleaned_messages, input_dataframe['class'], random_state=7)
    df_return = pd.DataFrame()
    df_return['class'] = classes
    df_return['message'] = messages
    return df_return


In [12]:
# Strip label/markup text from the training messages and shuffle the rows (seed fixed inside the helper).
df_train = remove_words_and_shuffle(df_train)


In [13]:
# Same cleaning and shuffling for the test frame.
df_test = remove_words_and_shuffle(df_test)


In [14]:
# Confirm that both labels (0 and 1) are present in the training frame.
df_train['class'].unique()

array([0, 1])

In [15]:
# 2-D NumPy view of the training frame: column 0 = class, column 1 = message.
train_values_array = df_train.values

In [16]:
# Column 1 holds the message text, column 0 the label.
# NOTE(review): both names are reassigned from the DataFrame columns in a later cell.
X_train_raw = train_values_array[:,1]
y_train = train_values_array[:,0]

In [17]:
# 2-D NumPy view of the test frame: column 0 = class, column 1 = message.
test_values_array = df_test.values

In [18]:
# Column 1 holds the message text, column 0 the label.
# NOTE(review): both names are reassigned from the DataFrame columns in a later cell.
X_test_raw = test_values_array[:,1]
y_test = test_values_array[:,0]

In [19]:
# Fit the TF-IDF vocabulary on the training messages and vectorize them.
# NOTE(review): the same two statements are repeated in a later cell, after
# X_train_raw has been reassigned.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

In [20]:
# Inspect the label array (object dtype at this point, as the output shows).
y_train

array([0, 0, 1, ..., 1, 0, 1], dtype=object)

In [21]:
#X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_train['message'],df_train['class'])

In [22]:
# Take features and labels straight from the DataFrame columns; this replaces
# the array-based X_train_raw / y_train assigned earlier.
X_train_raw = df_train['message']
y_train = df_train['class']

In [23]:
#X_test_raw, X_x_raw, y_test, y_x_test = train_test_split(df_test['message'],df_test['class'])

In [24]:
# Same for the test set: replaces the array-based X_test_raw / y_test assigned earlier.
X_test_raw = df_test['message']
y_test = df_test['class']

In [25]:
# Learn the TF-IDF vocabulary from the training messages only, then fit a
# logistic-regression classifier on the resulting matrix.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Vectorize the test messages with the vocabulary fitted on the training set
# (transform, not fit_transform), then predict their labels.
X_test = vectorizer.transform(X_test_raw)
p = model.predict(X_test)

In [27]:
# vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(X_train_raw)
# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)

# X_test = vectorizer.transform( ['URGENT! Your Mobile No 1234 was awarded a Prize', 'Hey honey, whats up?'] )
# predictions = classifier.predict(X_test)

In [28]:
#print(p)

In [29]:
# Score the logistic-regression predictions against the test labels.
# NOTE(review): a perfect score on the test set is worth double-checking
# (e.g. for an unusually easy dataset or leakage) before relying on it.
print('Test accuracy: {:.4f}'.format(accuracy_score(y_test, p)))
print("Test precision: {:.4f}".format(precision_score(y_test, p)))
print("")
print(classification_report(y_test, p, digits=4))

Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100



In [30]:
# Single-row Series holding the message at position 87 of the shuffled test
# frame (positional slice; a length-1 Series rather than a scalar so it can be
# passed to the vectorizer directly).
spam1_raw = df_test['message'].iloc[87:88]

In [31]:
# Vectorize the single sample with the already-fitted vocabulary.
spam1 = vectorizer.transform(spam1_raw)

In [32]:
# Predict the label of that one message with the logistic-regression model.
model.predict(spam1)

array([1])

In [33]:
# NOTE(review): gaunb is instantiated here, but the only call that would fit it
# is commented out in the next cell, so it is unused in the cells shown.
from sklearn.naive_bayes import GaussianNB
gaunb = GaussianNB()

In [34]:
#gaunb.fit(X_train.toarray(), y_train)  # GaussianNB needs a dense array; X_train from TfidfVectorizer is sparse

In [35]:
# Fit a linear SVM on the same TF-IDF training matrix used for the logistic model.
from sklearn.svm import LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
# Unused alternative: a Pipeline of CountVectorizer -> LinearSVC. Neither
# Pipeline nor CountVectorizer is imported in the cells shown.
#pipeline = Pipeline([('vectorizer', CountVectorizer()), ('classifier', LinearSVC())])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [36]:
#X_test = vectorizer.transform(['this is my test information'])  # transform() expects an iterable of documents, not a bare string

In [37]:
# Predict test labels with the linear SVM (X_test was vectorized in an earlier cell).
p_l_svc = linear_svc.predict(X_test)

In [38]:
# Display the raw SVM predictions (one 0/1 label per test message).
p_l_svc

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0])

In [39]:
# Score the linear-SVM predictions against the test labels.
print('Test accuracy: {:.4f}'.format(accuracy_score(y_test, p_l_svc)))
print("Test precision: {:.4f}".format(precision_score(y_test, p_l_svc)))
print("")
print(classification_report(y_test, p_l_svc, digits=4))


Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100



In [40]:
def output_accuracy(actual_y, predicted_y, model_name):
    """Print accuracy, precision and a classification report for one model.

    Args:
        actual_y: True labels.
        predicted_y: Labels predicted by the model being reported.
        model_name: Name printed in the header line.

    Returns:
        None. Everything is written to stdout.
    """
    print("Model Name: " + model_name)
    print("Test accuracy: {:.4f}".format(accuracy_score(actual_y, predicted_y)))
    print("Test precision: {:.4f}".format(precision_score(actual_y, predicted_y)))
    print("")
    # Report on the predictions passed in. The previous version used the
    # notebook-level variable p_l_svc here, so every model's report showed
    # the LinearSVC predictions.
    print(classification_report(actual_y, predicted_y, digits=4))
    print("=========================================================================")

In [41]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):
    """Fit each model in ``models_dict`` on TF-IDF features and print its test metrics.

    One TfidfVectorizer is fitted on the raw training text and reused to
    transform the raw test text. Each estimator is then fitted in place,
    asked to predict the test set, and reported through ``output_accuracy``
    under its dictionary key.
    """
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(X_train_input_raw)
    test_features = tfidf.transform(X_test_input_raw)

    for model_name, estimator in models_dict.items():
        estimator.fit(train_features, y_train_input)
        test_predictions = estimator.predict(test_features)
        output_accuracy(y_test_input, test_predictions, model_name)

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
def create_models(random_state=7):
    """Return a dict mapping a display name to a fresh, unfitted classifier.

    Args:
        random_state: Seed passed to every estimator so repeated runs give the
            same results. The default matches the seed used for shuffling
            elsewhere in this notebook. Pass ``None`` for the unseeded
            behavior.

    Returns:
        Dict with keys ``'LinearSVC'``, ``'LogisticRegression'`` and
        ``'RandomForestClassifier'``.
    """
    models = {}
    models['LinearSVC'] = LinearSVC(random_state=random_state)
    models['LogisticRegression'] = LogisticRegression(random_state=random_state)
    models['RandomForestClassifier'] = RandomForestClassifier(random_state=random_state)
    # GaussianNB is left out -- presumably because it needs dense input while
    # the TF-IDF features are sparse; confirm before re-enabling.
    #models['GaussianNB()'] = GaussianNB()
    return models

In [46]:
# Build the candidate classifiers and evaluate each one on the same
# train/test split used above.
models = create_models()
test_models(X_train_raw, y_train, X_test_raw, y_test, models)

Model Name: LogisticRegression
Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: LinearSVC
Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

Model Name: RandomForestClassifier
Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100

