In [114]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Use the public package path: the private `sklearn.linear_model.logistic`
# module was deprecated in scikit-learn 0.22 and removed in 0.24, while
# `sklearn.linear_model.LogisticRegression` works on old and new versions.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score


In [115]:
def get_data(file_name='./SpamDetectionData.txt'):
    """Read the data file and split its lines into three raw line lists.

    The first line of the file is discarded as a header. Of the remaining
    lines, positions 0-999 become ``spam_train``, positions 1002-2001 become
    ``ham_train`` and everything from position 2004 onward becomes
    ``test_mix``. The two-line gaps between the sections are skipped
    (presumably separator/sub-header rows in the file -- verify against the
    data file).

    Args:
        file_name: Path of the text file to read. Defaults to the location
            the notebook originally hard-coded.

    Returns:
        tuple: ``(spam_train, ham_train, test_mix)``, each a list of the
        file's lines exactly as read (trailing newlines and label/markup
        text are left in place).
    """
    # The context manager guarantees the handle is closed, even if reading
    # raises; the original left the file open.
    with open(file_name, 'r') as rawdata:
        lines = rawdata.readlines()
    lines = lines[1:]  # get rid of "header"
    spam_train = lines[0:1000]
    ham_train = lines[1002:2002]
    test_mix = lines[2004:]
    return (spam_train, ham_train, test_mix)


In [116]:
# Load the three raw line lists (spam training, ham training, mixed test).
spam_train, ham_train, test_mix = get_data()


In [117]:
# Sanity check: number of raw lines that ended up in the test split.
len(test_mix)


100

In [118]:
# Concatenate the two training lists into one (spam lines first, then ham).
train_data = spam_train + ham_train


In [119]:
def create_dataframe(input_array):
    """Build a two-column frame (``class``, ``message``) from raw lines.

    A line is labelled 1 when it contains the text ``'Spam,<p>'`` anywhere
    in it and 0 otherwise. The ``message`` column holds the lines unchanged.

    Args:
        input_array: Iterable of raw text lines.

    Returns:
        pandas.DataFrame: Columns ``class`` (0/1 labels) and ``message``.
    """
    spam_marker = 'Spam,<p>'
    labels = np.array([int(spam_marker in line) for line in input_array])
    # Columns are attached one at a time to an empty frame, in this order,
    # so the resulting column order is ('class', 'message').
    frame = pd.DataFrame()
    frame['class'] = labels
    frame['message'] = input_array
    return frame

In [120]:
# Labelled training frame ('class', 'message') built from the raw training lines.
df_train = create_dataframe(train_data)


In [121]:
# Labelled test frame ('class', 'message') built from the raw test lines.
df_test = create_dataframe(test_mix)


In [122]:
# Substrings stripped from every message by remove_words(). They are removed
# in list order, so the label prefixes ('Ham,<p>', 'Spam,<p>') are handled
# before the bare '<p>' tag would otherwise break them apart.
words_to_remove = ['Ham,<p>', 'Spam,<p>', '<p>', '</p>', '\n']


In [123]:
def remove_words(input_line, key_words=words_to_remove):
    """Return ``input_line`` with every occurrence of each key word deleted.

    The entries of ``key_words`` are removed one after another, in the order
    given; each removal operates on the result of the previous one.

    Args:
        input_line: The string to clean.
        key_words: Substrings to delete. Defaults to the module-level
            ``words_to_remove`` list.

    Returns:
        str: The cleaned string.
    """
    cleaned = input_line
    for marker in key_words:
        cleaned = cleaned.replace(marker, '')
    return cleaned


In [124]:
def remove_words_and_shuffle(input_dataframe, random_state=7):
    """Clean the ``message`` column and return the rows in shuffled order.

    Every message is passed through ``remove_words``; the cleaned messages
    and the ``class`` labels are then shuffled together so each message
    keeps its label.

    Unlike the original implementation, the caller's frame is NOT modified:
    the cleaned messages are held in a separate Series instead of being
    written back into ``input_dataframe['message']``.

    Args:
        input_dataframe: Frame with a ``message`` column of strings and a
            ``class`` column of labels.
        random_state: Seed forwarded to ``sklearn.utils.shuffle``. Defaults
            to 7, the value that was previously hard-coded.

    Returns:
        pandas.DataFrame: New frame with columns ``class`` and ``message``.
        Its index is whatever ``shuffle`` returns on the Series; it is not
        reset here.
    """
    cleaned_messages = input_dataframe['message'].apply(remove_words)
    messages, classes = shuffle(
        cleaned_messages, input_dataframe['class'], random_state=random_state
    )
    df_return = pd.DataFrame()
    df_return['class'] = classes
    df_return['message'] = messages
    return df_return


In [125]:
# Strip label/markup text from the training messages and shuffle the rows.
# NOTE: this rebinds df_train to its own transformed version, so re-running
# the cell shuffles the already-shuffled frame again.
df_train = remove_words_and_shuffle(df_train)


In [126]:
# Same cleaning + shuffling for the test frame.
# NOTE: this rebinds df_test to its own transformed version, so re-running
# the cell shuffles the already-shuffled frame again.
df_test = remove_words_and_shuffle(df_test)


In [192]:
# Sanity check: the distinct label values present in the training frame.
df_train['class'].unique()

array([0, 1])

In [179]:
# Dump the training frame to a 2-D array: column 0 is 'class', column 1 is
# 'message'. Mixing ints and strings makes the array dtype object.
train_values_array = df_train.values 

In [197]:
# Split the array into message texts (column 1) and labels (column 0).
# NOTE(review): both names are reassigned from the DataFrame columns in a
# later cell (In [226]).
X_train_raw = train_values_array[:,1]
y_train = train_values_array[:,0]

In [198]:
# Dump the test frame to a 2-D array: column 0 is 'class', column 1 is 'message'.
test_values_array = df_test.values 

In [199]:
# Split the array into message texts (column 1) and labels (column 0).
# NOTE(review): both names are reassigned from the DataFrame columns in a
# later cell (In [227]).
X_test_raw = test_values_array[:,1]
y_test = test_values_array[:,0]

In [200]:
# Learn the TF-IDF vocabulary from the training messages and vectorize them.
# NOTE(review): the same two statements are executed again in the model
# fitting cell (In [229]), which replaces both objects.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

In [202]:
# Inspect the label array (object dtype here, since it was sliced out of
# the mixed-type DataFrame.values array).
y_train

array([0, 0, 1, ..., 1, 0, 1], dtype=object)

In [207]:
#X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_train['message'],df_train['class'])

In [226]:
# Take the training texts and labels straight from the DataFrame columns;
# this replaces the values previously sliced out of the .values array.
X_train_raw = df_train['message']
y_train = df_train['class']

In [216]:
#X_test_raw, X_x_raw, y_test, y_x_test = train_test_split(df_test['message'],df_test['class'])

In [227]:
# Take the test texts and labels straight from the DataFrame columns;
# this replaces the values previously sliced out of the .values array.
X_test_raw = df_test['message']
y_test = df_test['class']

In [229]:
# Fit the TF-IDF vocabulary on the training messages only and turn them
# into a document-term matrix.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

# Train a logistic-regression classifier with default hyperparameters on
# the vectorized training messages.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [230]:
# Vectorize the test messages with the already-fitted vectorizer
# (transform, not fit_transform, so the training vocabulary is reused),
# then predict a label for each test message.
X_test = vectorizer.transform(X_test_raw)
p = model.predict(X_test)

In [231]:
# vectorizer = TfidfVectorizer()
# X_train = vectorizer.fit_transform(X_train_raw)
# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)

# X_test = vectorizer.transform( ['URGENT! Your Mobile No 1234 was awarded a Prize', 'Hey honey, whats up?'] )
# predictions = classifier.predict(X_test)

In [232]:
#print(p)

In [233]:
# Score the predictions `p` against the true test labels: overall accuracy,
# precision, then a per-class precision/recall/F1 table.
print('Test accuracy: {:.4f}'.format(accuracy_score(y_test, p)))
print("Test precision: {:.4f}".format(precision_score(y_test, p)))
print("")
print(classification_report(y_test, p, digits=4))

Test accuracy: 1.0000
Test precision: 1.0000

             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000        57
          1     1.0000    1.0000    1.0000        43

avg / total     1.0000    1.0000    1.0000       100



In [242]:
# Slice a single row (87:88) out of the test frame and keep its 'message'
# column; the slice form yields a one-element Series rather than a bare
# string, which is what the vectorizer call in the next cell is given.
spam1_raw = df_test[87:88]['message']

In [243]:
# Vectorize the single selected message with the fitted TF-IDF vectorizer.
spam1 = vectorizer.transform(spam1_raw)

In [244]:
# Predict the label for the single selected message.
model.predict(spam1)

array([1])