In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read the SMS spam CSV into a DataFrame, decoding the file as latin-1.
spam_data = pd.read_csv('../input/spam.csv', encoding='latin-1')
# Show the first rows of the raw frame as the cell output.
spam_data.head()

In [None]:
# Discard the three unnamed columns, then give 'v1' and 'v2' descriptive names.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
column_names = {'v1': 'target', 'v2': 'text'}
spam_data = (
    spam_data
    .drop(columns=unused_columns)
    .rename(columns=column_names)
)

spam_data.head()

### Pre-processing
Before training any model, let's do more data pre-processing. Specifically, I'm going to:

- Remove stop words
- Put all phrases in lower case
- Stem words

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [None]:
# English stop words from NLTK, stored as a set for the membership tests done during pre-processing.
stop_words = set(stopwords.words('english'))
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

In [None]:
# Build the 'parsed' column as a pipeline: every step after the first reads
# from 'parsed' (not from 'text'), so no earlier step is thrown away.
# 1. Lower-case first: the stop-word test below is an exact, case-sensitive
#    set lookup, so "The" would otherwise not match a lower-case stop word.
spam_data['parsed'] = spam_data['text'].str.lower()
# 2. Split each message into tokens with NLTK's tokenizer.
spam_data['parsed'] = spam_data['parsed'].apply(word_tokenize)
# 3. Drop tokens that are stop words.
spam_data['parsed'] = spam_data['parsed'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)
# 4. Reduce every remaining token to its stem.
spam_data['parsed'] = spam_data['parsed'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)
# 5. Join the tokens back into one space-separated string per message,
#    which is the input format the vectorizers expect.
spam_data['parsed'] = spam_data['parsed'].apply(' '.join)

In [None]:
# Preview the frame again, now that the 'parsed' column has been added.
spam_data.head()

### Data Exploration

In [None]:
# Number of messages per label, drawn as one bar per label.
class_counts = spam_data['target'].value_counts()
sns.barplot(x=class_counts.values, y=class_counts.index)
plt.title('Data Distribution')

In [None]:
# Character length of the parsed text, plotted separately for each class.
is_ham = spam_data['target'] == 'ham'
is_spam = spam_data['target'] == 'spam'

s1 = spam_data.loc[is_ham, 'parsed'].str.len()
sns.distplot(s1, label='Ham')
s2 = spam_data.loc[is_spam, 'parsed'].str.len()
sns.distplot(s2, label='Spam')
# Title typo fixed ('Lenght' -> 'Length').
plt.title('Length Distribution')
plt.legend()

We can see that spam messages are often longer than ham messages.

In [None]:
# Number of digit characters per message: strip every run of non-digits
# (\D+) and measure what is left.
# regex=True is explicit because recent pandas versions treat the pattern as
# a literal string by default, which would leave the text unchanged and make
# this count equal to the full message length.
s1 = spam_data[spam_data['target'] == 'ham']['parsed'].str.replace(r'\D+', '', regex=True).str.len()
sns.distplot(s1, label='Ham')
s2 = spam_data[spam_data['target'] == 'spam']['parsed'].str.replace(r'\D+', '', regex=True).str.len()
sns.distplot(s2, label='Spam')
plt.title('Digits Distribution')
plt.legend()

From this plot, it's clear that the digit-count distribution of ham messages is right-skewed, with a lower mean number of digits than spam messages.

In [None]:
# Number of non-word characters per message: strip every run of word
# characters (\w+ = letters, digits, underscore) and measure what is left
# (whitespace and punctuation).
# regex=True is explicit because recent pandas versions treat the pattern as
# a literal string by default, which would leave the text unchanged.
s1 = spam_data[spam_data['target'] == 'ham']['parsed'].str.replace(r'\w+', '', regex=True).str.len()
sns.distplot(s1, label='Ham')
s2 = spam_data[spam_data['target'] == 'spam']['parsed'].str.replace(r'\w+', '', regex=True).str.len()
sns.distplot(s2, label='Spam')
# The quantity plotted is the count of non-word characters, not non-digits.
plt.title('Non-Word Characters Distribution')
plt.legend()

These distributions resemble the ones for the text message length, although the values here are smaller. Ham messages contain fewer non-word characters (what remains after removing every `\w+` match) than spam messages.

In [None]:
# Summary statistics of the remaining columns, computed separately for each 'target' value.
spam_data.groupby('target').describe()

### Count Vectorizer vs. Tfidf

In [None]:
from sklearn.model_selection import train_test_split

# Split the parsed messages (features) and their labels into train and test
# parts. No test_size is given, so scikit-learn's default proportion is used;
# random_state=0 pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(spam_data['parsed'], 
                                                    spam_data['target'], 
                                                    random_state=0)

- Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only.
vect = CountVectorizer().fit(X_train)
# vocabulary_ maps each term to its column index, so its size is the
# vocabulary length. It is used instead of get_feature_names(), which was
# removed from recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))
print('Longest word:', max(vect.vocabulary_, key=len))

# Document-term count matrix for the training messages.
X_train_vectorized = vect.transform(X_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the training matrix; alpha=0.1 is passed as the smoothing parameter.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

In [None]:
# Feature names as a numpy array, ordered by column index.
# vocabulary_ maps term -> column index, so sorting the terms by that index
# reproduces the column order without get_feature_names(), which was removed
# from recent scikit-learn releases.
feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

# Per-feature log-probabilities for the second class in model.classes_
# (classes are sorted, so presumably 'spam' -- verify with model.classes_).
# For a two-class model this is the array that model.coef_[0] used to expose
# before coef_ was removed from MultinomialNB.
sorted_coef_index = model.feature_log_prob_[1].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
from sklearn.metrics import accuracy_score

# Vectorize the held-out messages with the fitted vocabulary, predict,
# and report the share of correct predictions as a percentage.
y_pred = model.predict(vect.transform(X_test))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (accuracy * 100))

- Tfidf

Let's ignore terms that have a document frequency strictly lower than 3.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=3 drops terms that occur in fewer than 3 training documents.
vect = TfidfVectorizer(min_df=3).fit(X_train)
# vocabulary_ maps each term to its column index, so its size is the
# vocabulary length. It is used instead of get_feature_names(), which was
# removed from recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))
print('Longest word:', max(vect.vocabulary_, key=len))

# Tf-idf weighted document-term matrix for the training messages.
X_train_vectorized = vect.transform(X_train)

In [None]:
# Refit a fresh Multinomial Naive Bayes model (alpha=0.1 as the smoothing parameter) on the tf-idf matrix.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

In [None]:
# Feature names ordered by column index, taken from the term -> index
# mapping (get_feature_names() was removed from recent scikit-learn releases).
feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

# Per-feature log-probabilities for the second class in model.classes_
# (presumably 'spam' -- verify with model.classes_). For a two-class model
# this is the array that the removed model.coef_[0] used to expose.
sorted_coef_index = model.feature_log_prob_[1].argsort()

# First 10 indices = smallest values; [:-11:-1] = 10 largest, largest first.
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
from sklearn.metrics import accuracy_score

# Score the tf-idf model on the held-out messages and print the accuracy as a percentage.
y_pred = model.predict(vect.transform(X_test))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (accuracy * 100))

### Feature Engineering


In the following, let's do some feature engineering to try to improve the performance of our model.

In [None]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    X is the existing matrix with one row per sample. feature_to_add is
    either a single feature (one value per row of X) or a list of such
    features. The result is a CSR matrix holding X followed by one new
    column per added feature.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix lays each feature out as a row; transposing turns every
    # feature into a column so it can be stacked to the right of X.
    extra_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, extra_columns], format='csr')

#### First Model

First, let's ignore terms that have a document frequency strictly lower than 5 (`min_df=5`). Using this document-term matrix and one additional feature, the length of the document (number of characters), we will test how our Tfidf performs.

In [None]:
# min_df=5 drops terms that occur in fewer than 5 training documents.
vect = TfidfVectorizer(min_df=5).fit(X_train)
# vocabulary_ maps each term to its column index, so its size is the
# vocabulary length. It is used instead of get_feature_names(), which was
# removed from recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

# Extra column: number of characters in each parsed message.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())

In [None]:
# Refit Multinomial Naive Bayes (alpha=0.1 as the smoothing parameter) on the tf-idf matrix extended with the length column.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

In [None]:
# Vocabulary terms in column order (term -> index mapping sorted by index),
# followed by the name of the extra column appended by add_feature.
# get_feature_names() is avoided because it was removed from recent
# scikit-learn releases.
vocabulary_terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
index = np.array(vocabulary_terms + ['length_of_doc'])
# Per-feature log-probabilities for the second class in model.classes_
# (presumably 'spam' -- verify with model.classes_). For a two-class model
# this is the array that the removed model.coef_[0] used to expose.
values = model.feature_log_prob_[1]
features_series = pd.Series(data=values, index=index)

print('Smallest Coefs:\n{}\n'.format(features_series.nsmallest(10).index.values.tolist()))
print('Largest Coefs: \n{}'.format(features_series.nlargest(10).index.values.tolist()))

In [None]:
# Build the test matrix exactly like the training one: tf-idf columns
# followed by the character-length column.
X_test_vectorized = add_feature(vect.transform(X_test), X_test.str.len())

y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (accuracy * 100))

#### Second Model

Now let's use a Tfidf ignoring terms that have a document frequency strictly lower than 5 and using word n-grams from n=1 to n=3 (unigrams, bigrams and trigrams).

We will also make use of the following additional features:
- the length of document (number of characters)
- number of digits per document

In [None]:
# Tf-idf over word unigrams, bigrams and trigrams; min_df=5 drops n-grams
# that occur in fewer than 5 training documents.
vect = TfidfVectorizer(min_df=5, ngram_range=(1, 3)).fit(X_train)
# vocabulary_ maps each n-gram to its column index, so its size is the
# vocabulary length. It is used instead of get_feature_names(), which was
# removed from recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

# Extra column 1: number of characters in each parsed message.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
# Extra column 2: number of digits (strip every non-digit run, then measure).
# regex=True is explicit because recent pandas versions treat the pattern as
# a literal string by default, which would make this equal to the full length.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.replace(r'\D+', '', regex=True).str.len())

In [None]:
# Refit Multinomial Naive Bayes (alpha=0.1 as the smoothing parameter) on the n-gram tf-idf matrix with the two extra columns.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

In [None]:
# Vocabulary n-grams in column order (term -> index mapping sorted by index),
# followed by the names of the two extra columns in the order they were
# appended. get_feature_names() is avoided because it was removed from
# recent scikit-learn releases.
vocabulary_terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
index = np.array(vocabulary_terms + ['length_of_doc', 'digit_count'])
# Per-feature log-probabilities for the second class in model.classes_
# (presumably 'spam' -- verify with model.classes_). For a two-class model
# this is the array that the removed model.coef_[0] used to expose.
values = model.feature_log_prob_[1]
features_series = pd.Series(data=values, index=index)

print('Smallest Coefs:\n{}\n'.format(features_series.nsmallest(10).index.values.tolist()))
print('Largest Coefs: \n{}'.format(features_series.nlargest(10).index.values.tolist()))

In [None]:
# Build the test matrix with the same columns, in the same order, as the
# training matrix: tf-idf n-grams, character length, digit count.
X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())
# regex=True is explicit because recent pandas versions treat the pattern as
# a literal string by default, which would make the digit count equal to the
# full message length.
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.replace(r'\D+', '', regex=True).str.len())

y_pred = model.predict(X_test_vectorized)
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

#### Third Model
Finally, let's use a Count Vectorizer ignoring terms that have a document frequency strictly lower than 5 and using character n-grams from n=2 to n=5.
To tell Count Vectorizer to use character n-grams we pass in analyzer='char_wb' which creates character n-grams only from text inside word boundaries. This should make the model more robust to spelling mistakes.

At this time we are going to use these additional features:
- the length of document (number of characters)
- number of digits per document
- number of non-word characters (anything other than a letter, digit or underscore.)

In [None]:
# Counts of character n-grams (2 to 5 characters) built inside word
# boundaries; min_df=5 drops n-grams that occur in fewer than 5 training
# documents.
vect = CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb').fit(X_train)
# vocabulary_ maps each n-gram to its column index, so its size is the
# vocabulary length. It is used instead of get_feature_names(), which was
# removed from recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

# Extra column 1: number of characters in each parsed message.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
# Extra columns 2 and 3: digit count (strip \D+) and non-word character
# count (strip \w+). regex=True is explicit because recent pandas versions
# treat the pattern as a literal string by default, which would turn both
# columns into copies of the full length.
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.replace(r'\D+', '', regex=True).str.len())
X_train_vectorized = add_feature(X_train_vectorized, X_train.str.replace(r'\w+', '', regex=True).str.len())

In [None]:
# Refit Multinomial Naive Bayes (alpha=0.1 as the smoothing parameter) on the character n-gram counts with the three extra columns.
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

In [None]:
# Character n-grams in column order (term -> index mapping sorted by index),
# followed by the names of the three extra columns in the order they were
# appended. get_feature_names() is avoided because it was removed from
# recent scikit-learn releases.
vocabulary_terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
index = np.array(vocabulary_terms + ['length_of_doc', 'digit_count', 'non_word_char_count'])
# Per-feature log-probabilities for the second class in model.classes_
# (presumably 'spam' -- verify with model.classes_). For a two-class model
# this is the array that the removed model.coef_[0] used to expose.
values = model.feature_log_prob_[1]
features_series = pd.Series(data=values, index=index)

print('Smallest Coefs:\n{}\n'.format(features_series.nsmallest(10).index.values.tolist()))
print('Largest Coefs: \n{}'.format(features_series.nlargest(10).index.values.tolist()))

In [None]:
# Build the test matrix with the same columns, in the same order, as the
# training matrix: character n-gram counts, character length, digit count,
# non-word character count.
X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())
# regex=True is explicit because recent pandas versions treat the pattern as
# a literal string by default, which would turn both derived columns into
# copies of the full message length.
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.replace(r'\D+', '', regex=True).str.len())
X_test_vectorized = add_feature(X_test_vectorized, X_test.str.replace(r'\w+', '', regex=True).str.len())

y_pred = model.predict(X_test_vectorized)
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))