Problem statement

Detect whether a text message is spam or ham.

The SMS Spam Collection is a set of tagged SMS messages collected for SMS spam research. It contains 5,574 English SMS messages, each tagged as either ham (legitimate) or spam.

Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

import os

# Walk the Kaggle input directory and print the full path of every file found.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

Load file

In [None]:
# Read the SMS spam CSV into a DataFrame, decoding the file as ISO-8859-1.
df = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding="ISO-8859-1",
)
df

Drop columns

In [None]:
# Remove the three unnamed columns, keeping the remaining ones.
# The result is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run after the columns are already gone (the in-place version
# raised KeyError on a second run).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df

Check for null values

In [None]:
# Count missing values per column.
df.isna().sum()

Analyse target

In [None]:
# Number of messages for each value of the label column `v1`.
target_count = df.groupby('v1')['v1'].count()
target_count

In [None]:
# Share of each label as a percentage of all rows.
percent_target = target_count.div(len(df)).mul(100)
percent_target

In [None]:
# Bar chart of the number of messages per label, with the y-axis starting at 0.
label_counts = df.groupby('v1')['v1'].count()
label_counts.plot(kind='bar', ylim=0)
plt.show()

Map v1

In [None]:
# Encode the string labels as integers: ham -> 1, spam -> 0.
dic = {'ham':1 ,'spam':0}
# Series.map(dict) turns every value that is not a key of the dict into NaN,
# so re-running the cell (labels already 0/1) would wipe the column.
# Falling back to the current value keeps the cell safe to re-run.
df['v1'] = df['v1'].map(lambda label: dic.get(label, label))
df

Preprocess raw text and get ready for machine learning

In [None]:
# Add a `processedtext` column that starts out as a copy of the raw text in `v2`.
df = df.assign(processedtext=df['v2'])
df

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

stemmer = PorterStemmer()
words = stopwords.words("english")
# Set copy of the stop-word list for constant-time membership tests.
stopword_set = set(words)


def clean_text(text):
    """Return `text` reduced to lower-case, stemmed, non-stop-word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, stop words are dropped and
    the remaining tokens are Porter-stemmed and re-joined with single spaces.

    Lower-casing happens BEFORE the stop-word test. The previous version
    compared the original-case token against the stop-word list, so
    capitalised stop words such as "The" or "You" were never removed.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(tok) for tok in tokens if tok not in stopword_set)


# Always start from the raw text in `v2` so that re-running this cell does not
# stem already-stemmed text.
df['processedtext'] = df['v2'].apply(clean_text)

In [None]:
# Second cleaning pass over `processedtext`:
#   1. lower-case everything;
#   2. replace every character that is not an ASCII letter or '#' with a space.
#      regex=True is required: without it, pandas >= 2.0 treats the pattern as
#      a literal string and the replacement silently does nothing;
#   3. keep only words longer than 3 characters (words of 3 characters or
#      fewer are dropped).
df['processedtext'] = (
    df['processedtext']
    .str.lower()
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3))
)

In [None]:
from wordcloud import WordCloud

# Concatenate the processed text of every row into one string.
# Despite its name, `spam_words` is built from all rows, not only spam ones.
spam_words = ' '.join(df['processedtext'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(spam_words)

# Render the word cloud as an image without axis ticks or frame.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

Define X and y variables

In [None]:
# Features are the cleaned text; the target is the encoded label column.
X, y = df['processedtext'], df['v1']

Convert text to word frequency vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: English stop words are removed and terms that appear in
# more than 70% of the documents are ignored (max_df=0.7).
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/validation split below, so validation text influences the vocabulary
# and IDF weights. Consider fitting on the training portion only.
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
df_tfIdf = vectorizer_tfidf.fit_transform(X.values.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
    feature_names = vectorizer_tfidf.get_feature_names_out().tolist()
else:
    feature_names = vectorizer_tfidf.get_feature_names()
print(feature_names[:10])

Split X for training and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; shuffling is seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    df_tfIdf,
    y,
    test_size=0.10,
    random_state=1,
    shuffle=True,
)
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

Select model

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Build the classifier, fit it on the training split, then report the
# training-set score.
model = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, random_state=1)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
print(train_score)

Predict on validation set

In [None]:
# Predict labels for the validation split and print the validation score.
y_pred = model.predict(X_val)
val_score = model.score(X_val, y_val)
print(val_score)

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual labels, columns are the predicted labels.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)

In [None]:
# Side-by-side table of actual vs. predicted labels, indexed like `y_val`.
df_val = y_val.to_frame(name='Actual').assign(Predicted=y_pred)
df_val

Plot errors

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the validation TF-IDF matrix onto 2 components for plotting.
svd_val = TruncatedSVD(
    n_components=2,
    random_state=1,
)
principalComponents_val = svd_val.fit_transform(X_val)

In [None]:
# Scatter the 2-D SVD projection of the validation set, coloured by whether
# each message was misclassified.
#
# The previous colour expression `y_pred == y_val - 1` is parsed as
# `y_pred == (y_val - 1)`, which for 0/1 labels is true only when the actual
# label is 1 and the prediction is 0. Errors in the other direction were
# drawn in the same colour as correct predictions.
# Comparing plain arrays avoids any index alignment between the `y_val`
# Series and the `y_pred` array.
misclassified = np.asarray(y_pred) != np.asarray(y_val)

plt.figure(figsize = (12, 8))
plt.scatter(
    principalComponents_val[:, 0],
    principalComponents_val[:, 1],
    c = misclassified.astype(int),  # 0 = correct, 1 = misclassified
    alpha = .8,
    s = 50,
)
plt.title('Validation set: correct (0) vs. misclassified (1)')
plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.show()