In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score as acs
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings('ignore')

**SMS Spam Collection dataset**

In [None]:
# Read the SMS CSV (latin-1 encoded) and discard the three "Unnamed" columns,
# leaving exactly two columns which are then renamed to label / body_text.
data = (
    pd.read_csv("/kaggle/input/spam-ham-dataset/SMSSpamCollection1.csv", encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
)
data.columns = ["label", "body_text"]
data.head()

In [None]:
# Pie chart of how many messages carry each label; the second slice is pulled out.
label_counts = data['label'].value_counts()
label_counts.plot.pie(explode=[0, 0.15], figsize=(8, 8), autopct='%1.1f%%', shadow=True)
plt.xlabel("Spam vs Ham", size=15)
plt.ylabel(" ")
plt.legend(["ham", "spam"])
plt.show()

**Preprocessing**

In [None]:
#Extracting spam and ham words from messages
spam_messages = data[data["label"] == "spam"]["body_text"]
ham_messages = data[data["label"] == "ham"]["body_text"]

spam_words = []
ham_words = []

# Load the English stop-word list once instead of once per token, and keep it
# in a set for constant-time membership tests. It is reached through
# `nltk.corpus` rather than the bare name `stopwords`, so this cell keeps
# working even if `stopwords` has been rebound to something else.
_ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words("english"))


def _content_words(message):
    """Return the lower-cased, purely alphabetic, non-stop-word tokens of `message`."""
    lowered_tokens = (token.lower() for token in word_tokenize(message))
    return [
        token for token in lowered_tokens
        if token not in _ENGLISH_STOPWORDS and token.isalpha()
    ]


def extractSpamWords(spamMessages):
    """Append the content words of one spam message to the global `spam_words` list."""
    # In-place extend avoids copying the whole accumulated list on every call.
    spam_words.extend(_content_words(spamMessages))


def extractHamWords(hamMessages):
    """Append the content words of one ham message to the global `ham_words` list."""
    ham_words.extend(_content_words(hamMessages))


# Plain loops: the functions are called only for their side effect, so there is
# no result Series worth building (or echoing as the cell output).
for spam_message in spam_messages:
    extractSpamWords(spam_message)
for ham_message in ham_messages:
    extractHamWords(ham_message)

In [None]:
# English stop-word list and Porter stemmer used by clean_text() below.
# NOTE(review): this rebinds the name `stopwords` (bound by the import cell via
# `from nltk.corpus import stopwords`) to a plain Python list; after this line
# `stopwords.words(...)` is no longer reachable through that name.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer() #Stemming

def count_punct(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Punctuation means membership in `string.punctuation`. The ratio is rounded
    to three decimals before being scaled to a percentage. Text with no
    non-space characters (empty or all spaces) yields 0.0 instead of raising
    ZeroDivisionError.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        return 0.0
    punct_chars = sum(1 for char in text if char in string.punctuation)
    return round(punct_chars / non_space_chars, 3) * 100

# Per-message features: count of non-space characters, and punctuation percentage.
data['body_len'] = data['body_text'].map(lambda message: len(message.replace(" ", "")))
data['punct%'] = data['body_text'].map(count_punct)

def clean_text(text):
    """Turn a raw message into a list of stemmed, lower-cased tokens.

    Steps: strip every character found in `string.punctuation`, lower-case,
    split on runs of non-word characters, drop stop words, then stem each
    remaining token with the module-level stemmer `ps`.

    Returns a list of strings (possibly empty).
    """
    stripped = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', stripped)
    # re.split yields '' for leading/trailing separators (and for empty input);
    # skip those so the empty string never becomes a vocabulary term.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]
data.head()

# body_len is the number of characters in a message body, not counting spaces.
# punct% is the percentage of those non-space characters that are punctuation marks.

In [None]:
plt.figure(figsize=(20,12))


def _render_cloud(slot, words, background, heading):
    """Draw a word cloud of `words` in cell `slot` of the 2x2 grid and return it."""
    plt.subplot(2, 2, slot, facecolor='k')
    cloud = WordCloud(
        width=600,
        height=400,
        collocations=False,
        random_state=1,
        background_color=background,
    ).generate(" ".join(words))
    plt.imshow(cloud)
    plt.title(heading, size=20)
    plt.axis("off")
    return cloud


# Spam cloud goes in the second grid cell, ham cloud in the first.
spam_wordcloud = _render_cloud(2, spam_words, "red", "Spam Word Cloud")
ham_wordcloud = _render_cloud(1, ham_words, "green", "Ham Word Cloud")

plt.tight_layout()
plt.show()

In [None]:
# Convert both word collections to NumPy arrays, then show the ten most
# frequent entries of each (spam first, ham second).
spam_words = np.array(spam_words)
ham_words = np.array(ham_words)

for heading, words in (
    ("Top 10 Spam words are :\n", spam_words),
    ("Top 10 Ham words are :\n", ham_words),
):
    print(heading)
    word_frequencies = pd.Series(words).value_counts()
    display(word_frequencies.head(10))

In [None]:
# Raw character count of every message (spaces included).
data["messageLength"] = data["body_text"].apply(len)
f, ax = plt.subplots(1, 2, figsize = (20, 6))

# One panel per class: (label value, bar colour, x-axis caption).
panels = [
    ("spam", "red", "Spam Message Length"),
    ("ham", "green", "Ham Message Length"),
]
for axis, (label, colour, caption) in zip(ax, panels):
    # NOTE(review): distplot is deprecated in newer seaborn releases; consider
    # histplot(..., kde=True) once the minimum seaborn version is confirmed.
    sns.distplot(data.loc[data["label"] == label, "messageLength"],
                 bins = 20, ax = axis, color = colour)
    # The keyword must be lower-case `fontsize`: newer Matplotlib releases no
    # longer accept mixed-case property names such as `fontSize`.
    axis.set_xlabel(caption, fontsize=20)
    axis.grid()
plt.show()

In [None]:
# Model inputs (raw text plus the two numeric features) and the target label.
feature_columns = ['body_text', 'body_len', 'punct%']
X = data[feature_columns]
Y = data['label']

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

__Vectorize Text__

In [None]:
# Fit the TF-IDF vocabulary on the training messages only, then transform both
# splits with that same fitted vectorizer.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])


def _with_tfidf_columns(frame, tfidf_matrix):
    """Return body_len / punct% from `frame` side by side with the dense TF-IDF columns.

    The index of `frame` is reset so rows line up positionally with the rows of
    `tfidf_matrix`.
    """
    features = pd.concat(
        [frame[['body_len', 'punct%']].reset_index(drop=True),
         pd.DataFrame(tfidf_matrix.toarray())],
        axis=1,
    )
    # The TF-IDF columns are labelled 0..n-1 (ints) next to two string labels.
    # Newer scikit-learn refuses DataFrames with mixed str/int column names,
    # so make every label a string.
    features.columns = features.columns.astype(str)
    return features


X_train_vect = _with_tfidf_columns(X_train, tfidf_train)
X_test_vect = _with_tfidf_columns(X_test, tfidf_test)

X_train_vect.head()

__Testing the model__

In [None]:
# random_state pins the forest's randomness so repeated runs report the same
# metrics (same seed value as the train/test split).
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train_vect, y_train)

y_pred = rf_model.predict(X_test_vect)

# Metrics for the "spam" class. With average='binary' the fourth return value
# (support) is None; the name is kept only so existing references still resolve.
precision, recall, fscore, train_support = score(y_test, y_pred, pos_label="spam", average='binary')

# Scale to percent *before* rounding so the printed value has no float
# artefacts (e.g. 97.39999999999999).
accuracy_pct = round(acs(y_test, y_pred) * 100, 1)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore, 3), accuracy_pct) + "% \n")

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
class_label = ["ham", "spam"]
# Pass the label order explicitly so the matrix rows/columns are guaranteed to
# match the names used for the DataFrame below, and so the matrix is always
# 2x2 even if one class is missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=class_label)
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)
print("The confusion matrix is:\n {}".format(df_cm))

#Plot: rows are the true labels, columns the predicted labels.
plt.figure(figsize=(10,7))
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted Label",fontsize=15)
plt.ylabel("True Label",fontsize=15)
plt.show()