# SMS Spam Collection Data Set

**Classify SMS SPAM-HAM**

**Associated Tasks:** Classification, Clustering.

**Data Set Information:**
A collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site. The collection consists of just one text file, where each line has the correct class followed by the raw message. Grumbletext is a UK forum in which cell phone users make public claims about SMS spam messages.

**Label:**

**spam**: message is spam

**ham**: message is normal

In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import string
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# EDA

In [None]:
# Load the raw SMS Spam Collection CSV into a DataFrame.
SPAM_CSV_PATH = "../input/sms-spam-collection-dataset/spam.csv"

try:
    sms_data = pd.read_csv(SPAM_CSV_PATH)
except UnicodeDecodeError:
    # NOTE(review): this dataset is commonly distributed in a single-byte
    # encoding rather than UTF-8. Latin-1 can decode any byte sequence, so it
    # is used as the fallback when the default UTF-8 decoder fails.
    sms_data = pd.read_csv(SPAM_CSV_PATH, encoding="latin-1")

sms_data.head()

In [None]:
# Keep only the first two columns (the remaining ones are dropped as empty)
# and give them descriptive names: v1 -> class, v2 -> sms.
sms_data = sms_data.iloc[:, :2].rename(columns={"v1": "class", "v2": "sms"})
sms_data.head()

In [None]:
sms_data.shape  # (rows, columns)

In [None]:
# Number of messages per label, to show how the two classes are balanced.
sms_data['class'].value_counts()

> convert all messages to lower case

In [None]:
# Replace the message text with its lower-cased form, then preview 16 rows.
sms_data = sms_data.assign(sms=sms_data['sms'].str.lower())
sms_data.head(16)

# Data Visualization

In [None]:
# Bar chart of message counts per label (the dataset is imbalanced).
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='class', data=sms_data, ax=ax)
ax.set_title('Countplot Spam vs Ham SMS')
ax.set_xlabel('Spam vs Ham SMS')
ax.set_ylabel('Count')

In [None]:
# Separate views of the data for each label, used by the word clouds below.
sms_data_spam = sms_data.loc[sms_data['class'].eq("spam")]
sms_data_ham = sms_data.loc[sms_data['class'].eq("ham")]


In [None]:
# Display the rows labelled as spam.
sms_data_spam

# Visualizing Text Data Using a Word Cloud

In [None]:
# Word cloud of the words that occur in spam messages.
stopwords = set(STOPWORDS)

# Build the corpus from the full text of every spam message. str(Series)
# would only yield the truncated console repr (a few head/tail rows plus index
# numbers and the "Name/Length/dtype" footer), so the cloud would not reflect
# the actual messages.
spam_text = " ".join(sms_data_spam['sms'].astype(str))

wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    random_state=42,  # fixed seed so the layout is reproducible
).generate(spam_text)

fig = plt.figure(1)
plt.title("words in SPAM SMS")
plt.axis('off')
plt.imshow(wordcloud)
plt.show()


In [None]:
# Word cloud of the words that occur in normal (ham) messages.
stopwords = set(STOPWORDS)

# Build the corpus from the full text of every ham message. str(Series)
# would only yield the truncated console repr (a few head/tail rows plus index
# numbers and the "Name/Length/dtype" footer), so the cloud would not reflect
# the actual messages.
ham_text = " ".join(sms_data_ham['sms'].astype(str))

wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    random_state=42,  # fixed seed so the layout is reproducible
).generate(ham_text)

fig = plt.figure(1)
plt.title("words in Normal (HAM) SMS")
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# Spam-indicative keywords; one boolean column is added to sms_data per keyword.
keywords = ['click', 'offer', 'winner', 'buy', 'free',
            'cash', 'urgent', 'money', 'password']

for key in keywords:
    # Match the keyword as a whole word using regex word boundaries. Requiring
    # a literal space on both sides would miss the keyword at the start or end
    # of a message and next to punctuation (e.g. "free!" or "winner!!").
    sms_data[key] = sms_data.sms.str.contains(
        r'\b' + key + r'\b',
        case=False
    )

In [None]:
# Preview the last 10 rows, including the newly added keyword columns.
sms_data.tail(10)

In [None]:
# Correlation matrix of the numeric/boolean columns only. The frame still
# holds string columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so they are excluded explicitly.
sms_data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlations between the numeric/boolean columns.
# String columns are excluded first because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0.
# cmap="Reds" controls the colour scale of the seaborn heatmap.
sns.heatmap(sms_data.select_dtypes(include=['number', 'bool']).corr(), cmap="Reds")

# Build Training Model. SKLearn.

In [None]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep already-encoded labels unchanged,
# so re-running this cell does not turn every label into NaN.
sms_data['class'] = sms_data['class'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [None]:
# Model input (message text) and target (encoded label).
x, y = sms_data['sms'], sms_data['class']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

# Report the size of each split.
print("(X_train), (y_train) ", X_train.shape, y_train.shape)
print("(X_test), (y_test) ", X_test.shape, y_test.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training messages only, then apply the
# fitted vectorizer to the test messages so both share the same feature space.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
# Both shapes have the same number of columns (one per learned term).
print(train_vectors.shape, test_vectors.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB().fit(train_vectors, y_train)

# Score the classifier on the held-out split so the notebook reports how well
# it generalises (accuracy_score was imported but never used before).
print("Test accuracy:", accuracy_score(y_test, clf.predict(test_vectors)))

In [None]:
def classify_sms(text):
    """Print the predicted label for one or more SMS messages.

    Uses the notebook-level ``vectorizer`` and ``clf`` objects to turn the
    messages into features and predict a label for each of them.

    Parameters
    ----------
    text : str or iterable of str
        A single message, or an iterable (e.g. a list) of messages.

    Returns
    -------
    None
        One line is printed per message: "Not Spam (HAM)" when the predicted
        label is 0, "SPAM" otherwise.
    """
    # The vectorizer works on an iterable of documents, so wrap a bare string
    # in a list instead of handing it over directly.
    messages = [text] if isinstance(text, str) else text
    features = vectorizer.transform(messages)
    # Report each prediction separately; comparing the whole prediction array
    # to 0 in an `if` is ambiguous as soon as there is more than one message.
    for label in clf.predict(features):
        if label == 0:
            print("Not Spam (HAM)")
        else:
            print("SPAM")
        

In [None]:
# Call classify_sms with a list containing one SMS message.
# The pound sign was stored as mojibake ("آ£", UTF-8 bytes decoded as cp1256);
# it is restored to "£" here.
message1 = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]
classify_sms(message1)

In [None]:
# Second promotional example. The pound sign was stored as mojibake ("آ£",
# UTF-8 bytes decoded as cp1256); it is restored to "£" here.
message2 = ["FREE entry into our £250 weekly comp just send the word WIN to 80086 NOW. 18 T&C www.txttowin.co.uk"]
classify_sms(message2)

In [None]:
# An everyday, non-promotional message for comparison.
message3 = [
    "I see the letter B on my car",
]
classify_sms(message3)

In [None]:
# Please consider upvoting if you find it useful to you.
#
# Thanks.