In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [None]:
# Load the raw SMS dataset, decoding the file as latin-1.
df = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
df.head()

In [None]:
# Discard the three unnamed extra columns. `errors='ignore'` keeps this cell
# re-runnable: `df` is rebound here, so on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

In [None]:
# Give the two remaining columns descriptive names (mutates df in place).
df.rename({'v1': 'labels', 'v2': 'message'}, axis='columns', inplace=True)
df.head()

In [None]:
# Row and column counts at this point, before duplicates are removed.
df.shape

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence,
# then show the new row/column counts.
df.drop_duplicates(keep='first', inplace=True)
df.shape

In [None]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
df['label'] = df['labels'].map(
    {
        'ham': 0,
        'spam': 1,
    }
)
df.head()

In [None]:
# The text label is no longer needed once the numeric `label` column exists.
df.drop(columns=['labels'], inplace=True)
df.head()

In [None]:
import string

def preprocess_text(message):
    """
    Strip punctuation and English stopwords from a message.

    Steps:
    1. Remove every character that appears in ``string.punctuation``.
    2. Split the remaining text on whitespace.
    3. Drop each word whose lowercase form is in NLTK's English stopword list.

    Parameters
    ----------
    message : str
        The raw text to clean.

    Returns
    -------
    list of str
        The surviving words, in their original order and original casing.
    """
    # Fetch the stopword list once per call and keep it in a set. Evaluating
    # stopwords.words('english') inside the comprehension re-fetched the whole
    # list for every single word and then searched it linearly.
    stop_words = set(stopwords.words('english'))

    # Delete all punctuation characters in a single pass over the string.
    without_punc = message.translate(str.maketrans('', '', string.punctuation))

    # The stopword comparison is case-insensitive; the returned word keeps its case.
    return [word for word in without_punc.split() if word.lower() not in stop_words]

In [None]:
import re
import nltk
# Fetch the NLTK stopword corpus, then bring `stopwords` into scope for
# preprocess_text, which looks the name up when it is called.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Try the cleaner on the first five messages.
df['message'].iloc[:5].apply(preprocess_text)

In [None]:
from wordcloud import WordCloud

# Concatenate every spam message (label == 1) into one string and render it
# as a 512x512 word cloud on a black figure.
spam_words = ' '.join(df.loc[df['label'] == 1, 'message'])
spam_wc = WordCloud(width=512, height=512).generate(spam_words)

plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(spam_wc)
plt.show()

In [None]:
from wordcloud import WordCloud

# Concatenate every ham message (label == 0) into one string and render it
# as a 512x512 word cloud on a black figure.
ham_words = ' '.join(df.loc[df['label'] == 0, 'message'])
ham_wc = WordCloud(width=512, height=512).generate(ham_words)

plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(ham_wc)
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Target vector, and a bag-of-words count matrix built from the message text.
y = df['label']
cv = CountVectorizer()
x = cv.fit_transform(df['message'])

In [None]:
from sklearn.model_selection import train_test_split
# Split the raw messages first, then learn the vocabulary from the training
# rows only. `cv` was fitted on the whole corpus in the previous cell, which
# lets words that occur only in the test set shape the feature space;
# fit_transform below refits it from scratch on the training messages.
# Same test_size and random_state as before.
msg_train, msg_test, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.20, random_state=0
)
x_train = cv.fit_transform(msg_train)
x_test = cv.transform(msg_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Build a multinomial naive Bayes model, then fit it on the training split.
classifier = MultinomialNB()
classifier = classifier.fit(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Score the held-out split: per-class report, confusion matrix, accuracy.
# `end='\n\n'` supplies the blank separator line after each section.
pred = classifier.predict(x_test)
print(classification_report(y_test, pred), end='\n\n')
print('Confusion Matrix:\n', confusion_matrix(y_test, pred), end='\n\n')
print('Accuracy : ', accuracy_score(y_test, pred))

In [None]:
# print the predictions
# First line: predicted labels for the test split.
# Second line: the actual labels, for side-by-side comparison.
print(classifier.predict(x_test), y_test.values, sep='\n')

# Testing on Random Text

In [None]:
def sms(text):
    """
    Classify one or more SMS messages and print a verdict for each.

    Parameters
    ----------
    text : str or iterable of str
        A single message or a collection of messages. A bare string is
        wrapped in a list, since the vectorizer is given a list of documents.

    Returns
    -------
    None
        One line of the form ``This message is looking: <label>`` is printed
        per message, in input order. Nothing is printed for empty input.
    """
    # label names indexed by the predicted class
    lab = ['not spam', 'spam']

    # accept a lone string as well as a list of strings
    if isinstance(text, str):
        text = [text]
    text = list(text)
    if not text:
        return

    # vectorize with the notebook-level `cv` and predict with `classifier`;
    # the features are passed on as returned, with no dense .toarray() copy
    features = cv.transform(text)
    predictions = classifier.predict(features)

    # Report each prediction separately. Joining all predictions into one
    # integer (e.g. [1, 0] -> 10) produced a wrong label or an IndexError
    # whenever more than one message was supplied.
    for p in predictions:
        print("This message is looking: " + lab[int(p)])

In [None]:
# Try the classifier on a single hand-written message.
sms([
    'Congratulations, your entry into our contest last month made you a WINNER! goto our website to claim your price! You have 24 hours to claim.',
])