In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement
The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

# Why SpaCy??
spaCy is an open-source natural language processing library for Python. It is designed particularly for production use, and it can help us to build applications that process massive volumes of text efficiently.

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import warnings
warnings.filterwarnings('ignore')

# Read data

In [None]:
# Read the CSV as latin-1, discard every column that contains a missing value,
# and give the two columns that are expected to remain readable names.
csv_path = '../input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(csv_path, encoding="latin-1").dropna(axis=1, how='any')
df.columns = ['label', 'message']
df.head()

In [None]:
# Bar chart of how many messages carry each label (ham vs. spam).
label_counts = df['label'].value_counts()
ax = label_counts.plot.bar(color=['blue', 'red'])
ax.set_title('Distribution of labels');

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics of the remaining columns, computed separately per label.
df.groupby('label').describe()

In [None]:
# Compare the message-length (character count) distributions of the two classes.
message_len = df['message'].str.len()
ham = message_len[df['label'] == 'ham']
spam = message_len[df['label'] == 'spam']

# One overlaid distribution per class, labelled for the legend.
for lengths, tag in ((ham, 'ham'), (spam, 'spam')):
    sns.distplot(lengths, label = tag)
plt.title('Distribution by Length')
plt.legend()

In [None]:
# Longest message (in characters) within each class.
longest_ham = ham.max()
longest_spam = spam.max()
print('max length of ham:', longest_ham)
print('max length of spam:', longest_spam)

# Clean the data: drop stop words and lemmatize the tokens

In [None]:
# Punctuation marks to filter out of the token stream.
# Note: string.punctuation is a single str of ASCII punctuation characters,
# not a list, so `x in punct` is a substring test.
punct = string.punctuation
punct

In [None]:
# Download the en_vectors_web_lg package and register it under the shortcut
# name 'en_vectors_web_lg_link', which spacy.load() uses in the next cell.
# NOTE(review): the `spacy link` command and the en_vectors_web_lg package
# look specific to spaCy 2.x -- confirm the environment's spaCy version.
!python -m spacy download en_vectors_web_lg
!python -m spacy link en_vectors_web_lg en_vectors_web_lg_link

In [None]:
# Load the spaCy pipeline registered under the 'en_vectors_web_lg_link' shortcut;
# text_data_cleaning runs every message through it.
nlp = spacy.load('en_vectors_web_lg_link')

# spaCy's built-in English stop words
# (the same object as spacy.lang.en.stop_words.STOP_WORDS).
stopwords = STOP_WORDS

In [None]:
def text_data_cleaning(sentence):
    """Tokenize a message and return its normalized, filtered tokens.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Each token is replaced by its lower-cased, whitespace-stripped lemma,
    except when the lemma is the "-PRON-" placeholder (the lemma spaCy 2.x
    assigns to pronouns), in which case the lower-cased surface form is kept.

    A normalized token is dropped when it is
      * empty (e.g. a whitespace-only token after stripping),
      * contained in the module-level ``stopwords`` collection, or
      * made up entirely of characters from ``punct``.

    Parameters
    ----------
    sentence : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. This function is
        passed to ``TfidfVectorizer`` as its ``tokenizer``.
    """
    # `punct` is a plain string, so `tok in punct` would be a substring test:
    # "..." or "!!" are not substrings of it and would slip through. Compare
    # character by character against a set instead.
    punct_chars = set(punct)

    cleaned_tokens = []
    for token in nlp(sentence):
        if token.lemma_ != "-PRON-":
            text = token.lemma_.lower().strip()
        else:
            text = token.lower_

        if not text or text in stopwords:
            continue
        if all(ch in punct_chars for ch in text):
            continue
        cleaned_tokens.append(text)

    return cleaned_tokens

# Vectorizing the Text

In [None]:
# TF-IDF features; tokenization is delegated to text_data_cleaning rather than
# the vectorizer's built-in tokenizer.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)

# Building Classifier

In [None]:
# Multinomial naive Bayes classifier; only instantiated here, fitted later
# as the final step of the pipeline.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB()

In [None]:
# Features are the raw message texts; the target is the ham/spam label.
X, y = df['message'], df['label']

In [None]:
# train test split: hold out 10% of the rows for evaluation;
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 42)

In [None]:
# Shapes of the training and test splits.
X_train.shape,X_test.shape

# Creating a Pipeline and Generating the Model using tfidf vectorization

In [None]:
# Chain the TF-IDF vectorizer and the naive Bayes classifier so that raw
# message strings can be passed straight to fit/predict.
pipeline_steps = [
    ('tfidf', tfidf),
    ('spam_detect_model', spam_detect_model),
]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the vectorizer and the classifier on the training messages.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test messages.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Evaluate on the held-out split: raw confusion matrix, overall accuracy as a
# percentage rounded to two decimals, then per-class precision/recall/F1.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score {}%".format(accuracy_pct))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion-matrix heatmap for the held-out split.
# Rows are the true labels and columns the predicted labels; passing the
# pipeline's class order to confusion_matrix keeps the tick labels in sync
# with the matrix layout.
class_labels = clf.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

# fmt='d' prints the integer counts as-is; the default annotation format
# would show counts of 100 or more in scientific notation.
ax = sns.heatmap(conf_mat, annot = True, fmt = 'd', cmap = 'Blues',
                 xticklabels = class_labels, yticklabels = class_labels)
ax.set(xlabel = 'Predicted label', ylabel = 'True label', title = 'Confusion matrix');

# Test Against New messages

In [None]:
def pred(msg):
    """Classify one message with the fitted pipeline and print the result.

    Parameters
    ----------
    msg : str
        Raw message text; it is wrapped in a one-element list before being
        handed to ``clf.predict``.

    Returns
    -------
    None
        The predicted label array is printed, not returned.
    """
    label = clf.predict([msg])
    print(label)

In [None]:
# Classify a sample message; pred prints the predicted label.
pred("Go until jurong point, crazy")

In [None]:
# Classify a second sample message; pred prints the predicted label.
pred("You Have a Refund Coming")