## **In this practice we will use natural language processing to build a spam classifier**

## **We will make use of the sms spam classification data for the implementation**

**Data processing**
*   Import the required packages
*   Load the data into a pandas DataFrame
*   Remove the unwanted data columns
*   Build word clouds to see which words are common in spam and in non-spam messages.
*   Remove the stop words and punctuation
*   Convert the text data into vectors

**Building a classification model**
*   Split the data into train and test sets
*   Use Sklearn built in classifiers to build the models
*   Train the data on the model
*   Make predictions on new data













## **Import the required packages**

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import sklearn
import pickle
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedKFold,cross_val_score,learning_curve

## **Preprocessing and Exploring the Dataset**

In [None]:
# Load the raw SMS dataset; the file is decoded as latin-1 rather than the default encoding.
data = pd.read_csv('spam.csv', encoding='latin-1')
# Preview the first rows to see which columns the raw file provides.
data.head()

## **Removing unwanted columns**

In [None]:
# Drop the three unnamed columns and give the remaining two descriptive names.
# errors="ignore" (together with rename silently skipping absent keys) keeps
# this cell re-runnable: executing it again on the already-cleaned frame is a
# no-op instead of raising KeyError.
data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v2": "text", "v1": "label"})
)

In [None]:
# Inspect ten rows from the middle of the dataset (positions 1990-1999).
data.iloc[1990:2000]

In [None]:
# Number of messages per label, to see how the two classes are balanced.
data['label'].value_counts()

In [None]:
# Import nltk and fetch the Punkt tokenizer models used by nltk.word_tokenize.
import nltk
nltk.download("punkt")
# Recent NLTK releases load the tokenizer from the "punkt_tab" resource instead
# of the pickled "punkt" models; request it as well so word_tokenize works on
# both old and new versions (older versions just report an unknown package).
nltk.download("punkt_tab")
import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

### **WordClouds - to see which words are common in SPAM and NOT SPAM messages**

In [None]:
# Space-separated token corpora for the two classes; both start empty and are
# populated by the corpus-building cell further down.
ham_words = ''
spam_words = ''

In [None]:
# Re-imports nltk and requests the Punkt models again (same call as in the
# earlier setup cell).
import nltk
nltk.download('punkt')

In [None]:
def _build_corpus(messages):
    """Lower-case and tokenize every message; return all tokens joined by single spaces."""
    return ' '.join(
        token
        for message in messages
        for token in nltk.word_tokenize(message.lower())
    )


# The corpora are assigned (not appended to), so re-running this cell yields
# the same result instead of doubling the text, and the single join avoids
# quadratic string concatenation.
# This cell must run while the labels are still the strings 'spam' / 'ham',
# i.e. before they are encoded as integers.

# Creating a corpus of spam messages
spam_words = _build_corpus(data.loc[data['label'] == 'spam', 'text'])
# Creating a corpus of ham messages
ham_words = _build_corpus(data.loc[data['label'] == 'ham', 'text'])

## **Creating spam wordcloud and ham wordcloud**

In [None]:
# Render one 500x300 word cloud per class from the spam and ham token corpora.
spam_wordcloud = WordCloud(width=500, height=300).generate(spam_words)
ham_wordcloud = WordCloud(width=500, height=300).generate(ham_words)

In [None]:
# Show the spam word cloud on a white figure, without axes or padding.
fig, ax = plt.subplots(figsize=(10, 8), facecolor='w')
ax.imshow(spam_wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Show the ham word cloud (generated earlier) on a green figure, without axes or padding.
fig, ax = plt.subplots(figsize=(10, 8), facecolor='g')
ax.imshow(ham_wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Only the label column is touched (a frame-wide replace would also rewrite a
# message whose whole text is 'ham' or 'spam'). Values that are already 0/1
# pass through unchanged, so the cell can be re-run, and astype(int) both
# guarantees a numeric dtype for scikit-learn and fails loudly on any
# unexpected label.
LABEL_CODES = {'ham': 0, 'spam': 1}
data = data.assign(
    label=data['label'].map(lambda value: LABEL_CODES.get(value, value)).astype(int)
)

In [None]:
# Preview the first 10 rows after the label encoding.
data.head(10)

## **Removing Stopwords from the messages**

In [None]:
import nltk
# Fetch NLTK's stop-word lists, read through the `stopwords` corpus reader
# imported at the top of the notebook.
nltk.download('stopwords')

## **Remove punctuation and stopwords**

In [None]:
#remove the punctuations and stopwords
import string
def text_process(text, stop_words=None):
    """Strip punctuation and stop words from a message.

    Parameters
    ----------
    text : str
        Raw message.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and cached on the function object instead of
        being re-read (and scanned as a list) for every single word.

    Returns
    -------
    str
        The remaining words, in their original case, joined by single spaces.
    """
    if stop_words is None:
        cached = getattr(text_process, "_english_stopwords", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            text_process._english_stopwords = cached
        stop_words = cached

    # Punctuation is deleted (not replaced by a space) before the stop-word
    # check, so e.g. "don't" becomes "dont" and is compared in that form.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    kept = [word for word in stripped.split() if word.lower() not in stop_words]

    return " ".join(kept)

In [None]:
# Overwrite the text column with its cleaned form (output of text_process).
data['text'] = data['text'].apply(text_process)

In [None]:
# Preview the cleaned messages.
data.head()

In [None]:
# Single-column frames holding the cleaned messages and the encoded labels.
text = data['text'].to_frame()
label = data['label'].to_frame()
label

## **Converting words to vectors**



In [None]:
## Counting how many times a word appears in the dataset

from collections import Counter

# Tally every space-separated token across all messages, in row order.
total_counts = Counter()
for message in text['text']:
    total_counts.update(message.split(" "))

print("Total words in data set: ", len(total_counts))

In [None]:
# Vocabulary ordered from the most to the least frequent word
# (ties keep their first-seen order).
vocab = [word for word, _ in total_counts.most_common()]
print(vocab[:60])

In [None]:
# Mapping from words to index

vocab_size = len(vocab)
# Each word's index is its rank in the frequency-sorted vocabulary.
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Text to Vector
def text_to_vector(text):
    """Turn a message into a bag-of-words count vector.

    Splits ``text`` on single spaces and, for every token present in the
    module-level ``word2idx`` mapping, increments the matching position of a
    zero vector of length ``vocab_size``. Tokens outside the vocabulary are
    skipped. Returns a new NumPy array of counts.
    """
    counts = np.zeros(vocab_size)
    for token in text.split(" "):
        position = word2idx.get(token)
        # Index 0 is a valid position, so test against None rather than truthiness.
        if position is not None:
            counts[position] += 1
    return np.array(counts)

In [None]:
# Convert all messages to count vectors (one row per message, one column per
# vocabulary word).
word_vectors = np.zeros((len(text), len(vocab)), dtype=np.int_)
# Iterate the column directly: going through iterrows() and reading the value
# with `row[0]` relied on an integer key being treated as a position on a
# label-indexed Series, which pandas has deprecated.
for i, message in enumerate(text['text']):
    word_vectors[i] = text_to_vector(message)

## **Converting words to vectors using TFIDF Vectorizer**

In [None]:
#convert the text data into vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on every message in data['text'] and transform
# them in the same step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(data['text'])
# Display the dimensions of the resulting matrix.
vectors.shape

In [None]:
# Choose which representation the classifiers train on: the TF-IDF matrix
# (active) or the hand-built count vectors (commented out).
#features = word_vectors
features = vectors

## **Splitting into training and test set**

In [None]:
#split the dataset into train and test set
# 15% of the rows are held out for testing; random_state fixes the shuffle.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before this split,
# so the held-out messages contributed to the fitted vectorizer — consider
# splitting first and fitting the vectorizer on the training part only.
X_train, X_test, y_train, y_test = train_test_split(features, data['label'], test_size=0.15, random_state=111)

## **Classifying using sklearn pre built classifiers**

In [None]:
#import sklearn packages for building classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Initialize one instance of each classifier to compare.
# Hyper-parameters are hard-coded here; the decision tree and random forest
# also pin random_state.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=49)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)

In [None]:
# Short display name -> estimator instance, for iterating over all models.
clfs = {'SVC' : svc,'KN' : knc, 'NB': mnb, 'DT': dtc, 'LR': lrc, 'RF': rfc}

In [None]:
#fit the data onto the models
def train(clf, features, targets):
    """Fit ``clf`` on the given features and targets.

    The estimator is fitted in place; nothing is returned.
    """
    clf.fit(features, targets)
    return None

def predict(clf, features):
    """Return whatever ``clf.predict`` yields for ``features``."""
    predictions = clf.predict(features)
    return predictions

In [None]:
# Fit every classifier on the training split and record its test accuracy as
# a (name, [accuracy]) pair.
pred_scores_word_vectors = []
for name, model in clfs.items():
    train(model, X_train, y_train)
    accuracy = accuracy_score(y_test, predict(model, X_test))
    pred_scores_word_vectors.append((name, [accuracy]))

## **Predictions using TFIDF Vectorizer algorithm**

In [None]:
# (model name, [test accuracy]) pairs; despite the variable name, the models
# were trained on whatever `features` was set to (the TF-IDF matrix above).
pred_scores_word_vectors

## **Model predictions**





In [None]:
#write functions to detect if the message is spam or not
def find(x):
    """Print whether a prediction means spam or not.

    Parameters
    ----------
    x : str, int, or array-like
        A single label or the array returned by ``clf.predict``. Both the
        original string label ``'spam'`` and its integer encoding ``1`` count
        as spam; anything else is reported as not spam. One line is printed
        per prediction.
    """
    # Labels are encoded as 0/1 before training and predict() returns an
    # ndarray, so comparing the raw argument with 'spam' was never true.
    # np.ravel(...).tolist() unwraps scalars and arrays alike into plain
    # Python values.
    for label in np.ravel(x).tolist():
        if label == 'spam' or label == 1:
            print ("Message is SPAM")
        else:
            print ("Message is NOT Spam")

In [None]:
newtext = ["win FREE mobiles and play games"]
# Clean the new message with the same text_process step that was applied to
# the training text before the vectorizer was fitted, so it is tokenized the
# same way. Despite the name, `integers` holds the vectorizer's TF-IDF output.
integers = vectorizer.transform([text_process(message) for message in newtext])

In [None]:
x = mnb.predict(integers)
# The model was trained on labels encoded as ham=0 / spam=1, so translate the
# numeric prediction back to the string label that find() compares against.
find('spam' if x[0] == 1 else 'ham')