##### Statement Regarding the Problem:
We are going to build a spam message detection model. The data contains both spam and ham (legitimate) messages, and the model has to decide whether a given message is spam or not.

In [None]:
#Importing the Required Libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the message dataset from its CSV file and preview the first rows.
csv_path = "../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Keep only the rows whose Category is 'spam' and preview them.
is_spam = df["Category"] == "spam"
spam = df[is_spam]
spam.head()

In [None]:
# Keep only the rows whose Category is 'ham' and preview them.
is_ham = df["Category"] == "ham"
ham = df[is_ham]
ham.head()

In [None]:
# Spam word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate every spam message into one space-separated string.
text = " ".join(spam.Message)

# Render at most 100 words on a white background.
cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="white")
wordcloud = cloud_builder.generate(text)

fig = plt.figure(figsize=(20, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Ham word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate every ham message into one space-separated string.
text = " ".join(ham.Message)

# Render at most 100 words on a white background.
cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="white")
wordcloud = cloud_builder.generate(text)

fig = plt.figure(figsize=(20, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Ham vs. spam distribution: number of messages per category and its share of the total.
df_Spam = (
    df.groupby("Category")["Message"]
    .count()
    .rename("count")
    .reset_index()
)
df_Spam = df_Spam.assign(
    percentage=lambda counts: counts["count"] / counts["count"].sum() * 100
)
df_Spam

In [None]:
# Encode the dependent variable: ham -> 0, spam -> 1
encode = {"Category": {"ham": 0, "spam": 1}}

category_codes = encode["Category"]

# Map the labels explicitly and cast to int64 instead of relying on
# DataFrame.replace to downcast the object column implicitly (that downcasting
# is deprecated in recent pandas, which would leave an object-dtype target).
# `.get(label, label)` passes already-encoded values through unchanged, so the
# cell can be re-run safely; any unexpected label makes the cast fail loudly.
df = df.assign(
    Category=df["Category"]
    .map(lambda label: category_codes.get(label, label))
    .astype("int64")
)
df.head()

In [None]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split

X = df['Message']   # raw message text (features)
y = df['Category']  # class label (target)

# Stratify on the label so the training and test sets keep the same ham/spam
# proportions, and fix the seed so the split (and every metric computed from
# it) is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### CountVectorizer
CountVectorizer tokenizes the text (tokenization means breaking a sentence, paragraph, or any other text down into words) and performs very basic preprocessing, such as removing punctuation marks and converting all words to lowercase.


The vocabulary of known words is formed which is also used for encoding unseen text later.

An encoded vector is returned with a length of the entire vocabulary and an integer count for the number of times each word appeared in the document

In [None]:
# Bag-of-words feature extractor
from sklearn.feature_extraction.text import CountVectorizer

# Created with default settings; it is fitted later, on the training messages.
vectorizer = CountVectorizer()


### Building a Multinomial Naive Bayes Classifier

Types of Naive Bayes classifiers:

Gaussian: It is used in classification and it assumes that features follow a normal distribution.

Multinomial: suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts, i.e. “the number of times outcome number x_i is observed over the n trials”.

Bernoulli: The Bernoulli model is useful if your feature vectors are binary (i.e. zeros and ones). One application is text classification with a ‘bag of words’ model, where the 1s and 0s mean “word occurs in the document” and “word does not occur in the document”, respectively.


In [None]:

from sklearn.naive_bayes import MultinomialNB

# Fit the vectorizer on the training messages only, then train the classifier
# on the resulting feature matrix.
X_train_counts = vectorizer.fit_transform(X_train)
NB = MultinomialNB()
NB.fit(X_train_counts, y_train)


### Model Evaluation

In [None]:
# Predict on the training messages, reusing the already-fitted vectorizer.
train_features = vectorizer.transform(X_train)
prediction = NB.predict(train_features)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Confusion matrix for the training set (true labels vs. predictions)
train_confusion = confusion_matrix(y_train, prediction)
print(train_confusion)

In [None]:
# Classification report for the training set
train_report = classification_report(y_train, prediction)
print(train_report)

In [None]:
# Predict on the test messages; only transform here, never re-fit on test data.
test_features = vectorizer.transform(X_test)
prediction_test = NB.predict(test_features)

# Confusion matrix for the test set
test_confusion = confusion_matrix(y_test, prediction_test)
print(test_confusion)

In [None]:
# Classification report for the test set
test_report = classification_report(y_test, prediction_test)
print(test_report)

### Logistic Regression Model

In [None]:
#Importing Library
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with a fixed random seed
logreg = LogisticRegression(
    random_state=42,
)

In [None]:
# Train the logistic-regression model.
# Note: this re-fits the shared vectorizer on the training messages first.
train_matrix = vectorizer.fit_transform(X_train)
logreg.fit(train_matrix, y_train)

In [None]:
# Predict on the training messages with the logistic-regression model.
train_matrix = vectorizer.transform(X_train)
prediction_logistic = logreg.predict(train_matrix)

# Confusion matrix for the training set
print(confusion_matrix(y_train, prediction_logistic))

In [None]:
# Classification report for logistic regression on the training set
logistic_train_report = classification_report(y_train, prediction_logistic)
print(logistic_train_report)

In [None]:
# Evaluate the logistic-regression model on the test messages.
test_matrix = vectorizer.transform(X_test)
prediction_logistic_test = logreg.predict(test_matrix)

# Print the confusion matrix, then the classification report.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, prediction_logistic_test))

### SVM Classifier

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, trained on the vectorized
# training messages (the shared vectorizer is re-fitted on X_train here).
svm_train_matrix = vectorizer.fit_transform(X_train)
svclassifier = SVC(kernel='linear')
svclassifier.fit(svm_train_matrix, y_train)

In [None]:
# Predict on the test messages with the SVM.
svm_test_matrix = vectorizer.transform(X_test)
pred_svm_test = svclassifier.predict(svm_test_matrix)


In [None]:
# Confusion matrix and classification report for the SVM on the test set
svm_confusion = confusion_matrix(y_test, pred_svm_test)
svm_report = classification_report(y_test, pred_svm_test)
print(svm_confusion)
print(svm_report)

##### Results

##### SVM :

recall: 89%

accuracy: 98%

##### Multinomial Naive Bayes :

recall: 92%

accuracy: 99%

##### Logistic Regression :

recall: 86%

accuracy: 98%


##### TF-IDF method:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# From this cell on, `vectorizer` is a TF-IDF vectorizer; the name previously
# held the CountVectorizer and is rebound here.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Fresh Multinomial Naive Bayes model trained on the TF-IDF features.
NB = MultinomialNB()
NB.fit(X_train_tfidf, y_train)


In [None]:
# Predict on the training messages using the fitted TF-IDF vectorizer.
train_tfidf = vectorizer.transform(X_train)
prediction = NB.predict(train_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Confusion matrix for the training set (TF-IDF features)
tfidf_train_confusion = confusion_matrix(y_train, prediction)
print(tfidf_train_confusion)

In [None]:
# Classification report for the training set (TF-IDF features)
tfidf_train_report = classification_report(y_train, prediction)
print(tfidf_train_report)

In [None]:
# Predict on the test messages; the TF-IDF vectorizer is only applied, not re-fitted.
test_tfidf = vectorizer.transform(X_test)
prediction_test = NB.predict(test_tfidf)

# Confusion matrix for the test set
tfidf_test_confusion = confusion_matrix(y_test, prediction_test)
print(tfidf_test_confusion)

In [None]:
# Classification report for the test set (TF-IDF features)
tfidf_test_report = classification_report(y_test, prediction_test)
print(tfidf_test_report)

In [None]:
#Importing Library
from sklearn.linear_model import LogisticRegression

In [None]:
# New logistic-regression classifier (fixed random seed) for the TF-IDF features
logreg = LogisticRegression(
    random_state=42,
)

#### Logistic Regression

In [None]:
# Train the logistic-regression model on the TF-IDF features.
# Note: this re-fits the shared vectorizer on the training messages first.
train_tfidf_matrix = vectorizer.fit_transform(X_train)
logreg.fit(train_tfidf_matrix, y_train)

In [None]:
# Predict on the training messages with the logistic-regression model.
train_tfidf_matrix = vectorizer.transform(X_train)
prediction_logistic = logreg.predict(train_tfidf_matrix)

# Confusion matrix for the training set
print(confusion_matrix(y_train, prediction_logistic))

In [None]:
# Classification report for logistic regression on the training set
logistic_train_report = classification_report(y_train, prediction_logistic)
print(logistic_train_report)

In [None]:
# Evaluate the logistic-regression model on the test messages.
test_tfidf_matrix = vectorizer.transform(X_test)
prediction_logistic_test = logreg.predict(test_tfidf_matrix)

# Print the confusion matrix, then the classification report.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, prediction_logistic_test))

#### SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM trained on the vectorized training messages
# (the shared vectorizer is re-fitted on X_train here).
svm_train_matrix = vectorizer.fit_transform(X_train)
svclassifier = SVC(kernel='linear')
svclassifier.fit(svm_train_matrix, y_train)

In [None]:
# Predict on the test messages with the SVM.
svm_test_matrix = vectorizer.transform(X_test)
pred_svm_test = svclassifier.predict(svm_test_matrix)


In [None]:
# Confusion matrix and classification report for the SVM on the test set
svm_confusion = confusion_matrix(y_test, pred_svm_test)
svm_report = classification_report(y_test, pred_svm_test)
print(svm_confusion)
print(svm_report)

Comparing the CountVectorizer and TF-IDF methods for converting messages into vectors, CountVectorizer is the more effective choice (for this problem), as it gives better performance than TF-IDF.

* 