# **Problem Statement**
### Building a model to classify the spam messages using TF – IDF, Naïve Bayes & other NLP techniques.


### Importing the Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Read the raw CSV (ISO-8859-1 encoded), keep only the label and text
# columns, and give them descriptive names.
df = (
    pd.read_csv(
        "../input/sms-spam-collection-dataset/spam.csv",
        encoding="ISO-8859-1",
    )[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
df.head(10)

### **EDA**
### Checking the count of spam and ham messages

In [None]:
# Bar chart with one bar per value of the 'label' column.
fig = plt.figure(figsize=(10, 6))
sns.countplot(x='label', data=df)

### Checking the length of each message

In [None]:
# Message length measured in characters, not counting spaces.
df['length'] = df['message'].apply(lambda msg: len(msg.replace(" ", "")))
df.head()

### Histogram to check the frequency of spam and ham messages with respect to length

#### As we can see, most ham messages have a length between 25 and 75, while most spam messages have a length between 100 and 150,
#### so length is also a useful feature for classifying spam and ham messages

In [None]:
# Overlaid, semi-transparent histograms of message length for each class,
# using the same 40 bin edges spanning 0..200 for both.
plt.figure(figsize=(10, 5))
bins = np.linspace(0, 200, 40)
for label_name in ('ham', 'spam'):
    plt.hist(df.loc[df['label'] == label_name, 'length'], bins, alpha=0.5, label=label_name)
plt.legend(loc='upper left')

### Checking the percentage of punctuation in each message

In [None]:
import string

def count_punct(text):
    """Return the percentage of non-space characters in *text* that are punctuation.

    A character counts as punctuation when it is in ``string.punctuation``.
    Only the space character (" ") is excluded from the denominator. The
    ratio is rounded to three decimals before being scaled to a percentage.

    Returns 0.0 when *text* is empty or consists only of spaces, instead of
    raising ``ZeroDivisionError``.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Nothing to measure: avoid dividing by zero.
        return 0.0

    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_chars, 3) * 100

# Punctuation percentage of every message, stored as a new column.
df['punct%'] = df['message'].apply(count_punct)

df.head()

### Histogram to check the frequency of spam and ham messages with respect to percentage of punctuation
#### As we can see, most of the spam messages have a punctuation percentage of 0-10%, but most of the ham messages have a punctuation percentage beyond 10%,
#### so we will add this feature to our model too

In [None]:
# Overlaid, semi-transparent histograms of punctuation percentage for each
# class, using the same 40 bin edges spanning 0..50 for both.
plt.figure(figsize=(10, 5))
bins = np.linspace(0, 50, 40)
for label_name in ('ham', 'spam'):
    plt.hist(df.loc[df['label'] == label_name, 'punct%'], bins, alpha=0.5, label=label_name)
plt.legend(loc='upper right')

### Data Cleaning & preprocessing
* Removing the number, punctuation & other characters
* Lowercasing the sentences
* Stemming
* Removing stop-words

In [None]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Build `corpus`: one cleaned string per message, in the same order as df.
ps = PorterStemmer()
# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the loop re-fetches the list for every
# single token and then does a linear search through it.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')
corpus = []
# Iterate over the values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for message in df['message']:
    # Replace everything that is not an ASCII letter with a space,
    # lowercase the text and split it into whitespace-separated tokens.
    review = non_letters.sub(' ', message).lower().split()

    # Drop stop words, then stem whatever is left.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))

### Creating the TF - IDF model

In [None]:
# Creating the TF-IDF model: vectorise the cleaned corpus into a dense array.
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus)
X = X.toarray()

# One-hot encode the labels and keep the second dummy column as the target
# (expected to be the 'spam' indicator given the ham/spam labels).
y = pd.get_dummies(df['label']).iloc[:, 1].values

### Adding other features, such as the length of a sentence and the percentage of punctuation in a sentence, to the model

In [None]:
# Put the two hand-crafted features side by side with the vectorised text.
X_features = pd.concat([df['length'], df['punct%'], pd.DataFrame(X)], axis=1)
# pd.DataFrame(X) gets integer column labels while 'length' and 'punct%' are
# strings. Recent scikit-learn versions refuse to fit on a DataFrame whose
# column names mix types, so make every label a string.
X_features.columns = X_features.columns.astype(str)
X_features.head()

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y,
    test_size=0.20,
    random_state=0,
)

### Training model using Naive bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
spam_detect_model = MultinomialNB()
spam_detect_model = spam_detect_model.fit(X_train, y_train)

### Prediction on test dataset

In [None]:
# Predicted labels for the held-out test rows.
y_pred = spam_detect_model.predict(X_test)

### Accuracy checking by confusion matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compare the predictions with the true test labels.
confusion_m = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

accuracy

### Creating Bag of words model to check the accuracy

In [None]:
# Bag-of-Words variant of the pipeline: same cleaned corpus, same extra
# features, same split and classifier -- only the vectoriser differs.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Creating the Bag of Words model, limited to 2500 features
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

# One-hot encode the labels and keep the second dummy column as the target
# (expected to be the 'spam' indicator given the ham/spam labels).
y = pd.get_dummies(df['label'])
y = y.iloc[:, 1].values

X_features = pd.concat([df['length'], df['punct%'], pd.DataFrame(X)], axis=1)
# pd.DataFrame(X) gets integer column labels while 'length' and 'punct%' are
# strings. Recent scikit-learn versions refuse to fit on a DataFrame whose
# column names mix types, so make every label a string.
X_features.columns = X_features.columns.astype(str)
X_features.head()

X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.20, random_state=0)

# Training model using Naive bayes classifier
spam_detect_model = MultinomialNB().fit(X_train, y_train)

y_pred = spam_detect_model.predict(X_test)

confusion_m = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

accuracy

### As we can see, the BOW model gives higher accuracy than the TF-IDF model

In [None]:
from matplotlib import pyplot as plt

# Annotated heatmap of the confusion matrix. Despite the variable name, the
# matrix is computed from the test labels and the test predictions.
plt.figure(figsize=(10, 8))
cf_train_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_train_matrix, annot=True, fmt='d')