In [None]:
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Shared text normalizers used by the preprocessing functions in this notebook:
# `ps` applies Porter stemming, `wordnet` applies WordNet lemmatization.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [None]:
# The Kaggle SMS Spam Collection CSV is not valid UTF-8, so pandas' default
# decoder fails on it; latin-1 maps every byte and loads the file cleanly.
message_df = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)
message_df.head()

In [None]:
# Inspect the raw column names before deciding which ones to keep.
message_df.columns

In [None]:
# Drop the redundant-looking columns (not needed for this project).
# `columns=` names the labels directly instead of passing a DataFrame slice,
# and `errors="ignore"` keeps this cell re-runnable once they are gone.
to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
message_df = message_df.drop(columns=to_drop, errors="ignore")
# Give the remaining columns descriptive names (returns a new frame rather
# than mutating in place).
message_df = message_df.rename(columns={"v1": "label", "v2": "message"})
message_df.head()

In [None]:
# Dataset dimensions as (rows, columns).
message_df.shape

In [None]:
# Peek at the text of the message stored under index label 0.
message_df.loc[0, "message"]

In [None]:
# Class balance: number of messages per label.
message_df["label"].value_counts()

In [None]:
# Bar chart of how many messages carry each label.
sns.countplot(data=message_df, x="label")
plt.show()

In [None]:
# Show the first ten messages labelled as spam.
spam_mask = message_df["label"].eq("spam")
message_df.loc[spam_mask].head(10)

## Text Preprocessing 

In [None]:
# Raw message texts (one entry per row) that feed the preprocessing step.
sentences = message_df["message"]
sentences

In [None]:
def preprocess_text_stemming(sentences):
    '''
    Clean raw messages and reduce every remaining word to its Porter stem.

    For each message, every non-letter character (punctuation, digits,
    symbols) is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are removed, and the surviving words are
    stemmed with the module-level ``ps`` stemmer.

    Parameters
    ----------
    sentences : iterable of str
        Raw messages, e.g. a list or a pandas Series. Items are visited in
        iteration order, so a Series with a non-default index (for example
        after filtering rows) is handled correctly.

    Returns
    -------
    list of str
        One space-joined string of stemmed words per message, in input order.
    '''
    # Build the stop-word set once: rebuilding it for every single word is
    # loop-invariant work that dominates the runtime without changing results.
    stop_words = set(stopwords.words("english"))

    corpus = []
    # Iterate over the values directly; indexing with range(len(...)) is a
    # label lookup on a Series and breaks when the index is not 0..n-1.
    for sentence in sentences:
        review = re.sub("[^a-zA-Z]", " ", sentence).lower()
        words = [ps.stem(word) for word in review.split() if word not in stop_words]
        corpus.append(" ".join(words))

    return corpus

def preprocess_text_lemma(sentences):
    '''
    Clean raw messages and lemmatize every remaining word.

    For each message, every non-letter character (punctuation, digits,
    symbols) is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are removed, and the surviving words are
    lemmatized with the module-level ``wordnet`` lemmatizer.

    Parameters
    ----------
    sentences : iterable of str
        Raw messages, e.g. a list or a pandas Series. Items are visited in
        iteration order, so a Series with a non-default index (for example
        after filtering rows) is handled correctly.

    Returns
    -------
    list of str
        One space-joined string of lemmatized words per message, in input
        order.
    '''
    # Build the stop-word set once: rebuilding it for every single word is
    # loop-invariant work that dominates the runtime without changing results.
    stop_words = set(stopwords.words("english"))

    corpus = []
    # Iterate over the values directly; indexing with range(len(...)) is a
    # label lookup on a Series and breaks when the index is not 0..n-1.
    for sentence in sentences:
        review = re.sub("[^a-zA-Z]", " ", sentence).lower()
        words = [wordnet.lemmatize(word) for word in review.split() if word not in stop_words]
        corpus.append(" ".join(words))

    return corpus

In [None]:
%%time
# Alternative pipeline (stemming instead of lemmatization):
# cleaned_corpus = preprocess_text_stemming(sentences)
cleaned_corpus = preprocess_text_lemma(sentences)
print(len(cleaned_corpus))
# Preview the first few cleaned messages.
cleaned_corpus[:5]

In [None]:
# Flatten the cleaned corpus into a single token list for vocabulary statistics.
# NOTE(review): cleaned_corpus is already whitespace-delimited; word_tokenize
# may split some tokens further (and needs the punkt data) — confirm whether
# a plain str.split() is what is wanted here.
words = nltk.word_tokenize(" ".join(cleaned_corpus))
print("Total unique Words: ",len(set(words)))
words[:10]

In [None]:
# Corpus size: total number of tokens versus number of distinct tokens.
print("Total Words: ", len(words))
print("Total unique Words: ",len(set(words)))

## Bag of Words (BoW)

In [None]:
# Bag-of-words representation of the cleaned corpus as a dense count matrix.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed later in the notebook.
cv = CountVectorizer(max_features = 2500) # keep only the 2500 most frequent terms
X = cv.fit_transform(cleaned_corpus).toarray()
X

In [None]:
# Feature matrix dimensions as (messages, vocabulary terms).
X.shape

In [None]:
# Encode the target explicitly: 1 for "spam", 0 for everything else ("ham").
# Comparing against the label avoids relying on the column order produced by
# get_dummies and yields an integer array on every pandas version.
y = (message_df["label"] == "spam").astype(int).to_numpy()

## Train Test Split

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

# Report the dimensions of every partition.
for caption, part in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test: ", y_test),
):
    print(caption, part.shape)

## Modelling

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB().fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test messages.
y_pred = model.predict(X_test)

## Validation

In [None]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))