In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# **Loading and Understanding the Data**

Specify the path to the data file.

In [None]:
# Location of the SMS spam CSV inside the Kaggle input directory.
PATH = '/kaggle/input/sms-spam-collection-dataset/spam.csv'

In [None]:
# Reading the file with pandas' default codec fails for this dataset. Catch the
# error and print it so the failure is shown as a demonstration without
# aborting a "Restart & Run All".
try:
    spam_data = pd.read_csv(PATH)
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

### We got a **UnicodeDecodeError**

### To resolve this error, we need to know the file's encoding

In [None]:
import chardet

# Let chardet guess the encoding from the first 100000 raw bytes of the file.
with open(PATH, 'rb') as rawdata:
    sample_bytes = rawdata.read(100000)
result = chardet.detect(sample_bytes)
print(result)

### Now write the detected encoding into the `read_csv()` call

In [None]:
# Load the CSV again, this time decoding it explicitly as Windows-1252.
spam_data = pd.read_csv(PATH, encoding='Windows-1252')

In [None]:
# Display the loaded DataFrame.
spam_data

In [None]:
# Preview the first rows of the DataFrame.
spam_data.head()

### Now we check the count of null values in each column

In [None]:
# Boolean frame marking which cells are null.
spam_data.isnull()

In [None]:
# Number of null values in each column.
spam_data.isnull().sum()

### **Remove the columns**

Since most of the values in three of the columns are null, we drop all three columns.

In [None]:
# Drop the three extra "Unnamed" columns. Naming them through `columns=` avoids
# passing a DataFrame as the label list, and errors="ignore" keeps the cell
# re-runnable once the columns have already been removed.
spam_data = spam_data.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

### Renaming the Column names

In [None]:
# Give the two remaining columns descriptive names.
column_names = {"v1": 'label', "v2": 'message'}
spam_data.rename(columns=column_names, inplace=True)

Now we check the null values again after deleting the columns.

In [None]:
# Fraction of null values per column, ordered by column name.
spam_data.isna().mean().sort_index()

In [None]:
# Display the DataFrame in its current state.
spam_data

We have now finished our initial processing.

## **One Hot Encode the target variable**

In [None]:
# Indicator-encode the label column; drop_first=True removes the first category's
# column. dtype=int pins the indicators to 0/1 integers, because the default
# dummy dtype differs between pandas versions (bool from pandas 2.0 onwards).
labels = pd.get_dummies(spam_data['label'], drop_first=True, dtype=int)

In [None]:
# Display the encoded labels.
labels

Now we have two DataFrames: `labels`, which holds the encoded values, and our initial DataFrame. We have to combine them into a single DataFrame.

In [None]:
# Append the encoded label column(s) to the main frame, side by side.
frames_to_merge = [spam_data, labels]
spam_data = pd.concat(frames_to_merge, axis="columns")
spam_data

As we can see, there are now two target columns: one holding the encoded value and one in string format. So we drop the string column.

In [None]:
# Remove the string-valued label column in place.
spam_data.drop(columns="label", inplace=True)

In [None]:
# Display the DataFrame in its current state.
spam_data

Now we convert the message column into a list.

In [None]:
# Pull the message column out as a plain Python list.
messages = spam_data['message'].tolist()

In [None]:
# Peek at the first three messages.
messages[0:3]

# **Text Preprocessing**

Text preprocessing is an approach for cleaning and preparing text data for use in a specific context. Developers use it in almost all natural language processing (NLP) pipelines, including voice recognition software, search engine lookup, and machine learning model training. It is an essential step because text data can vary. From its format (website, text message, voice recognition) to the people who create the text (language, dialect), there are plenty of things that can introduce noise into your data.

 We will use a few common approaches for cleaning and processing text data. They include:

   * Using Regex & NLTK libraries
   * Removing unnecessary characters and formatting
   * Tokenization – break multi-word strings into smaller components
   * Normalization – a catch-all term for processing data; this includes stemming and lemmatization


In [None]:
import re
import nltk
nltk.download('punkt')
# Newer NLTK releases (3.8.2+) load the tokenizer data from 'punkt_tab' instead
# of the pickled 'punkt' models, so fetch both to keep word_tokenize working
# across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### Noise Removal

Text cleaning is a technique that developers use in a variety of domains. Depending on the goal of your project and where you get your data from, you may want to remove unwanted information, such as:

   * Punctuation and accents
   * Special characters
   * Numeric digits
   * Leading, ending, and vertical whitespace
   * HTML formatting


### Tokenization

A few common operations that require tokenization include:

   * Finding how many words or sentences appear in text
   * Determining how many times a specific word or phrase exists
   * Accounting for which terms are likely to co-occur


### Normalization

Tokenization and noise removal are staples of almost all text pre-processing pipelines. However, some data may require further processing through text normalization. Text normalization is a catch-all term for various text pre-processing tasks. A few of them:

  *  Upper or lowercasing
  *  Stopword removal
  *  Stemming – bluntly removing prefixes and suffixes from a word

### Stopword Removal

Stopwords are words that we remove during preprocessing when we don’t care about sentence structure. They are usually the most common words in a language and don’t provide any information about the tone of a statement. They include words such as “a”, “an”, and “the”.

### Stemming

In natural language processing, stemming is the text preprocessing normalization task concerned with bluntly removing word affixes (prefixes and suffixes). For example, stemming would cast the word “going” to “go”. This is a common method used by search engines to improve matching between user input and website hits.

In [None]:
# Porter stemmer instance used for stemming.
stemmer = PorterStemmer()
# English stop words, held in a set for stop-word removal.
stop_words = set(stopwords.words('english'))

In [None]:
# Collapse e-mail addresses, web addresses and long digit runs (10-19 digits)
# into single placeholder tokens. Each (pattern, placeholder) rule is applied
# to every message before the next rule runs, in the order listed.
_PLACEHOLDER_RULES = (
    (r"[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z]+[.]+[a-zA-Z]+[.]?[a-zA-Z]*", 'EMAILID'),
    (r"https?:\/\/w{0,3}\w*?\.(\w*?\.)?\w{2,3}\S*|www\.(\w*?\.)?\w*?\.\w{2,3}\S*|(\w*?\.)?\w*?\.\w{2,3}[\/\?]\S*", 'WEBADDRESS'),
    (r"\d{10}\d{0,9}", 'PHONENUMBER'),
)

for _pattern, _placeholder in _PLACEHOLDER_RULES:
    messages = [re.sub(_pattern, _placeholder, text) for text in messages]

In [None]:
# Split every message into word-level tokens.

tokenized_by_word = list(map(word_tokenize, messages))

In [None]:
# Characters that are stripped from every token during noise removal.
# A raw string keeps the backslash as a literal character without relying on an
# invalid escape sequence (backslash + comma), which newer Python versions warn
# about. The resulting value is unchanged.

punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

In [None]:
def removePunctuations(word):
    """Strip punctuation from ``word``, lowercase it and return its stem.

    Characters found in the module-level ``punctuations`` string are dropped,
    the remaining characters are lowercased, and the joined result is passed
    through the module-level ``stemmer``.
    """
    kept_chars = [char.lower() for char in word if char not in punctuations]
    return stemmer.stem("".join(kept_chars))

In [None]:
def wordFilter(sentence):
    """Clean a tokenized sentence and return only the tokens worth keeping.

    A token is skipped when its lowercased raw form is a stop word. Otherwise it
    is cleaned with ``removePunctuations`` and skipped when the cleaned form is
    empty, whitespace-only, or a stop word.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single message.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    message = []
    for word in sentence:
        # Test the raw token first: stemming can alter a stop word (for example
        # "this" is stemmed to "thi"), which would let it slip past a check made
        # only on the cleaned form.
        if word.lower() in stop_words:
            continue

        # Clean once and reuse the result instead of cleaning the token twice.
        cleaned_word = removePunctuations(word)

        if not cleaned_word or cleaned_word.isspace() or cleaned_word in stop_words:
            continue
        message.append(cleaned_word)
    return message

In [None]:
# Run the cleaning filter over every tokenized message.
processed_message = list(map(wordFilter, tokenized_by_word))

In [None]:
# Inspect the cleaned tokens of the first message.
processed_message[0:1]

In [None]:
def flatten(my_list):
  """Return a flat list holding the non-list items of ``my_list`` in order.

  Nested ``list`` instances are expanded recursively; any other element,
  including tuples and strings, is kept as a single item.
  """
  flat = []
  for item in my_list:
    if not isinstance(item, list):
      flat.append(item)
      continue
    # Recurse into the nested list and splice its items in place.
    flat.extend(flatten(item))
  return flat

In [None]:
# Collapse the per-message token lists into one flat list of tokens.
words_token = flatten(processed_message)

In [None]:
# First ten tokens of the flattened list.
words_token[0:10]

In [None]:
# When building BoW vectors, we generally create a features dictionary

def create_features_dictionary(document_tokens):
  """Map every distinct token to the order in which it first appears.

  The first unseen token gets index 0, the next unseen token index 1, and so
  on; repeated tokens keep the index assigned at their first occurrence.
  """
  token_to_index = {}
  for token in document_tokens:
    # setdefault only inserts unseen tokens; the current size is the next free index.
    token_to_index.setdefault(token, len(token_to_index))
  return token_to_index

In [None]:

# Turning text into a BoW vector is known as feature extraction or vectorization. 

def tokens_to_bow_vector(document_tokens, features_dictionary):
  """Count how often each known token occurs in ``document_tokens``.

  Returns a list with one slot per entry of ``features_dictionary``; the slot
  at a token's index holds that token's count. Tokens that are not keys of
  ``features_dictionary`` are ignored.
  """
  counts = [0 for _ in range(len(features_dictionary))]
  for token in document_tokens:
    if token not in features_dictionary:
      continue
    slot = features_dictionary[token]
    counts[slot] += 1
  return counts

In [None]:
# Build the token -> index vocabulary from the flattened token list.
message_dictionary = create_features_dictionary(words_token)

In [None]:
# One bag-of-words count vector per processed message.
message_vector = [
    tokens_to_bow_vector(tokens, message_dictionary)
    for tokens in processed_message
]

In [None]:
# Target vector: the 'spam' column as a NumPy array.
messages_label = spam_data['spam'].to_numpy()

# Building Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [None]:
# Multinomial Naive Bayes classifier created with its default settings.
spam_classifier = MultinomialNB()

Dividing the data into training and test sets

In [None]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    message_vector,
    messages_label,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit the classifier on the training split.
spam_classifier.fit(X_train, y_train)

In [None]:
# NOTE: despite its name, this holds the single value returned by score() on the
# test split (reported below as the accuracy), not per-message predictions.
predictions = spam_classifier.score(X_test, y_test)

Accuracy on the test set

In [None]:
# Express the score as a percentage.
predictions * 100