In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis (EDA)

Here I will be performing EDA on our spam/ham dataset.

In [None]:
import pandas as pd

# Location of the raw SMS spam collection CSV inside the Kaggle input directory.
SPAM_CSV_PATH = '/kaggle/input/sms-spam-collection-dataset/spam.csv'

# encoding='latin-1' is passed explicitly -- presumably the file is not valid
# UTF-8 (TODO confirm against the dataset).
df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
df.head()

In [None]:
# Drop the three 'Unnamed' columns and give the remaining two readable names.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second run is a no-op instead of raising KeyError. rename() already ignores
# keys that are no longer present.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df = df.rename(columns={"v1": "labels", "v2": "text"})
df.head()

In [None]:
df.describe()

In [None]:
df.labels.value_counts()

In [None]:
df.labels.value_counts().plot.bar()

In [None]:
# Encode the target as an integer column: 'ham' -> 0, anything else (spam) -> 1.
#
# The previous version looped over df.iterrows() and assigned into the row
# Series it yields. pandas documents that this row may be a copy, in which case
# the writes never reach `df` and `spam` silently keeps the original strings
# (so `df['spam'] == 0` later selects nothing). A vectorized comparison always
# updates the frame and gives the column a real numeric dtype.
df['spam'] = (df['labels'] != 'ham').astype(int)

In [None]:
df.head()

# Pre-processing of SMS

This task involves:<br>
1. Tokenization
2. Vectorization
3. TF-IDF weighting

## Removal of punctuations and stop-words

### Punctuations

In [None]:
import string
print(string.punctuation)

### Stop-words

Stop words are words like “and”, “the”, “him”, which are presumed to be uninformative in representing the content of a text,
and which may be removed to avoid them being construed as signal for prediction.

In [None]:
from nltk.corpus import stopwords
print(stopwords.words('english')[10:15])

In [None]:
def punctuation_stopwords_removal(sms, stop_words=None):
    """Tokenize a message: strip punctuation, lowercase, and drop stop-words.

    Parameters
    ----------
    sms : str
        Message text to tokenize.
    stop_words : iterable of str, optional
        Lowercase words to discard. When omitted, NLTK's English list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The lowercased words of ``sms``, in their original order, that are not
        stop-words.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per message. The stop-word list used to be reloaded
    # and scanned linearly for every single word.
    stop_words = frozenset(stop_words)

    # Filter character by character, e.g. "Hi, you!" -> "Hi you", then split
    # the cleaned text on whitespace.
    no_punctuation = "".join(ch for ch in sms if ch not in string.punctuation)
    lowered_words = (word.lower() for word in no_punctuation.split())
    return [word for word in lowered_words if word not in stop_words]

In [None]:
print(punctuation_stopwords_removal("Hello we need to send this report by EOD.!!! yours sincerely, Purva"))

In [None]:
print(df.head())

# Analysis of most common words in spam and ham SMS

Here, we will be making use of `collections.Counter`.

In [None]:
from collections import Counter

# Split the messages by their numeric target (0 = ham, 1 = spam). Each subset
# is an independent copy, so later column assignments do not touch `df`.
is_ham = df['spam'] == 0
is_spam = df['spam'] == 1
data_ham = df.loc[is_ham].copy()
data_spam = df.loc[is_spam].copy()


In [None]:
print(data_ham[:2])
print(data_spam[:2])

In [None]:
# Tokenize the ham messages. The input is read from the untouched `df['text']`
# (data_ham was copied out of df, so its index labels select the same rows)
# rather than from data_ham['text'] itself. Re-running the cell therefore never
# feeds already-tokenized lists back into the tokenizer.
data_ham.loc[:, 'text'] = df.loc[data_ham.index, 'text'].apply(punctuation_stopwords_removal)
print(data_ham[:1])

In [None]:
words_data_ham = data_ham['text'].tolist()

In [None]:
words_data_ham[:3]

In [None]:
# Tokenize the spam messages. The input is read from the untouched `df['text']`
# (data_spam was copied out of df, so its index labels select the same rows)
# rather than from data_spam['text'] itself. Re-running the cell therefore
# never feeds already-tokenized lists back into the tokenizer.
data_spam.loc[:, 'text'] = df.loc[data_spam.index, 'text'].apply(punctuation_stopwords_removal)
print(data_spam[:1])

In [None]:
words_data_spam = data_spam['text'].tolist()
print(words_data_spam[:2])

In [None]:
# Flatten the per-message token lists into one flat list of words per class.
ham_list = [word for sublist in words_data_ham for word in sublist]
spam_list = [word for sublist in words_data_spam for word in sublist]

In [None]:
# Count word occurrences per class, then tabulate the 30 most frequent of each.
ham_count, spam_count = Counter(ham_list), Counter(spam_list)

top_columns = ['word', 'count']
ham_top_30_words = pd.DataFrame(ham_count.most_common(30), columns=top_columns)
spam_top_30_words = pd.DataFrame(spam_count.most_common(30), columns=top_columns)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='word', y='count', 
            data=ham_top_30_words, ax=ax)
plt.title("Top 30 Ham words")
plt.xticks(rotation='vertical');

In [None]:

# Bar chart of the 30 most frequent spam words, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=spam_top_30_words, x='word', y='count', ax=ax)
ax.set_title("Top 30 Spam words")
# Rotate the word labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation='vertical');

# BOW with CountVectorizer

In this scheme, features and samples are defined as follows: each individual token occurrence frequency (normalized or not) is treated as a feature, and
the vector of all the token frequencies for a given document is considered a multivariate sample.<br>
A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.
We call vectorization the general process of turning a collection of text documents into numerical feature vectors.<br>
This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation.
Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

<img src="https://github.com/purvasingh96/Talking-points-global-hackathon/blob/master/assets/word2vec_architectures.png?raw=1" width="500"></img>

In this kernel we use sklearn's [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) as the BOW model.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Plug the custom tokenizer in as the analyzer, then learn the vocabulary from
# every message in the dataset.
bow_transformer = CountVectorizer(analyzer=punctuation_stopwords_removal)
bow_transformer.fit(df['text'])

In [None]:
len(bow_transformer.vocabulary_)

In [None]:
# Pick the message stored under row label 8 and turn it into a bag-of-words
# row vector (transform expects a collection of documents, hence the list).
sample_spam = df.loc[8, 'text']
bow_sample_spam = bow_transformer.transform([sample_spam])
print(sample_spam)
print(bow_sample_spam)

In [None]:
print('Printing bag of words for sample 1')
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement. Fetch the vocabulary once instead of rebuilding it for every
# non-zero column.
feature_names = bow_transformer.get_feature_names_out()
row, cols = bow_sample_spam.nonzero()
for col in cols:
    print(feature_names[col])

In [None]:
import numpy as np
print(np.shape(bow_sample_spam))

In [None]:
# Vectorize the message stored under row label 4 and list the vocabulary words
# it contains.
sample_ham = df['text'][4]
bow_sample_ham = bow_transformer.transform([sample_ham])
print(sample_ham)
print(bow_sample_ham)
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement. Fetch the vocabulary once instead of rebuilding it for every
# non-zero column.
feature_names = bow_transformer.get_feature_names_out()
rows, cols = bow_sample_ham.nonzero()
print('Printing ')
for col in cols:
    print(feature_names[col])

# TF-IDF on BOW

TF-IDF expects a bag-of-words (integer values) training corpus during initialization. During transformation, it will take a vector and return another vector of the same dimensionality.<br>

TF-IDF stands for "Term Frequency, Inverse Document Frequency".<br>

* It is a way to score the importance of words (or "terms") in a document based on how frequently they appear across multiple documents.
* If a word appears frequently in a document, it's important. Give the word a high score. But if a word appears in many documents, it's not a unique identifier. Give the word a low score.<br>

* Therefore, common words like *"the"* and *"for"*, which appear in many documents, will be scaled down. Words that appear frequently in a single document will be scaled up.<br>

In other words:
* TF(w) = `(Number of times term w appears in a document) / (Total number of terms in the document).`
* IDF(w) = `log(Total number of documents / Number of documents with term w in it).`

The natural logarithm (`log_e`) is a common choice for IDF, but the base only rescales every IDF value by the same constant. The worked example below uses base 10 so that the numbers come out round.

For example, consider a document containing 100 words wherein the word 'tiger' appears 3 times.
* The term frequency (i.e., tf) for 'tiger' is then:<br>
    `TF = (3 / 100) = 0.03.`
* Now, assume we have 10 million documents and the word 'tiger' appears in 1000 of these. Then, the inverse document frequency (i.e., idf) is calculated as:<br>
    `IDF = log10(10,000,000 / 1,000) = 4.`

Thus, the TF-IDF weight is the product of these quantities:
`TF-IDF = 0.03 * 4 = 0.12.`

### TfidfTransformer from sklearn

Both tf and tf–idf can be computed as follows using sklearn's [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# bag of words in vectorized format
bow_data = bow_transformer.transform(df['text'])
print(bow_data[:1])
tfidf_transformer = TfidfTransformer().fit(bow_data)

In [None]:
tfidf_sample_ham = tfidf_transformer.transform(bow_sample_ham)
print('Sample HAM : ')
print(tfidf_sample_ham)

tfidf_sample_spam = tfidf_transformer.transform(bow_sample_spam)
print('Sample SPAM : ')
print(tfidf_sample_spam)

In [None]:
final_data_tfidf = tfidf_transformer.transform(bow_data)
print(final_data_tfidf)
print(np.shape(final_data_tfidf))

### Train test split


In [None]:
from sklearn.model_selection import train_test_split

data_tfidf_train, data_tfidf_test, label_train, label_test = train_test_split(final_data_tfidf, df["spam"], test_size=0.3, random_state=5)

### Results Visualization Methods


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Each cell is annotated with its integer count. The heatmap uses the
    "Blues" colour map with square cells and no colour bar. The y axis is
    labelled 'true label' and the x axis 'predicted label'.
    """
    heatmap_options = dict(annot=True, fmt='d', linewidths=.5,
                           cmap="Blues", square=True, cbar=False)
    # sns.heatmap draws on the current Axes and returns it.
    ax = sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)
    ax.set_ylabel('true label')
    ax.set_xlabel('predicted label')

### Naive Bayes Classifier for Spam/Ham Classification 

Here we will be using Naive Bayes' [MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html) to classify SMS messages into the spam/ham categories. <br>One important thing to note in this section is that numpy did not manage to figure out that the datatype of `label_train` was float64, and by default it set the datatype to a generic `object`.<br>
In order to solve this issue, we need to explicitly define the datatype of `label_train` with `np.asarray(label_train, dtype="float64")`.

#### Results

Upon applying the Naive Bayes classifier, we achieved 96.5% accuracy.<br>
From the ROC characteristics, we achieved an area under the curve (AUC) of 97.698 (the AUC is multiplied by 100 in the plot, i.e. expressed as a percentage).

In [None]:
# Convert the sparse TF-IDF matrices to dense arrays.
# `.toarray()` replaces the `.A` shorthand, which SciPy deprecated for sparse
# matrices and which is no longer available in recent releases.
# The hasattr guard keeps the cell re-runnable: after the first run these names
# already hold dense ndarrays, which have no `.toarray()`.
if hasattr(data_tfidf_train, "toarray"):
    data_tfidf_train = data_tfidf_train.toarray()
if hasattr(data_tfidf_test, "toarray"):
    data_tfidf_test = data_tfidf_test.toarray()

In [None]:
print(data_tfidf_train.dtype)
print(label_train.dtype)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Cast the labels to float64 explicitly before handing them to scikit-learn.
y_train = np.asarray(label_train, dtype="float64")
y_test = np.asarray(label_test, dtype="float64")

# Fit on the training split, predict the test split, and report the accuracy.
spam_detect_model_MNB = MultinomialNB()
spam_detect_model_MNB.fit(data_tfidf_train, y_train)
pred_test_MNB = spam_detect_model_MNB.predict(data_tfidf_test)
acc_MNB = accuracy_score(y_test, pred_test_MNB)
print(acc_MNB)

#### ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

# Score every test message with the second column of predict_proba, then trace
# the ROC curve against the float-cast test labels.
y_test_float = np.asarray(label_test, dtype="float64")
positive_scores = spam_detect_model_MNB.predict_proba(data_tfidf_test)[:, 1]
fpr, tpr, thr = roc_curve(y_test_float, positive_scores)

plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr)
plt.title('Receiver Operating Characteristic Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The area under the curve is multiplied by 100, so the legend shows it as a
# percentage.
auc_knn4 = auc(fpr, tpr) * 100
plt.legend(["AUC {0:.3f}".format(auc_knn4)]);

#### Confusion Matrix


In [None]:
# NOTE(review): this re-defines plot_confusion_matrix, which is already defined
# with the same body in the "Results Visualization Methods" section. The later
# definition simply shadows the earlier one.
def plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix for ``y_true`` vs. ``y_pred`` as a heatmap.

    Cells carry their integer counts. The plot uses the "Blues" colour map,
    square cells and no colour bar, with 'true label' on the y axis and
    'predicted label' on the x axis.
    """
    counts = confusion_matrix(y_true, y_pred)
    # sns.heatmap draws on the current Axes and returns it.
    axes = sns.heatmap(counts, annot=True, fmt='d', linewidths=.5,
                       cmap="Blues", square=True, cbar=False)
    axes.set_ylabel('true label')
    axes.set_xlabel('predicted label')

In [None]:
plot_confusion_matrix(np.asarray(label_test, dtype="float64"), pred_test_MNB)