In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the comma-separated SMS file (latin-1 encoded) and preview the first rows.
messages = pd.read_csv(
    '../input/spam.csv',
    sep=',',
    encoding='latin-1',
)
messages.head()

Drop the columns that are not required.


In [None]:
# Remove the three unnamed columns. Reassigning (instead of inplace=True) and
# passing errors='ignore' keeps this cell safe to re-run once the columns are gone.
messages = messages.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
messages.info()

In [None]:
# Bar chart of how many messages carry each label in column v1.
# The column is passed by keyword because recent seaborn releases no longer
# accept the plotted variable as the first positional argument.
sns.countplot(x='v1', data=messages)
plt.xlabel('Label')
plt.title('Number of ham and spam messages')

In [None]:
# Preview the first rows of the frame.
messages.head()

# Exploratory Data Analysis(EDA)

In [None]:
# Summary statistics for the columns of the frame.
messages.describe()

In [None]:
# Same summary, computed separately for each value of the v1 label column.
messages.groupby('v1').describe()

Now we have to start thinking about which features to use, a step known as
feature engineering. The better we know the data, the better we will be
able to engineer features from it.

Add one more column holding the length of each message.

In [None]:
# Character count of each message. The vectorised .str.len() accessor yields
# NaN for a missing message instead of raising the way apply(len) would.
messages['length'] = messages['v2'].str.len()

In [None]:
# Preview the frame, now including the 'length' column.
messages.head()

# Data Visualization

In [None]:
# Distribution of message lengths, bucketed into 50 bins.
messages['length'].plot(kind='hist', bins=50)

In [None]:
# Summary statistics (count, mean, min, max, quartiles) of the message lengths.
messages['length'].describe()

It looks like the longest message is 910 characters.

Let's explore whether message length is a distinguishing feature between spam and ham messages.

In [None]:
# One histogram of 'length' per v1 label; the trailing ';' suppresses the text repr.
messages.hist(column='length', by='v1', bins=60, figsize=(12,4));

Spam messages tend to have more characters than ham messages.

# Remove Punctuation & Stopwords 

Let's do some text pre-processing. Classification algorithms need numerical
feature vectors, so to convert the words into vectors we will use the bag-of-words approach.

In [None]:
import string
# Display the characters that the string module lists as punctuation.
string.punctuation

In [None]:
from nltk.corpus import stopwords
# Peek at ten entries (positions 100-109) of NLTK's English stop-word list.
stopwords.words("english")[100:110]

In [None]:
# Toy sentence used to demonstrate punctuation stripping.
mess = 'Sample Message! Notice: it has punctuation.'
# Keep every character that is not listed in string.punctuation.
nopunc = list(filter(lambda ch: ch not in string.punctuation, mess))

In [None]:
# Glue the kept characters back together into a single string and display it.
nopunc = "".join(nopunc)
nopunc

In [None]:
# Split the punctuation-free string on whitespace into tokens.
nopunc.split()

In [None]:
# Build the English stop-word set once up front so each token is checked with a
# fast set lookup instead of re-loading the stop-word list for every token.
english_stopwords = set(stopwords.words('english'))
# Keep the tokens whose lower-cased form is not a stop word.
clean_mess = [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [None]:
# Display the tokens that survived stop-word removal.
clean_mess

Let's apply the same steps to all messages in the data frame.


In [None]:
def text_process(mess):
    '''
    Tokenise a message into clean words.

    Steps:
    1. Remove every character listed in string.punctuation.
    2. Split the remaining text on whitespace.
    3. Drop tokens whose lower-cased form is an English stop word.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.
    '''
    # Load the stop-word list once per call and store it as a set, so each token
    # costs one hash lookup instead of a fresh list load plus a linear scan.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    nopunc = "".join(char for char in mess if char not in punctuation)
    return [word for word in nopunc.split() if word.lower() not in stop_words]
    

In [None]:
# Preview the frame before the columns are renamed.
messages.head()

In [None]:
# Give the two raw columns descriptive names: v1 -> label, v2 -> message.
messages = messages.rename(
    {"v1": "label", "v2": "message"},
    axis='columns',
)

In [None]:
# Numeric target column: 1 for 'spam', 0 for 'ham'.
messages['spam'] = (
    messages['label']
    .map({'spam': 1, 'ham': 0})
    .astype(int)
)
messages.head(10)

# apply Tokenization

In [None]:
# Run text_process on the first five messages only, as a quick sanity check.
messages['message'].head(5).apply(text_process)

Currently each message is a list of tokens; now we need to convert each of those messages into a vector that scikit-learn's models can work with.
We'll do that in three steps using the bag-of-words model:

**1. Count how many times does a word occur in each message (known as term frequency).**

**2. Weigh the counts, so that frequent tokens get lower weight (inverse document frequency).**

**3. Normalize the vector to unit length, to abstract from the original text length (L2 norm).**

*Let's begin with the first step:*

**Each vector will have as many dimensions as there are unique words in the SMS corpus. We will use scikit-learn's CountVectorizer. This model converts a collection of text documents into a matrix of token counts.**

**We can imagine this as a 2-dimensional matrix, where one dimension is the entire vocabulary (one row per word) and the other dimension is the actual documents, in this case one column per text message.**

For example:

|                  | Message 1 | Message 2 | ... | Message N |
|------------------|-----------|-----------|-----|-----------|
| **Word 1 Count** | 0         | 1         | ... | 0         |
| **Word 2 Count** | 0         | 1         | ... | 0         |
| **...**          |           | ...       | ... | ...       |
| **Word N Count** | 0         | 1         | ... | 1         |

Since there are so many messages, we can expect a lot of zero counts, because most words do not appear in any given message.
Because of this, scikit-learn will output a [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix).

# CountVectorizer(bag-of-word)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the vocabulary from every message, using text_process as the tokenizer.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(messages['message'])

In [None]:
# Number of distinct tokens in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

In [None]:
# Pull the message stored under index label 3
# (the fourth row, assuming the default 0-based index).
mess4 = messages.loc[3, 'message']
print(mess4)

In [None]:
# Encode that single message as a bag-of-words count vector (transform expects a list of documents).
bow4 = bow_transformer.transform([mess4])

In [None]:
# Show the non-zero entries of the encoded message.
print(bow4)

In [None]:
# Shape of the encoded message: one row by vocabulary-size columns.
print(bow4.shape)

In [None]:
# Look up the vocabulary token stored at column 3996.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
bow_transformer.get_feature_names_out()[3996]

In [None]:
# Look up the vocabulary token stored at column 9445.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
bow_transformer.get_feature_names_out()[9445]

In [None]:
# Encode every message in the frame with the fitted vectorizer.
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
# Dimensions of the document-term matrix.
print('Shape of Sparse Matrix', messages_bow.shape)

In [None]:
messages_bow.nnz  # number of non-zero entries stored in the sparse matrix

# Vectorizing Data: TF-IDF

**Weighting and normalization are done using TF-IDF.**

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Learn the inverse-document-frequency weights from the full count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)

In [None]:
# TF-IDF representation of the single example message.
tfidf4 = tfidf_transformer.transform(bow4)

In [None]:
print(tfidf4) # TF-IDF weight of each word present in the message

In [None]:
# TF-IDF representation of every message in the corpus.
messages_tfidf = tfidf_transformer.transform(messages_bow)

**The messages are finally converted into vectors.**

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the TF-IDF vectors of all messages.
# A fixed random_state makes the fitted forest reproducible across runs.
model = RandomForestClassifier(random_state=42).fit(messages_tfidf, messages['label'])

In [None]:
# Predicted label for the single example message.
model.predict(tfidf4)

In [None]:
# Actual label stored for that same message, for comparison with the prediction.
messages['label'][3]

In [None]:
# Predict on the same vectors the model was fitted on, so this is not a held-out evaluation.
all_pred = model.predict(messages_tfidf)

In [None]:
# Display the array of predicted labels.
all_pred

**The proper way to evaluate the model is with a train/test split.**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the messages for testing; the seed fixes the shuffle.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.3,
    random_state=42,
)

**Pipeline helps to save the complete workflow**

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the three stages so raw text goes in and a predicted label comes out:
# token counts -> TF-IDF weighting -> random forest.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    # Fixed seed so the reported metrics are reproducible across runs.
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
# Fit every pipeline stage on the training messages only.
pipeline.fit(msg_train, label_train)

In [None]:
# Predict labels for the held-out test messages.
predictions = pipeline.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(label_test, predictions))

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b