In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show the contents of the input directory so the data file names are visible.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the SMS spam CSV (decoded as ISO-8859-1) and preview its first rows.
message = pd.read_csv(
    "../input/spam.csv",
    encoding='ISO-8859-1',
)
message.head()


Now we should remove the null values from the dataset by dropping the `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` columns.

In [None]:
# Discard the three "Unnamed" columns in place; only v1 and v2 are kept.
message.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    inplace=True,
)

In [None]:
# Replace the terse v1/v2 headers with descriptive names, then preview.
column_names = {'v1': 'label', 'v2': 'messages'}
message.rename(columns=column_names, inplace=True)
message.head()

**As we continue our analysis we want to start thinking about the features we are going to be using. This goes along with the general idea of feature engineering. The better your domain knowledge on the data, the better your ability to engineer more features from it. Feature engineering is a very large part of spam detection in general. I encourage you to read up on the topic!**

**Let's make a new column to detect how long the text messages are:**

In [None]:
# Character count of every message, stored as a new 'length' feature column.
message_lengths = message['messages'].apply(len)
message['length'] = message_lengths
message.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Histogram of message lengths over the whole dataset (50 bins).
length_column = message['length']
length_column.plot(kind='hist', bins=50)

In [None]:
# Compare the message-length distribution of each label side by side.
# DataFrame.hist selects the plotted column through `column` (singular);
# `columns` is not one of its parameters.
message.hist(column='length', by='label', bins=50)

 **Text - Preprocessing**

In [None]:
from nltk.corpus import stopwords
import string

In [None]:
# Peek at the first ten entries of NLTK's English stop-word list.
english_stopwords = stopwords.words('english')
english_stopwords[:10]

In [None]:
def text_preprocess(mess, stop_words=None):
    """Tokenise a message: strip punctuation, then drop stop words.

    Parameters
    ----------
    mess : str
        Raw message text.
    stop_words : iterable of str, optional
        Words to discard; compared case-insensitively. When omitted, NLTK's
        English list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in their original order
        and with their original casing.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call instead of re-reading the stop-word list
    # for every single token; a set also makes each membership test O(1).
    stop_set = {word.lower() for word in stop_words}

    # Remove every character listed in string.punctuation.
    nonpuc = ''.join(char for char in mess if char not in string.punctuation)

    # `lower` has to be *called*: the bound method object `word.lower` is never
    # equal to a string, so without the parentheses nothing is filtered out.
    return [word for word in nonpuc.split() if word.lower() not in stop_set]

In [None]:
# Quick sanity check of the tokenizer on a hand-written sentence.
sample_text = "my name is it has  Mayur."
text_preprocess(sample_text)

In [None]:
# Tokenise just the first five messages to inspect the output.
first_messages = message['messages'].head(5)
first_messages.apply(text_preprocess)

**Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from all messages, using text_preprocess as the
# analyzer, then report how many distinct tokens were found.
bow_trans = CountVectorizer(analyzer=text_preprocess)
bow_trans.fit(message['messages'])
vocabulary_size = len(bow_trans.vocabulary_)
print(vocabulary_size)

In [None]:
# Encode every message as a row of token counts over the fitted vocabulary.
raw_messages = message['messages']
messages_bow = bow_trans.transform(raw_messages)

In [None]:
# Report the dimensions of the count matrix and how many cells are non-zero.
for caption, value in (
    ('Shape of Sparse Matrix: ', messages_bow.shape),
    ('Amount of Non-Zero occurences: ', messages_bow.nnz),
):
    print(caption, value)

**After the counting, the term weighting and normalization can be done with TF-IDF, using scikit-learn's TfidfTransformer.**

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the bag-of-words counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)

In [None]:
# Re-weight the raw counts with the fitted TF-IDF transformer and show the
# shape of the resulting matrix.
messages_tfidf = tfidf_transformer.transform(messages_bow)
tfidf_shape = messages_tfidf.shape
print(tfidf_shape)

**Training a model**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(messages_tfidf, message['label'])

In [None]:
from sklearn.metrics import classification_report

# Predict on the same matrix the model was fitted on and print the report
# (true labels first, predictions second).
all_predictions = spam_detect_model.predict(messages_tfidf)
training_report = classification_report(message['label'], all_predictions)
print(training_report)

**You should never actually evaluate on the same dataset you train on!**

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split, and every metric computed from it, is the
# same on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42

msg_train, msg_test, label_train, label_test = train_test_split(
    message['messages'],
    message['label'],
    test_size=0.3,
    random_state=SPLIT_SEED,
)

**Creating a Data Pipeline**

Let's run our model again and then predict on the test set. We will use scikit-learn's pipeline capabilities to store the whole workflow as a single pipeline. This will allow us to set up all the transformations that we will apply to the data for future use. Let's see an example of how it works:

In [None]:
from sklearn.pipeline import Pipeline

# Three chained stages:
#   bow        - strings to token integer counts
#   tfidf      - integer counts to weighted TF-IDF scores
#   classifier - Naive Bayes classifier trained on the TF-IDF vectors
pipeline_steps = [
    ('bow', CountVectorizer(analyzer=text_preprocess)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Fit the whole pipeline on the training split only, then score it on the
# held-out messages.
pipeline.fit(msg_train, label_train)
predictions = pipeline.predict(msg_test)

# classification_report expects (y_true, y_pred). Passing the predictions
# first misreports precision, recall and support for each class.
print(classification_report(label_test, predictions))