In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

### The dataset contains one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw message.

In [None]:
# Important imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#### Get the Data

In [None]:
# Load the raw CSV into a DataFrame, decoding the file as Latin-1.
# NOTE(review): presumably the file is not valid UTF-8 — confirm before changing the encoding.
data = pd.read_csv("../input/spam.csv", encoding='latin-1')

#### Let's check our dataset

In [None]:
data.head()

In [None]:
# Replace all five column names at once: the first two become 'label' and 'message',
# the remaining three get placeholder names. The frame must have exactly five columns.
data.columns = ['label', 'message', 'line1', 'line2', 'line3']

In [None]:
data.info()

In [None]:
data.describe()

#### As we can see, the last three columns have a huge number of null values. It will be better to drop them.

In [None]:
# Drop the three mostly-empty placeholder columns. Rebinding `data` (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed columns.
data = data.drop(columns=['line1', 'line2', 'line3'], errors='ignore')

In [None]:
data.head()

#### Exploring the data

In [None]:
data.groupby('label').describe()

#### As we can see, there are many duplicate messages.

#### I will add a feature holding the length of each message

In [None]:
# Character count of every message. The vectorised .str accessor avoids a
# Python-level call per row and yields NaN for a missing message instead of raising.
data['length'] = data['message'].str.len()
data.head()

### Data Visualization

In [None]:
# Distribution of message lengths over the whole dataset. Keep the returned Axes
# so the figure can be given a title and an x-axis label and stand on its own.
ax = data['length'].plot(bins=50, kind='hist', cmap='coolwarm')
ax.set(title='Message length distribution', xlabel='Message length (characters)')
plt.show()

#### From here we can see that there are some really long messages

In [None]:
data.length.describe()

#### The maximum message length is 910 characters. Let's take a look at that message

In [None]:
# Fetch the longest message via the index of the maximum length rather than a
# hard-coded length value, so the cell keeps working if the dataset changes.
data.loc[data['length'].idxmax(), 'message']

#### This is not a spam message. It looks like some sort of love letter

In [None]:
# One histogram panel of message length per label value.
data.hist(
    column='length',
    by='label',
    bins=50,
    figsize=(12, 4),
)
plt.show()

#### From this we can see the trend that spam messages tend to have more characters.

### Preprocessing

#### First I'll remove the common words ('the', 'a', etc.), as they carry very little importance for prediction. I will use the NLTK library to do this.
#### And then I will convert the raw messages (sequence of characters) into vectors (sequences of numbers).

#### Let's see some stopwords.

In [None]:
import nltk
import string
from nltk.corpus import stopwords
stopwords.words('english')[0:10]

In [None]:
from functools import lru_cache

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def remove_stopword(mess, stop_words=None):
    """Strip punctuation from a message and return its tokens without stop words.

    Parameters
    ----------
    mess : str
        The raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop word
        list, which is loaded once and cached instead of being re-read for
        every single word.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the punctuation-free message, in
        their original order and casing, excluding any token whose lower-case
        form is a stop word.
    """
    # A set gives constant-time membership checks; the default list is cached.
    stop_set = _english_stopwords() if stop_words is None else frozenset(stop_words)

    # Remove punctuation characters in a single pass.
    message = mess.translate(_PUNCTUATION_TABLE)

    # Compare case-insensitively but keep each surviving token as written.
    return [word for word in message.split() if word.lower() not in stop_set]

#### Checking how it's working

In [None]:
data['message'].head(5).apply(remove_stopword)

In [None]:
data.head()

### Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation; the fixed seed makes the split repeatable.
msg_train, msg_test, label_train, label_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.3,
    random_state=101,
)

# Sanity check: training size, test size and their total.
n_train, n_test = len(msg_train), len(msg_test)
print(n_train, n_test, n_train + n_test)

### Creating a Pipeline

#### This will allow us to set up all the transformations that we will do to the data for future use. I am using Bag of Words (CountVectorizer), Tfidf (Term Frequency - Inverse Document Frequency) and Naive Bayes classifier. 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Ordered steps: raw text -> token counts -> TF-IDF weights -> Naive Bayes classifier.
pipeline_steps = [
    ('vector', CountVectorizer(analyzer=remove_stopword)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(pipeline_steps)

#### Now I can pass the data through Pipeline and it will do all the preprocessing

In [None]:
pipeline.fit(msg_train,label_train)

### Prediction and Evaluation

In [None]:
predictions = pipeline.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Compute both summaries first, then print them separated by blank lines.
conf_mat = confusion_matrix(label_test, predictions)
report = classification_report(label_test, predictions)

print(conf_mat)
print("\n")
print(report)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fraction of test messages classified correctly, reported as a percentage.
accuracy = accuracy_score(label_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)