# Basic imports

In [None]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read data from csv

**Pandas - [pd.read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)**

In [None]:
# Load the raw SMS dataset; the CSV is decoded as latin-1.
sms = pd.read_csv(
    '../input/spam.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the freshly loaded frame.
sms.head()

# It seems the last three columns don't contain any information

**Step one**: Remove the useless columns


**Step Two**: Rename the *v1* and *v2* columns to *Class* and *Text*, respectively

In [None]:
# Drop the three unnamed columns and rename v1/v2 to Class/Text.
# The frame is rebuilt (no inplace mutation), the drop tolerates already-removed
# columns, and the rename is by name rather than by position, so re-running this
# cell is a no-op instead of raising KeyError or a length-mismatch error.
sms = (
    sms
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'Class', 'v2': 'Text'})
)

In [None]:
# Preview the frame again to check the remaining columns and their new names.
sms.head()

# Let's see some information about the data



In [None]:
# Add a column holding the character length of each message.
sms['text_len'] = sms['Text'].apply(len)

In [None]:
# Bar chart of the number of messages per class.
# `factorplot` was renamed to `catplot` (and later removed) in seaborn, and
# positional data arguments are no longer accepted, so the keyword form is used.
# The trailing semicolon hides the FacetGrid repr in the cell output.
sns.catplot(x='Class', data=sms, kind='count');

In [None]:
# Per-class mean of the numeric columns (here: text_len).
# numeric_only=True is passed explicitly because recent pandas versions raise a
# TypeError on the non-numeric 'Text' column instead of silently dropping it.
sms.groupby(['Class']).mean(numeric_only=True)

# Approach #1


Let's use **Bag-of-Words** and **Naive Bayes**

This will count the frequency of the words, which will be our features in this case

**[Udacity - Bag of Words - Intro to Machine Learning](https://www.youtube.com/watch?v=OGK9SHt8SWg)**


**[Sklearn - CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)**

**[Sklearn - Train-Test-Split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)**

**[Sklearn - Pipeline](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def pre_processing(document):
    """Tokenize a document and drop English stopwords.

    Parameters
    ----------
    document : str
        Raw text of a single message.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize(document)``, in their original
        order, excluding every token found in NLTK's English stopword list.
    """
    # Build the stopword collection once per call as a set; the original
    # reloaded the full list (and scanned it linearly) for every single token.
    english_stopwords = set(stopwords.words('english'))
    # NOTE(review): the membership test is case-sensitive, so capitalised
    # stopwords survive if the list is all lowercase; confirm this is intended.
    return [word for word in word_tokenize(document) if word not in english_stopwords]

In [None]:
# Flatten the text and label columns to 1-D arrays, then split them into
# train and test parts with a fixed seed so the split is reproducible.
texts = np.ravel(sms['Text'])
labels = np.ravel(sms['Class'])
x_train, x_test, y_train, y_test = train_test_split(texts, labels, random_state=3)

In [None]:
# Approach #1 pipeline: token counts (using the custom analyzer) feeding a
# multinomial Naive Bayes classifier.
bow_vectorizer = CountVectorizer(analyzer=pre_processing)
bow_classifier = MultinomialNB()
pipe = Pipeline(steps=[('bow', bow_vectorizer), ('classifier', bow_classifier)])

In [None]:
# Fit the bag-of-words + Naive Bayes pipeline on the training split.
pipe.fit(x_train,y_train)

In [None]:
# Predict class labels for the held-out test messages.
predicts = pipe.predict(x_test)

In [None]:
# classification_report expects (y_true, y_pred). The original call passed the
# predictions first, which swaps precision with recall and reports the support
# of the predicted labels instead of the true ones.
print(classification_report(y_test, predicts))

# Approach #2

Use of **TF-IDF** and **Naive Bayes**


**[Sklearn - TF-IDF](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)**


**[Udacity - Weighting by Term Frequency - Intro to Machine Learning](https://www.youtube.com/watch?v=t2Nq3MFK_pg)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Approach #2 pipeline: TF-IDF weights (using the custom analyzer) feeding a
# multinomial Naive Bayes classifier.
tfidf_vectorizer = TfidfVectorizer(analyzer=pre_processing)
tfidf_classifier = MultinomialNB()
pipe2 = Pipeline(steps=[('tdidf', tfidf_vectorizer), ('classifier', tfidf_classifier)])

In [None]:
# Fit the TF-IDF + Naive Bayes pipeline on the same training split.
pipe2.fit(x_train,y_train)

In [None]:
# Predict class labels for the held-out test messages with the second pipeline.
predict2 = pipe2.predict(x_test)

In [None]:
# classification_report expects (y_true, y_pred). The original call passed the
# predictions first, which swaps precision with recall and reports the support
# of the predicted labels instead of the true ones.
print(classification_report(y_test, predict2))