### Importing all the required libraries.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Importing the dataset

In [4]:
# Read the SMS spam CSV into a DataFrame, decoding the file as Latin-1.
dataset = pd.read_csv(
    '../input/spam.csv',
    encoding='latin-1',
)

### Let's have a look at the imported data

In [5]:
# Preview the first five rows of the freshly loaded frame.
dataset.head(n=5)

##### So as can be seen in the above dataset, we have in total 5 columns. Out of these 5, we can remove the last 3 as they are empty.


In [6]:
# Keep only the label and message columns by dropping the three unnamed ones.
dataset = dataset.drop(
    ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis='columns',
)

In [7]:
# Preview the first five rows again to confirm which columns remain.
dataset.head(n=5)

#### So we are left with only two columns. One of them, v1 is our target column and the other one is our sms column.
#### We can rename them if we want

In [11]:
# Give the two columns descriptive names: v1 is the label, v2 is the message text.
dataset = dataset.rename(
    columns={
        'v1': 'Target',
        'v2': 'SMS',
    }
)

#### Now we will focus on cleaning the SMS part of the dataset. So let's begin


In [9]:
import nltk

In [10]:
#We need this to remove the unwanted characters
import re

In [12]:
# The PorterStemmer reduces each word of the SMS column to its stem.
# A single stemmer instance is created here and reused by the cleaning loop below.
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [13]:
from nltk.corpus import stopwords

In [14]:
# Container for the cleaned text of every message, filled by the cleaning loop.
corpus = list()

### The code below will do the following:
#### 1. Remove any character that is not an alphabet and replace it with a space
#### 2. Convert it all to lower case
#### 3. Split the words and create a list
#### 4. Stem each word and put it in the list if it is not a stop word
#### 5. Join all the words together again
#### 6. Append them all into a corpus

In [15]:


# Build the English stop-word set once. The original loop rebuilt it for every
# single word of every message, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))


def clean_sms_text(text):
    """Return `text` as a space-joined string of stemmed, non-stop-word tokens.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the result and split it on whitespace.
      3. Drop tokens found in the English stop-word set.
      4. Stem the remaining tokens with the module-level PorterStemmer `ps`.
      5. Join the stems back together with single spaces.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        ps.stem(token) for token in tokens if token not in english_stopwords
    )


# Rebuild the corpus from scratch (instead of appending) so that re-running this
# cell does not add a second copy of every message.
corpus = [clean_sms_text(sms) for sms in dataset['SMS']]


### The CountVectorizer will make a sparse matrix out of the given words, with the columns being the words themselves

In [17]:
# Create the vectorizer with its default settings; it is fitted on the corpus
# in the next cell.
from sklearn.feature_extraction.text import  CountVectorizer
cv = CountVectorizer()

### X will hold the word-count matrix, converted from sparse to a dense array

In [18]:
# Fit the vectorizer on the cleaned corpus, then convert its output to a dense array.
X = (
    cv
    .fit_transform(corpus)
    .toarray()
)

In [19]:
# Show the feature matrix as the cell's output.
X

### y will be the target dataset

In [20]:
# The target is the first column of the frame, taken as a plain array.
y = dataset[dataset.columns[0]].values

In [21]:
# Show the raw label array as the cell's output.
y


### Encoding the values of y


In [22]:
# Encode the labels as integers: 'ham' -> 1, every other label -> 0.
#
# The encoding is computed from the first column of `dataset` rather than by
# overwriting `y` element by element, for two reasons:
#   * `y` came from `.values`, which may share memory with the DataFrame, so the
#     in-place loop could silently overwrite the label column of `dataset`;
#   * the in-place loop was not re-runnable: on a second run no entry equals
#     'ham' any more, so every label became 0.
y = np.where(dataset.iloc[:, 0].values == 'ham', 1, 0)
    

In [23]:
# Make a 64-bit integer copy of the encoded labels.
y = np.array(y, dtype=np.int64)

In [24]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

### Here I have used three different algorithms to do the classification.
#### 1. Naive Bayes
#### 2. Random Forest
#### 3. SVC

### As can be seen, Random Forest is more accurate with an accuracy of 97%

In [25]:
# Gaussian Naive Bayes classifier with default settings; fitted in the next cell.
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()

In [26]:
# Train the Naive Bayes model on the training split.
gb.fit(X=X_train, y=y_train)

In [27]:
# Naive Bayes predictions for the held-out test split.
y_pred = gb.predict(X=X_test)

In [28]:
from sklearn.metrics import confusion_matrix,classification_report
# Confusion matrix of the Naive Bayes predictions against the true test labels.
cn = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [29]:
# Print the confusion matrix stored in `cn`.
print(cn)

In [30]:
# Text report of the classification metrics for the Naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [31]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 200 trees. The seed is fixed (same value as the train/test
# split) because an unseeded forest is built differently on every run, so the
# confusion matrix and report below would not be reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=2)

In [32]:
# Train the random forest on the training split.
rf.fit(X=X_train, y=y_train)

In [33]:
# Random forest predictions for the held-out test split.
y_pred2 = rf.predict(X=X_test)


In [34]:
# Confusion matrix of the random forest predictions against the true test labels.
cn2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)

In [35]:
# Print the confusion matrix stored in `cn2`.
print(cn2)


In [36]:
# Text report of the classification metrics for the random forest predictions.
print(classification_report(y_true=y_test, y_pred=y_pred2))

In [37]:
from sklearn.svm import SVC
# Support vector classifier using a sigmoid kernel with gamma set to 1.0.
svc = SVC(
    kernel='sigmoid',
    gamma=1.0,
)

In [38]:
# Train the support vector classifier on the training split.
svc.fit(X=X_train, y=y_train)

In [39]:
# SVC predictions for the held-out test split.
pred3 = svc.predict(X=X_test)

In [40]:
# Confusion matrix of the SVC predictions against the true test labels.
cn3 = confusion_matrix(y_true=y_test, y_pred=pred3)

In [41]:
# Show the confusion matrix stored in `cn3` as the cell's output.
cn3

In [42]:
# Text report of the classification metrics for the SVC predictions.
print(classification_report(y_true=y_test, y_pred=pred3))