# SMS Messages Prediction

# Reading Data

In [2]:
import pandas as pd
data = pd.read_csv("SMSData_Final.csv", nrows = 6000)
title = data['Title']
messages = data['Message']
print(data)

     Title                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DATE ON SUNDAY WITH WILL!!
15    sp

# Cleaning Data

In [4]:
import pip
#pip.main(['install', 'bs4'])
#pip.main(['install', 'ntlk'])

In [5]:
import re
from bs4 import BeautifulSoup 
import nltk
from nltk.corpus import stopwords

In [6]:
def review_to_words( raw_review ):

    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "html5lib").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 

In [7]:
num_messages = messages.size
print "Cleaning and parsing the training set messages...\n"
clean_train_messages = []
for i in xrange( 0, num_messages):
    if( (i+1)%1000 == 0 ):
        print "Review %d of %d\n" % ( i+1, num_messages )                                                                    
    clean_train_messages.append( review_to_words( messages[i] ))


Cleaning and parsing the training set messages...

Review 1000 of 5574

Review 2000 of 5574

Review 3000 of 5574

Review 4000 of 5574

Review 5000 of 5574



# Creating bag of word

In [9]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 3000) 
train_data_features = vectorizer.fit_transform(clean_train_messages[0:4460])
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [10]:
print train_data_features.shape
print train_data_features

vocab = vectorizer.get_feature_names()

(4460L, 3000L)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [11]:
Y=title.values.flatten()


In [12]:
print(Y)

['ham' 'ham' 'spam' ..., 'ham' 'ham' 'ham']


In [13]:
from sklearn.preprocessing import LabelEncoder
data = data.apply(LabelEncoder().fit_transform)
message_title = data['Title']

# Train and Test Data

In [15]:

from sklearn.model_selection import train_test_split
## Split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(train_data_features, message_title[0:4460], random_state=42, test_size = 0.2)
#print(X_test)

# Applying KNN

In [17]:
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
## Fit the model on the training data.
knn.fit(train_data_features, message_title[0:4460])
## See how the model performs on the test data.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Accuracy of KNN 

In [40]:
Accuracy_KNN= knn.score(X_test, y_test)*100
print(Accuracy_KNN)

93.9461883408


# Applying DicisionTree 

In [21]:
from sklearn import tree
import numpy as np
clf = tree.DecisionTreeClassifier()
clf.fit(train_data_features, np.array(Y[0:4460]))


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
tX = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
# prediction
Y_pred1 = clf.predict(tX)
print(Y.shape)

(5574L,)


In [23]:
Y[4461:5574]
Y_pred1

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

## Accuracy of DicisionTree 

In [25]:
from sklearn.metrics import accuracy_score
Accuracy_DT =accuracy_score(Y[4461:5574], Y_pred1)*100
print(Accuracy_DT)

97.1248876909


In [26]:
#Naive bayes
Y=message_title.values.flatten()

# Applying Naive Bayes 

In [28]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.00001) # alpha=0 means no laplace smoothing
clf.fit(train_data_features, np.array(Y[0:4460]))

MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [29]:
tX = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
# prediction
Y_pred = clf.predict(tX)
print Y_pred

[0 0 0 ..., 0 0 0]


In [30]:
Y[4461:5574]
Y_pred


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Accuracy of Naive Bayes 

In [32]:
from sklearn.metrics import accuracy_score
Accuracy_NB = accuracy_score(Y[4461:5574], Y_pred)*100
print(Accuracy_NB)

97.6639712489


# Applying Logistic Regression 

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
#logreg = LogisticRegression(random_state=0)
logreg = linear_model.LogisticRegression(random_state=0)
logreg.fit(train_data_features, np.array(Y[0:4460]))

In [35]:
tX = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
Y_pred = logreg.predict(tX)
print Y_pred


[0 0 0 ..., 0 0 0]


## Accuracy of Logistic Regression

In [37]:
Y[4461:5574]
Y_pred
from sklearn.metrics import accuracy_score
Accuracy_LR=accuracy_score(Y[4461:5574], Y_pred)*100
print(Accuracy_LR)


98.1132075472


# Applying SVM

In [45]:
from sklearn import svm
clf=svm.SVC()
clf.fit(train_data_features, np.array(Y[0:4460]))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
tX = vectorizer.transform(clean_train_messages[4461:5574]).toarray()


In [46]:
Obt_label=clf.predict(tX)

## Accuracy Of SVM

In [48]:
from sklearn.metrics import accuracy_score
Accuracy_SVM = accuracy_score(Y[4461:5574], Obt_label)*100
print(Accuracy_SVM)

87.0619946092


# Random Forest 

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_data_features, np.array(Y[0:4460]))



#print(clf.feature_importances_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [56]:
tx = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
Obt=clf.predict(tx)

## Accuracy of Random Forest

In [57]:
from sklearn.metrics import accuracy_score
Accuracy_RandomForest= accuracy_score(Y[4461:5574], Obt)*100
print(Accuracy_RandomForest)

87.6909254268


# MultiLayer Perceptron

In [76]:
from sklearn.neural_network import MLPClassifier

clf_ = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(2, 2), random_state=1)

clf_.fit(train_data_features, np.array(Y[0:4460]))

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [77]:
t_x = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
Ob=clf_.predict(t_x)

## Accuracy of Multilayer Perception

In [78]:
from sklearn.metrics import accuracy_score
Accuracy_ANN= accuracy_score(Y[4461:5574], Ob)*100
print(Accuracy_ANN)

97.9335130279


# Gausian Naive bayes

In [80]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_data_features, np.array(Y[0:4460]))


GaussianNB(priors=None)

In [81]:
t_x = vectorizer.transform(clean_train_messages[4461:5574]).toarray()
ob=clf.predict(t_x)

## Accuracy of Gausian Naive Bayes

In [82]:
Accuracy_GNB= accuracy_score(Y[4461:5574], ob)*100
print(Accuracy_GNB)

85.1752021563


# Overall Accuracy Results of various Algorithms

In [79]:
Accuracy_Data = [Accuracy_LR, Accuracy_NB, Accuracy_KNN, Accuracy_DT, Accuracy_SVM, Accuracy_RandomForest,Accuracy_ANN]
print(Accuracy_Data)

[98.113207547169807, 97.663971248876919, 93.946188340807183, 97.124887690925434, 87.061994609164415, 87.690925426774484, 97.933513027852655]
