In [101]:
import pandas as pd

In [102]:
df = pd.read_csv("C:/Users/rishikesh/sms.csv", sep='\t', header = None, names = ['label', 'message'])

In [129]:
df.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null int64
message    5572 non-null object
dtypes: int64(1), object(1)
memory usage: 87.1+ KB


In [103]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


As scikit works with numerical data, so replacing the strings with numericals i.e. 0 for 'ham' and 1 for 'spam'

In [104]:
df['label'] = df.label.map({'ham' :0, 'spam':1})

Now that the strings in label are replaced by numbers, procced to splitting the data into training dataset and testing dataset.

In [105]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'])

In [106]:
X_train.head()

4553    SYMPTOMS when U are in love: "1.U like listeni...
1524                                      Yup ok thanx...
4450    Urgent UR awarded a complimentary trip to Euro...
4112    URGENT! Your Mobile number has been awarded a ...
975                              Eh u send wrongly lar...
Name: message, dtype: object

In [107]:
X_test.shape

(1393,)

In [108]:
y_train.shape

(4179,)

In [109]:
y_test.shape

(1393,)

In [110]:
y_test.head()

4899    0
4360    0
4276    0
2250    1
2709    0
Name: label, dtype: int64

Using CountVectorizer to create BagOfWords, which contain all the words and the frequency of their occurence with respect to the document

In [132]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer( analyzer ='word', token_pattern=r'\b[^\d\W]+\b', stop_words = 'english')

Here, regular expression (token_pattern=r'\b[^\d\W]+\b') is used as a parameter to ensure that all the numericals are deleted from the message as the numbers(digits) in message are irrelevant with the context i.e. whether the message is ham or spam 

In [112]:
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[^\\d\\W]+\\b',
        tokenizer=None, vocabulary=None)

Now, let's fit the data to count_vector object

In [134]:
training_data = count_vector.fit_transform(X_train).toarray()
testing_data = count_vector.transform(X_test).toarray()

Before proceeding further lets confirm if we have correctly crated a matrix 

In [136]:
doc_array = count_vector.transform(X_train).toarray()
doc_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

Let's create a frequency_matrix to verify that the words are separated well and that frequency is rightly calculated

In [137]:
frequency_matrix = pd.DataFrame(doc_array, columns = count_vector.get_feature_names())
frequency_matrix.head()

Unnamed: 0,_,aah,aaniye,aaooooright,aathi,ab,abbey,abeg,abel,aberdeen,...,zoe,zogtorius,zoom,zouk,zyada,èn,é,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
from sklearn.naive_bayes import MultinomialNB

In [139]:
naive_bayes = MultinomialNB()

In [140]:
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [141]:
y_pred = naive_bayes.predict(testing_data)

In [142]:
y_pred.shape

(1393,)

In [143]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [144]:
print("Accuracy : ",format(accuracy_score(y_test,y_pred)))

Accuracy :  0.9842067480258435


In [145]:
print("Precision : ", format(precision_score(y_test, y_pred)))

Precision :  0.9479166666666666


In [146]:
print("Recall score : " , format(recall_score(y_test,y_pred)))

Recall score :  0.9381443298969072


In [147]:
print("f1_score : ", format(f1_score(y_test,y_pred)))

f1_score :  0.9430051813471502
