In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline 

In [None]:
# Load the SMS spam CSV. The file is decoded as latin-1 instead of pandas' default UTF-8.
raw_data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv',sep=',', encoding='latin-1')

In [None]:
# Preview the first rows of the raw frame.
raw_data.head()

In [None]:
# List the column names exactly as they were read from the file.
raw_data.columns


In [None]:
# Remove the three unnamed columns. Reassigning the result (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: running it a second time,
# when the columns are already gone, no longer raises KeyError.
raw_data = raw_data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Check the frame after the unnamed columns were dropped.
raw_data.head(4)

In [None]:
# Work on a copy so raw_data stays untouched by the later rename / feature columns.
df = raw_data.copy()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# First two rows.
df.head(2)

In [None]:
# Last two rows.
df.tail(2)

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Summary statistics for the columns.
df.describe()

In [None]:
# Same summary, computed separately for each value of 'v1' and transposed for display.
# NOTE: this cell relies on the original column name 'v1'; it must run before the
# rename cell below, which replaces 'v1' with 'lable'.
df.groupby('v1').describe().T

In [None]:
# Give the two columns readable names. The spelling "lable" is what every later
# cell indexes with, so it must stay as-is unless all of those cells are changed too.
df.rename(columns={"v1":"lable", "v2":"messages"}, inplace=True)

In [None]:
# Confirm the new column names.
df.head()

In [None]:
# Add a feature column holding the character count of each message.
df['length'] = df['messages'].apply(len)

In [None]:
# Confirm the new 'length' column.
df.head()

In [None]:
# Histogram of message lengths over the whole dataset (100 bins).
sns.set_style('darkgrid')
df['length'].plot.hist(bins=100, color='red')

In [None]:
# Summary statistics of the message lengths (count, mean, min, max, quartiles).
df['length'].describe()

In [None]:
# Show the text of the longest message. The row is located with idxmax() rather than
# by matching a hard-coded length (previously 910, presumably the max reported by
# describe() above), so the cell cannot fail with IndexError if the data changes.
df.loc[df['length'].idxmax(), 'messages']

In [None]:
# One length histogram per value of the 'lable' column, drawn side by side
# so the length distributions of the classes can be compared.
sns.set_style('darkgrid')
df.hist(column='length', by='lable', bins=100, figsize=(12,5))

In [None]:
import string
from nltk.corpus import stopwords

In [None]:
def remove_pucn(txt, stop_words=None):
    """
    Tokenize a message for the bag-of-words model.

    1. remove punctuation
    2. remove stopwords
    3. return clean text in list format

    Parameters
    ----------
    txt : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``txt`` after every character found in
        ``string.punctuation`` has been deleted and stop words have been dropped.
        Kept tokens retain their original casing; only the stop-word comparison
        is done on the lower-cased token.
    """
    if stop_words is None:
        # Build the stop-word set once per call instead of once per token, and use
        # a set so each membership test is O(1) rather than a list scan.
        stop_words = set(stopwords.words('english'))

    # Delete all punctuation characters in a single pass.
    no_punc = txt.translate(str.maketrans('', '', string.punctuation))
    return [w for w in no_punc.split() if w.lower() not in stop_words]

In [None]:
# Reminder of what the raw messages look like before tokenization.
df.head(4)

In [None]:
# Preview the tokenizer on every message; the result is only displayed, not stored.
df['messages'].apply(remove_pucn)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fit a bag-of-words vectorizer on all messages, using remove_pucn as the analyzer
# so the vocabulary consists of the tokens that function returns.
bag_of_words_transf = CountVectorizer(analyzer=remove_pucn).fit(df['messages'])

In [None]:
bag_of_words_transf.vocabulary_

In [None]:
print(bag_of_words_transf.vocabulary_)

In [None]:
# Pick the message with index label 49 as a worked example
# (the 50th row, assuming the default 0-based index from read_csv — confirm).
msg50 = df['messages'][49]

In [None]:
# Show the raw text of the example message.
print(msg50)

In [None]:
# Vectorize the single message; transform() expects an iterable of documents,
# hence the one-element list.
bow50 = bag_of_words_transf.transform([msg50])

In [None]:
# Print the sparse row: the non-zero column indices and their counts.
print(bow50)

In [None]:
# Shape of the vectorized example (one row per input document).
bow50.shape

In [None]:
bag_of_words_transf.get_feature_names()[4777]

In [None]:
features = bag_of_words_transf.get_feature_names()

In [None]:
# Rebind `features` from the sequence of feature names to a single-column DataFrame.
features = pd.DataFrame(features)

In [None]:
# Confirm `features` is now a DataFrame.
type(features)

In [None]:
# First three vocabulary entries.
features.head(3)

In [None]:
# Vectorize every message into one sparse bag-of-words matrix.
msg_bow = bag_of_words_transf.transform(df['messages'])

In [None]:
# (number of messages, vocabulary size).
msg_bow.shape

In [None]:
## Number of non-zero entries stored in the sparse matrix (not a count of messages)
msg_bow.nnz

In [None]:
# nnz / (rows * cols) is the share of cells that are filled, i.e. the matrix
# *density*. Sparsity is its complement, so report both under the correct names.
n_cells = msg_bow.shape[0] * msg_bow.shape[1]
density = 100.0 * msg_bow.nnz / n_cells
sparsity = 100.0 - density
print('density: {:.4f}%'.format(density))
print('sparsity: {:.4f}%'.format(sparsity))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Learn inverse-document-frequency weights from the full bag-of-words matrix.
tf_idf_trans = TfidfTransformer().fit(msg_bow)

In [None]:
# Re-weight the example message's counts with the learned TF-IDF weights.
tfidf50 = tf_idf_trans.transform(bow50)

In [None]:
# Print the sparse TF-IDF row for the example message.
print(tfidf50)

In [None]:
# IDF weight learned for the token 'actor'. The lookup raises KeyError if that exact
# token is not in the fitted vocabulary (remove_pucn keeps the original casing).
tf_idf_trans.idf_[bag_of_words_transf.vocabulary_['actor']]

In [None]:
# TF-IDF matrix for every message; this is the feature matrix the classifiers train on.
msg_tf_idf = tf_idf_trans.transform(msg_bow)

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB


In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of all messages.
multiNB = MultinomialNB().fit(msg_tf_idf,df['lable'])


In [None]:
# Predicted label for the example message.
multiNB.predict(tfidf50)

In [None]:
# Fit a Gaussian Naive Bayes classifier. The sparse TF-IDF matrix is converted with
# .toarray() first, which materializes the full dense matrix in memory.
gaussianNB = GaussianNB().fit(msg_tf_idf.toarray(), df['lable'])


In [None]:
# Predicted label for the example message (densified the same way as the training data).
gaussianNB.predict(tfidf50.toarray())

In [None]:
# Fit a Bernoulli Naive Bayes classifier on the same TF-IDF features.
bernNB = BernoulliNB().fit(msg_tf_idf, df['lable'])

In [None]:
# Predict on the TF-IDF row so the input matches the features bernNB was fitted on
# (msg_tf_idf) and the other classifiers' example predictions, not the raw counts in bow50.
bernNB.predict(tfidf50)

In [None]:
# Predict labels for every message with each fitted model. These are the same
# messages the models were fitted on, so the results do not measure generalization.
multiN_all_pred = multiNB.predict(msg_tf_idf)

In [None]:
# GaussianNB again receives the dense form of the TF-IDF matrix.
gaussian_all_pred = gaussianNB.predict(msg_tf_idf.toarray())

In [None]:
# Bernoulli model predictions on the TF-IDF matrix.
bernNB_all_pred = bernNB.predict(msg_tf_idf)

In [None]:
# Print each model's predictions under its own heading, followed by a blank gap.
_prediction_sections = (
    ('Multinormial Naive Bayes all prediction', multiN_all_pred),
    ('Gaussian Naive Bayes all prediction', gaussian_all_pred),
    ('Bernoulli Naive Bayes all prediction', bernNB_all_pred),
)
for _heading, _predicted in _prediction_sections:
    print(_heading)
    print(_predicted)
    print('\n'*2)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the messages for testing. A fixed random_state makes the split, and
# therefore the metrics computed from it, reproducible across kernel restarts.
msg_train, msg_test, label_train, label_test = train_test_split(
    df['messages'], df['lable'], test_size=.30, random_state=42
)

In [None]:
# Sizes of the training and test splits.
msg_train.shape , msg_test.shape

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the three steps used above into one estimator: bag-of-words counts
# (tokenized by remove_pucn) -> TF-IDF weighting -> multinomial Naive Bayes.
pipeLine = Pipeline([('bow', CountVectorizer(analyzer=remove_pucn)),
                     ('tfidf', TfidfTransformer()),
                     ('classifier', MultinomialNB())
                    ])

In [None]:
# Fit every step on the training messages only.
pipeLine.fit(msg_train, label_train)

In [None]:
# Predict labels for the held-out test messages.
pred = pipeLine.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(label_test, pred))

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(label_test, pred))

In [None]:
# Overall fraction of test messages classified correctly.
print(accuracy_score(label_test, pred))