In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk

import warnings
warnings.simplefilter(action='ignore')

In [5]:
# Read only the label column (v1) and the message column (v2), decoding the file as cp1252.
df = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    usecols=['v1', 'v2'],
    encoding='cp1252',
)

In [6]:
# Preview the first two rows of the freshly loaded frame.
df.head(2)

In [7]:
# (rows, columns) of the frame before any cleaning.
df.shape

In [8]:
# Column dtypes and non-null counts.
df.info()

In [9]:
# Number of rows per distinct value of the label column v1.
df['v1'].value_counts()

- Imbalanced data

In [10]:
# Give the two columns descriptive names: v1 -> target (label), v2 -> msg (message text).
df = df.rename(columns={'v1': 'target', 'v2': 'msg'})

In [11]:
# Confirm the columns are now called target and msg.
df.head(2)

In [12]:
# Class counts under the new column name.
df['target'].value_counts()

In [13]:
# Binary-encode the label: 'spam' -> 1, anything else -> 0.
df['target'] = (df['target'] == 'spam').astype('int64')

In [14]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [15]:
# Keep the first occurrence of every duplicated row and discard the repeats.
df = df.drop_duplicates(keep='first')

In [16]:
# Renumber the rows 0..n-1 and throw the old index away.
df = df.reset_index(drop=True)

In [17]:
# (rows, columns) after the duplicate rows were dropped.
df.shape

# EDA

In [18]:
# Translate the 0/1 codes into class names BEFORE counting, so every pie slice is
# labelled by its actual class instead of by the position it happens to take in
# the (frequency-sorted) value_counts() result.
(
    df['target']
    .map({0: 'Ham', 1: 'Spam'})
    .value_counts()
    .plot(kind='pie', autopct='%.2f%%', ylabel='', title='Target Data')
)

- The data is imbalanced

- Creating a new feature: the total number of characters in each message
- i.e. breaking each message down into individual characters

In [19]:
# Character count of every message.
df['msg_chars'] = df['msg'].str.len()

- Creating a new feature: the total number of words in each message
- First convert the string into a list by splitting it, then take the length of that list, which gives the number of words
- i.e. breaking each message down into individual words

In [20]:
# method 1: split on whitespace and count the pieces
print(df['msg'].str.split().map(len).head())

# method 2: count the tokens produced by the NLTK word tokenizer (the one used from here on)
print(df['msg'].map(lambda text: len(nltk.word_tokenize(text))).head())

- The `split` function divides a message into fewer words than the NLTK tokenizer does
- so we will use NLTK

In [21]:
# Token count of every message according to the NLTK word tokenizer.
df['msg_words'] = df['msg'].map(lambda text: len(nltk.word_tokenize(text)))

In [22]:
# Sentence count of every message according to the NLTK sentence tokenizer.
df['msg_sentences'] = df['msg'].map(lambda text: len(nltk.sent_tokenize(text)))

In [23]:
# Preview the frame with the three new length features attached.
df.head()

In [24]:
# Summary statistics of the numeric columns over all messages.
df.describe()

In [25]:
# Summary statistics restricted to spam rows (target == 1), shaded with the default gradient.
df.loc[df['target'] == 1].describe().style.background_gradient()

In [26]:
# Summary statistics restricted to ham rows (target == 0), shaded with the YlOrRd colour map.
df.loc[df['target'] == 0].describe().style.background_gradient(cmap='YlOrRd')

In [27]:
# Distribution of message length (in characters) for spam messages.
sns.displot(df.loc[df['target'] == 1, 'msg_chars'], label='Spam')

In [28]:

# Distribution of message length (in characters) for ham messages.
sns.displot(df.loc[df['target'] == 0, 'msg_chars'], label='Ham')

In [29]:
# Overlaid character-count distributions for spam vs ham.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and per-class density scaling is its documented replacement. Colours are passed
# explicitly (red = spam, green = ham) so the two classes are always distinguishable.
plt.figure(figsize=(15,5))
sns.histplot(df[df['target']==1]['msg_chars'],kde=True,stat='density',color='r',label='Spam')
sns.histplot(df[df['target']==0]['msg_chars'],kde=True,stat='density',color='g',label='Ham')
plt.legend()

In [30]:
# Overlaid word-count distributions for spam vs ham.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and per-class density scaling is its documented replacement. Colours are passed
# explicitly (red = spam, green = ham) so the two classes are always distinguishable.
plt.figure(figsize=(15,5))
sns.histplot(df[df['target']==1]['msg_words'],kde=True,stat='density',color='r',label='Spam')
sns.histplot(df[df['target']==0]['msg_words'],kde=True,stat='density',color='g',label='Ham')
plt.legend()

In [31]:
# Overlaid sentence-count distributions for spam vs ham.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and per-class density scaling is its documented replacement. Colours are passed
# explicitly (red = spam, green = ham) so the two classes are always distinguishable.
plt.figure(figsize=(15,5))
sns.histplot(df[df['target']==1]['msg_sentences'],kde=True,stat='density',color='r',label='Spam')
sns.histplot(df[df['target']==0]['msg_sentences'],kde=True,stat='density',color='g',label='Ham')
plt.legend()

In [32]:
# Three stacked panels (characters, words, sentences); each overlays the ham
# histogram in green with the spam histogram in red.
fig, axes = plt.subplots(3, 1, figsize=(15, 15))
for ax, feature in zip(axes, ['msg_chars', 'msg_words', 'msg_sentences']):
    sns.histplot(df.loc[df['target'] == 0, feature], color='g', label='Ham', ax=ax)
    sns.histplot(df.loc[df['target'] == 1, feature], color='r', label='Spam', ax=ax)
    ax.legend()

In [33]:
# Pairwise scatter/distribution grid of the frame's variables, coloured by class.
sns.pairplot(data=df, hue='target')

In [34]:
# Pearson correlation between the numeric columns (target and the three length features).
# The text column `msg` is excluded explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and DataFrame.corr() raises on them.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

- As the number of characters increases, the chance of a message being spam also increases
- Here we can see there is a high correlation between the number of characters, words and sentences.
- So, while building the model we cannot keep all three features because of multicollinearity; only keep one
- We will keep msg_chars because it has a higher correlation with the target feature than the others

# Data/Text Preprocessing
- So, basic text preprocessing are 
     - **lower case**(converts to lower case)
     - **tokenization** (break into words)
     - removing **special characters** (remove ?!~{}@#$%^)
     - removing **stop words**(is,if,and,the,an,a) and **punctuations** (?!.)
     - **stemming** (get root words, (loved,loving,loves=love))

In [35]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load NLTK's English stop-word list once and return it as a frozenset."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def text_prep(text):
    """Normalise one raw message into a space-separated string of stemmed tokens.

    Pipeline: lower-case -> NLTK word tokenisation -> keep only purely
    alphanumeric tokens (drops punctuation / special characters) -> remove
    English stop words -> Porter stemming.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    from nltk.stem.porter import PorterStemmer
    ps = PorterStemmer()

    # The stop-word list used to be rebuilt and scanned linearly for every single
    # token; it is now loaded once per session and looked up as a set.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept)

In [36]:
# Cleaned + stemmed version of every message, produced by text_prep.
df['trans_msg'] = df['msg'].map(text_prep)

# Word Cloud

In [49]:
# One WordCloud generator shared by the word-cloud cells that follow:
# 1200x500 canvas, white background, minimum font size 10.
from wordcloud import WordCloud
wc = WordCloud(width=1200,height=500,min_font_size=10,background_color='white')

In [64]:
# Word cloud over ALL original (unprocessed) messages, joined into one long string.
all_original_text = df['msg'].str.cat(sep=' ')
spam_wc = wc.generate(all_original_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Messages(in original msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [65]:
# Word cloud over ALL transformed (cleaned + stemmed) messages, joined into one long string.
all_transformed_text = df['trans_msg'].str.cat(sep=' ')
spam_wc = wc.generate(all_transformed_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Messages(in transformed msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [61]:
# Word cloud over the transformed text of spam messages only (target == 1).
spam_transformed_text = df.loc[df['target'] == 1, 'trans_msg'].str.cat(sep=' ')
spam_wc = wc.generate(spam_transformed_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Spam Messages(on transformed msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [62]:
# Word cloud over the original text of spam messages only (target == 1).
spam_original_text = df.loc[df['target'] == 1, 'msg'].str.cat(sep=' ')
spam_wc = wc.generate(spam_original_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Spam Messages(on original msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [66]:
# Word cloud over the original text of ham messages only (target == 0).
# The result is still stored under the shared name spam_wc used by the other word-cloud cells.
ham_original_text = df.loc[df['target'] == 0, 'msg'].str.cat(sep=' ')
spam_wc = wc.generate(ham_original_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Ham Messages(on original msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [67]:
# Word cloud over the transformed text of ham messages only (target == 0).
# The result is still stored under the shared name spam_wc used by the other word-cloud cells.
ham_transformed_text = df.loc[df['target'] == 0, 'trans_msg'].str.cat(sep=' ')
spam_wc = wc.generate(ham_transformed_text)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(spam_wc)
ax.set_title('Frequently used words in Ham Messages(on transformed msgs)', fontdict={'fontsize':20}, loc='center', color='r')
plt.show()

In [None]:
from collections import Counter

In [93]:
# Flatten every transformed spam message (target == 1) into one list of words.
spam_corpus = [
    word
    for message in df.loc[df['target'] == 1, 'trans_msg']
    for word in message.split()
]

# The 30 most frequent spam words as a two-column frame: 0 = word, 1 = count.
spam30 = pd.DataFrame(Counter(spam_corpus).most_common(30))

plt.figure(figsize=(15,5))
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x=spam30[0],y=spam30[1])
plt.xticks(rotation=90)
plt.title('Most 30 words occured in Spam Messages',fontdict={'fontsize':20},loc='center',color='r')
plt.ylabel('Count')
plt.xlabel('Message word')
plt.show()


In [94]:
# Flatten every transformed ham message (target == 0) into one list of words.
ham_corpus = [
    word
    for message in df.loc[df['target'] == 0, 'trans_msg']
    for word in message.split()
]

# The 30 most frequent ham words as a two-column frame: 0 = word, 1 = count.
ham30 = pd.DataFrame(Counter(ham_corpus).most_common(30))

plt.figure(figsize=(15,5))
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x=ham30[0],y=ham30[1])
plt.xticks(rotation=90)
plt.title('Most 30 words occured in Ham Messages',fontdict={'fontsize':20},loc='center',color='r')
plt.ylabel('Count')
plt.xlabel('Message word')
plt.show()


# Model Building
- We will try different models, starting with Naive Bayes
- Naive Bayes performs well on textual data
### ML models work on numerical data
- so first, convert the text data to numerical data, i.e. convert text to vectors
- we will use vectorization to convert text to numbers

# Note: 
- We are going to focus on the precision score more than the accuracy score.
- We don't want the model to predict ham messages as spam (False Positive), because this could cause the loss of an important message
- If the model predicts spam messages as ham, that is acceptable (though not recommended) (False Negative)

# Vectorization techniques
1. Bag of Words
2. TFIDF
3. 

## Bag of Words vectorizer

In [95]:
# Bag-of-words vectorizer, created with its default settings.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [100]:
# Learn the vocabulary from the transformed messages and encode them as word counts,
# then expand the result into a dense array.
bow_matrix = cv.fit_transform(df['trans_msg'])
X = bow_matrix.toarray()

In [101]:
# Dimensions of the dense feature array returned by the vectorizer.
X.shape

In [105]:
# Labels as a plain NumPy array (1 = spam, 0 = ham), followed by their shape.
y = df['target'].to_numpy()
y.shape

In [107]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [110]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score

In [109]:
# One fresh, default-configured instance of each Naive Bayes variant.
gb, mb, bb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [116]:
# Gaussian NB: train, predict the held-out rows, then print accuracy,
# confusion matrix and precision in that order.
gb_pred = gb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, gb_pred))

In [117]:
# Multinomial NB: train on the current training split and predict the held-out rows.
mb.fit(X_train,y_train)
mb_pred = mb.predict(X_test)
# Each metric is computed exactly once and printed (a bare mid-cell expression
# would be evaluated and silently discarded).
print(accuracy_score(y_test,mb_pred))
print(confusion_matrix(y_test,mb_pred))
print(precision_score(y_test,mb_pred))

In [118]:
# Bernoulli NB: train on the current training split and predict the held-out rows.
bb.fit(X_train,y_train)
bb_pred = bb.predict(X_test)
# Each metric is computed exactly once and printed (a bare mid-cell expression
# would be evaluated and silently discarded).
print(accuracy_score(y_test,bb_pred))
print(confusion_matrix(y_test,bb_pred))
print(precision_score(y_test,bb_pred))

# Conclusion: 
- So far, the best result is **97% accuracy** and a **97% precision score** with **BernoulliNB**
- However, the model still predicts some ham messages as spam, which is what we don't want

## let's try other techniques

# TF-IDF Vectorizer

In [119]:
# TF-IDF vectorizer, created with its default settings.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [122]:
# Re-encode the transformed messages as TF-IDF weights (dense array) and rebuild the label array.
tfidf_matrix = tfidf.fit_transform(df['trans_msg'])
X = tfidf_matrix.toarray()
y = df['target'].to_numpy()

In [123]:
# Same 80/20 split and seed as before, now applied to the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [124]:
# Start again from untrained, default-configured Naive Bayes models.
gb, mb, bb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [125]:
# Gaussian NB: train, predict the held-out rows, then print accuracy,
# confusion matrix and precision in that order.
gb_pred = gb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, gb_pred))

In [126]:
# Multinomial NB: train on the current training split and predict the held-out rows.
mb.fit(X_train,y_train)
mb_pred = mb.predict(X_test)
# Each metric is computed exactly once and printed (a bare mid-cell expression
# would be evaluated and silently discarded).
print(accuracy_score(y_test,mb_pred))
print(confusion_matrix(y_test,mb_pred))
print(precision_score(y_test,mb_pred))

In [127]:
# Bernoulli NB: train on the current training split and predict the held-out rows.
bb.fit(X_train,y_train)
bb_pred = bb.predict(X_test)
# Each metric is computed exactly once and printed (a bare mid-cell expression
# would be evaluated and silently discarded).
print(accuracy_score(y_test,bb_pred))
print(confusion_matrix(y_test,bb_pred))
print(precision_score(y_test,bb_pred))

# Here we got 95.9% accuracy and 100% precision score on MultinomialNB
- So we have achieved a 100% precision score, which is what we want
- Although BernoulliNB gives 97% accuracy, its precision score is also 97%, which means it can predict ham messages as spam.
- So accuracy is not as important here