Importing the necessary libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the SMS spam dataset; the file is decoded as ISO-8859-1 (Latin-1)
# rather than with pandas' default UTF-8.
spam_csv_path = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(spam_csv_path, encoding='ISO-8859-1')

In [None]:
df.sample(5)

In [None]:
df.shape

1. Data cleaning
2. EDA
3. Text Preprocessing
4. Model building
5. Evaluation
6. Improvement
7. Website
8. Deploy

**1. Data cleaning**

In [None]:
#Data cleaning
df.info()

In [None]:
# Drop the three unnamed trailing columns.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
df.sample(5)

In [None]:
# Give the two remaining columns descriptive names, then preview a random sample.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [None]:
# Replace the string labels in 'target' with the integer codes learned by the
# LabelEncoder (fit first, then transform the same column).
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# check for duplicate values
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, retaining the first occurrence of each
# (keep='first' is the pandas default, so it is left implicit here).
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

**2. EDA**

In [None]:
# EDA
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Derive the slice labels from the value_counts() index instead of hard-coding
# their order: value_counts() sorts by frequency, so a fixed ['ham','spam']
# list is only correct while ham (encoded 0) happens to be the majority class.
class_names = {0: 'ham', 1: 'spam'}
target_counts = df['target'].value_counts()
# Fall back to the raw code if an unexpected target value shows up.
slice_labels = [class_names.get(code, code) for code in target_counts.index]
plt.pie(target_counts, labels=slice_labels, autopct='%0.2f')
plt.show()

In [None]:
# data is imbalanced

In [None]:
import nltk

In [None]:
# Tokenizer models used by nltk.word_tokenize / nltk.sent_tokenize.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt';
# add nltk.download('punkt_tab') if the tokenizers raise LookupError.
nltk.download('punkt')
# The stop-word list read later through stopwords.words('english') is a
# separate NLTK resource; without it that call raises LookupError on a machine
# where the corpus is not already installed.
nltk.download('stopwords')

In [None]:
# Message length in characters.
df['num_characters'] = df['text'].map(len)

In [None]:
df.head()

In [None]:
# Number of tokens per message, as produced by NLTK's word tokenizer.
df['num_words'] = [len(nltk.word_tokenize(message)) for message in df['text']]

In [None]:
df.head()

In [None]:
# Number of sentences per message, as produced by NLTK's sentence tokenizer.
df['num_sentences'] = [len(nltk.sent_tokenize(message)) for message in df['text']]

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# Summary statistics of the length features for rows with target == 0.
length_cols = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 0, length_cols].describe()

In [None]:
# spam: summary statistics of the length features for rows with target == 1.
length_cols = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 1, length_cols].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of both classes:
# target 0 in the default colour, target 1 in red.
chars_target0 = df.loc[df['target'] == 0, 'num_characters']
chars_target1 = df.loc[df['target'] == 1, 'num_characters']
sns.histplot(chars_target0)
sns.histplot(chars_target1, color='red')

In [None]:
sns.pairplot(df,hue='target')

In [None]:
# Restrict the correlation matrix to numeric columns: df still holds the raw
# 'text' strings, and pandas >= 2.0 raises on non-numeric data instead of
# silently dropping those columns as older releases did.
sns.heatmap(df.corr(numeric_only=True), annot=True)

**3. Data Preprocessing**
* Lower case
* Tokenization
* Removing special characters
* Removing stop words and punctuation
* Stemming

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    return frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words, and stem what remains with the Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    # The stop-word list is fetched once and cached as a set. Calling
    # stopwords.words('english') inside the loop re-read the whole list for
    # every token and then searched it linearly.
    stop_words = _english_stop_words()
    # str.isalnum() already rejects punctuation tokens, so the separate
    # string.punctuation test of the original loop could never fire.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
transform_text('Hi how Are YOU nitish?Dance')

In [None]:
df['transformed_text']=df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Join the messages with a space. With sep="" the last word of one message is
# glued to the first word of the next, producing tokens that occur in no message.
spam_text = df[df['target'] == 1]['transformed_text'].str.cat(sep=" ")
spam_wc = wc.generate(spam_text)

In [None]:
plt.imshow(spam_wc)

In [None]:
# Join the messages with a space. With sep="" the last word of one message is
# glued to the first word of the next, producing tokens that occur in no message.
ham_text = df[df['target'] == 0]['transformed_text'].str.cat(sep=" ")
ham_wc = wc.generate(ham_text)
plt.imshow(ham_wc)

In [None]:
# Flatten every transformed message with target == 1 into one list of words.
spam_messages = df.loc[df['target'] == 1, 'transformed_text']
spam_corpus = [token for message in spam_messages for token in message.split()]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter

# Count once and build a single frame with named columns; the original rebuilt
# the Counter and the DataFrame separately for each axis, and its positional
# [0]/[1] lookups fail when the corpus is empty.
top_spam_words = pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])
sns.barplot(x=top_spam_words['word'], y=top_spam_words['count'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flatten every transformed message with target == 0 into one list of words.
ham_messages = df.loc[df['target'] == 0, 'transformed_text']
ham_corpus = [token for message in ham_messages for token in message.split()]

In [None]:
len(ham_corpus)

In [None]:
from collections import Counter

# Count once and build a single frame with named columns; the original rebuilt
# the Counter and the DataFrame separately for each axis, and its positional
# [0]/[1] lookups fail when the corpus is empty.
top_ham_words = pd.DataFrame(Counter(ham_corpus).most_common(30), columns=['word', 'count'])
sns.barplot(x=top_ham_words['word'], y=top_ham_words['count'])
plt.xticks(rotation='vertical')
plt.show()

**4. Model Building**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()
tfidf=TfidfVectorizer(max_features=3000)

In [None]:
# Fit the TF-IDF vectorizer on the transformed messages and convert the result
# to a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so test-set vocabulary and IDF statistics
# leak into the features — consider fitting on the training split only.
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
x = tfidf_matrix.toarray()

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler=MinMaxScaler()
# x=scaler.fit_transform(x)

In [None]:
x.shape

In [None]:
y=df['target'].values

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, which matters because the classes are imbalanced
# and the models below are compared on precision.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=2, stratify=y
)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
# Train Gaussian Naive Bayes and report accuracy, confusion matrix and
# precision on the held-out test set.
gnb.fit(x_train, y_train)
y_pred1 = gnb.predict(x_test)
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Confusion matrix:', confusion_matrix),
    ('Precision score:', precision_score),
):
    print(metric_label, metric_fn(y_test, y_pred1))

In [None]:
# Train Multinomial Naive Bayes and report accuracy, confusion matrix and
# precision on the held-out test set.
mnb.fit(x_train, y_train)
y_pred2 = mnb.predict(x_test)
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Confusion matrix:', confusion_matrix),
    ('Precision score:', precision_score),
):
    print(metric_label, metric_fn(y_test, y_pred2))

In [None]:
# Train Bernoulli Naive Bayes and report accuracy, confusion matrix and
# precision on the held-out test set.
bnb.fit(x_train, y_train)
y_pred3 = bnb.predict(x_test)
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Confusion matrix:', confusion_matrix),
    ('Precision score:', precision_score),
):
    print(metric_label, metric_fn(y_test, y_pred3))

In [None]:
# tfidf --> MNB