**Importing Libraries**

In [None]:
import nltk
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

**Reading the dataset**

In [None]:
# Load the SMS spam CSV from the Kaggle input directory, decoding it as latin-1.
spam_csv_path = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
data = pd.read_csv(spam_csv_path, encoding='latin-1')
data.head()

**Dropping the empty, unwanted columns, then renaming the remaining columns for better understanding, and finally
mapping the labels to a numeric format for the classification.**

In [None]:
# Discard the three unnamed columns, give the two remaining columns readable
# names, and encode the label as an integer (ham -> 0, spam -> 1).
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'Label', 'v2': 'Text'})
)
data['Label'] = data['Label'].map({'ham': 0, 'spam': 1})
data.head()

**Plotting the count of messages per label**

In [None]:
# Bar chart of the number of messages per class (0 = ham, 1 = spam).
# The column is named through keywords: current seaborn releases treat the first
# positional argument as `data`, so a positionally passed Series is no longer
# interpreted as the x variable.
sns.countplot(x='Label', data=data)

**Checking for duplicates and removing them from the data**

In [None]:
# Summary statistics per label; comparing the count and unique figures of the
# Text column shows how many messages are repeated within each class.
data.groupby('Label').describe()

We found that **ham** has 309 (4825 - 4516) duplicates and **spam** has 94 (747 - 653) duplicates.

In [None]:
# Keep only the first occurrence of every fully identical row, then repeat the
# per-label summary to confirm the duplicates are gone.
data = data.drop_duplicates(keep='first')
data.groupby(by='Label').describe()

Adding a Length column that holds the length of the text in each row

In [None]:
# Number of characters in each original message.
data['Length'] = data['Text'].map(len)
data.head()

Plotting a histogram of the text length for each label

In [None]:
# Overlay the message-length distributions of the two classes on one figure.
plt.figure(figsize=(10, 6))
ham_lengths = data.loc[data['Label'] == 0, 'Length']
spam_lengths = data.loc[data['Label'] == 1, 'Length']
ham_lengths.plot.hist(bins=40, color='green', label='ham messages', alpha=0.7)
spam_lengths.plot.hist(bins=10, color='red', label='Spam messages', alpha=0.8)
plt.legend()
plt.xlabel('Length')

In [None]:
# Numeric summary of the ham rows (Label == 0).
data.loc[data['Label'] == 0].describe()

In [None]:
# Numeric summary of the spam rows (Label == 1).
data.loc[data['Label'] == 1].describe()

Comparing the two summaries above, we find that spam messages tend to be longer than ham messages.

In [None]:
# English stop-word list and Snowball stemmer shared by the text-cleaning cell.
stop = stopwords.words("english")
stemmer = SnowballStemmer(language="english")

In [None]:
# Text-cleaning pipeline: starts from the raw `Text` column and rewrites
# `Clean_text` step by step until it holds a list of stemmed tokens per message.

# Removing stopwords. The NLTK list is lower-case, so each word is lower-cased
# for the comparison only; otherwise capitalised stopwords ("The", "You") would
# survive the filter. A set makes every membership test O(1).
stop_words = set(stop)
data['Clean_text'] = data['Text'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stop_words)
)
# Adding a column to show the length of the text after removing the stopwords
data['stopword_len'] = data['Clean_text'].apply(len)
# Removing the links, websites,... This runs before the punctuation step so the
# pattern still sees the ':', '/' and '.' characters of a URL. regex=True is
# explicit because pandas >= 2 defaults to literal matching and rejects `case`
# in that mode; raw strings keep the backslashes intact.
data['Clean_text'] = data['Clean_text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
# Removing special characters, digits,...
data['Clean_text'] = data['Clean_text'].str.replace(r'''[,”“’''--./<“>-?1234567890-=\|+_)(*&^%$#@!`~:"{}]''', '', case=False, regex=True)
# Making the text lower case
data['Clean_text'] = data['Clean_text'].str.lower()
# Tokenization
data['Clean_text'] = data['Clean_text'].apply(nltk.word_tokenize)
# Stemming
data['Clean_text'] = data['Clean_text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

After cleaning the Text 

In [None]:
# Preview the frame now that the cleaning cell has filled Clean_text.
data.head()

Next step is to join the tokens together

In [None]:
# Re-join each token list into one space-separated string.
# The whole column is assigned in a single statement: a per-row chained
# assignment (data['Final_text'][i] = ...) writes through an intermediate Series,
# which pandas does not guarantee to propagate back to `data`, and it is a slow
# Python-level loop.
data['Final_text'] = data['Clean_text'].apply(' '.join)
    

Now drop the dirty columns.

In [None]:
# Remove the raw text and the intermediate helper columns.
data = data.drop(columns=['Text', 'Length', 'Clean_text', 'stopword_len'])

In [None]:
# Preview the frame after the intermediate columns have been dropped.
data.head()

In [None]:
# Model input (cleaned message text) and target (numeric label).
x, y = data['Final_text'], data['Label']

Splitting the data into 80% for training and 20% for testing

In [None]:
# Positional 80/20 cut: the leading rows form the training set and the
# remaining rows form the test set.
size = round(0.8 * len(y))
x_train, x_test = x.iloc[:size], x.iloc[size:]
y_train, y_test = y.iloc[:size], y.iloc[size:]

# Report the size of every split.
for split_name, split in (('x_train', x_train), ('x_test', x_test),
                          ('y_train', y_train), ('y_test', y_test)):
    print(split_name, split.shape)

Now we need to apply the machine learning algorithm.

* First, we need to take the term frequency of each word.
* Second, terms that appear with high frequency across the messages need to have their weight reduced; this weighting is known as inverse document frequency.

In [None]:
# Learn the bag-of-words vocabulary from the training messages only.
vect = CountVectorizer().fit(x_train)
vect

Fitting vocabulary

In [None]:
# Turn the training messages into a document-term count matrix.
# fit_transform learns the vocabulary again before transforming. x_train is
# rebound from raw text to the matrix, so re-running this cell would feed the
# matrix back into the vectorizer; re-run the split cell first.
x_train = vect.fit_transform(x_train)
x_train

In [None]:
# Encode the test messages with the vocabulary already held by `vect`
# (transform only, no refitting). x_test is rebound from text to the count matrix.
x_test = vect.transform(x_test)
x_test

In [None]:
def tsne_plot(x, y):
    """Embed `x` in two dimensions with t-SNE and draw a scatter plot per class.

    Parameters
    ----------
    x : array-like or sparse matrix
        Feature matrix with one row per sample.
    y : array-like
        Class label for each row of `x`. Rows labelled 0 are plotted as
        'ham' and rows labelled 1 as 'spam'.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure as a side effect.
    """
    from scipy.sparse import issparse

    # Setting the plotting background
    sns.set(style="whitegrid")

    # scikit-learn rejects PCA initialisation for sparse input, so sparse
    # matrices start from a random embedding; dense input keeps the library
    # default.
    tsne_options = {'init': 'random'} if issparse(x) else {}
    tsne = TSNE(n_components=2, random_state=0, **tsne_options)

    # Reducing the dimensionality of the data
    X_transformed = tsne.fit_transform(x)

    # Plain array of labels: the boolean masks below are positional and do not
    # depend on any pandas index carried by `y`.
    labels = np.asarray(y)

    plt.figure(figsize=(10, 6))

    # Building the scatter plot, one series per class
    for class_value, class_name in ((0, 'ham'), (1, 'spam')):
        in_class = labels == class_value
        plt.scatter(X_transformed[in_class, 0],
                    X_transformed[in_class, 1],
                    marker='o', linewidth=1,
                    alpha=0.8, label=class_name)

    # Specifying the location of the legend
    plt.legend(loc='best')


tsne_plot(x_train, y_train)

In [None]:
# Train a multinomial Naive Bayes classifier on the training counts and report
# its accuracy on the test messages.
nb = MultinomialNB().fit(x_train, y_train)
y_pred_class = nb.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Accuracy : ', accuracy)