**Importing Libraries**

In [None]:
#data handling and the NLTK toolkit
import pandas as pd
import nltk
#download the NLTK resources named 'stopwords' and 'punkt'
#NOTE(review): presumably 'punkt' backs word_tokenize/sent_tokenize; newer NLTK releases may need 'punkt_tab' - confirm
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
#vectorizer, splitter, classifier and metrics from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
#download the NLTK resource named 'wordnet' (presumably required by WordNetLemmatizer - confirm)
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
#plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
#installs a global 'ignore' filter: warnings raised after this point are suppressed
filterwarnings('ignore')

**Importing the dataset**

In [None]:
#relative path of the dataset CSV, kept in one place so it is easy to change
DATA_PATH="../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
#loading the CSV into a DataFrame
df=pd.read_csv(DATA_PATH)

**Understanding the dataset and rectifying it**

In [None]:
#Previewing the dataset: head() returns the first 5 rows by default
first_rows=df.head()
first_rows

In [None]:
#the 'Category' column is the target (dependent) variable
target=df.loc[:,'Category']

In [None]:
#defining the independent feature
#copy the column so that the in-place text cleaning done on x does not overwrite
#df['Message'], which is read again later for the lemmatizer / TF-IDF experiment
x=df['Message'].copy()

In [None]:
#language whose NLTK stopword list is used for filtering
STOPWORD_LANGUAGE='english'
#list of stopwords for that language, kept in the variable stopword
stopword=stopwords.words(STOPWORD_LANGUAGE)

In [None]:
#Porter stemmer instance, used below to stem every token
ps = PorterStemmer()

In [None]:
#Rectifying the dataset
#a set gives constant-time stopword lookups inside the per-token filter
stem_stopwords=set(stopword)

def stem_message(message):
  """Return message lower-cased, stripped of non-letters and stopwords, with each remaining token stemmed."""
  #every character that is not an ASCII letter becomes a space before tokenizing
  letters_only=re.sub('[^a-zA-Z]',' ',message).lower()
  kept=[ps.stem(token) for token in word_tokenize(letters_only) if token not in stem_stopwords]
  return ' '.join(kept)

#apply() builds a new Series instead of assigning element by element, so the
#DataFrame column x was taken from is left untouched and the code no longer
#depends on the index being exactly 0..n-1
x=x.apply(stem_message)

In [None]:
#Converting the data into vectors using Bag Of Words
#max_features=3000 restricts the vocabulary to the 3000 most frequent terms
cv=CountVectorizer(max_features=3000)
#the document-term matrix is kept in the sparse form returned by fit_transform:
#train_test_split and MultinomialNB both accept sparse input, so converting to
#a dense array would only cost memory
x_vector=cv.fit_transform(x)

In [None]:
#converting the dependent variable column to dichotomous values: ham -> 1, spam -> 0
#both labels are replaced in one call and the result is cast to int explicitly,
#so the integer dtype does not depend on replace() silently downcasting an
#object column (deprecated in recent pandas); any other label fails here
#instead of reaching the model
target=target.replace({'ham':1,'spam':0}).astype(int)

In [None]:
#y is an independent copy of the encoded target (copy() is deep by default)
y=target.copy()

In [None]:
#number of rows per class, to see whether the dataset is balanced
class_counts=y.value_counts()
class_counts


In [None]:
#bar chart of the class distribution (1 = ham, 0 = spam)
#the Series is passed as x= explicitly: the first positional parameter of
#countplot is `data` in current seaborn, so a bare positional Series is not
#interpreted as the variable to count
ax=sns.countplot(x=y)
ax.set(title="Class distribution",xlabel="Category (1 = ham, 0 = spam)",ylabel="Number of messages")
plt.show()


**The class counts show that the dataset is highly imbalanced. For this reason we will use the F1-score, rather than accuracy alone, to judge the model's efficacy.**

In [None]:
#hold out 30% of the rows as the test set; random_state fixes the shuffle
bow_split=train_test_split(x_vector,y,test_size=0.3,random_state=10)
X_train,X_test,y_train,y_test=bow_split

In [None]:
#multinomial naive bayes classifier trained on the BOW training split
NB=MultinomialNB()
NB.fit(X_train,y_train)
#fit() returns the estimator itself, so model is the fitted NB object
model=NB

In [None]:
#class predictions of the BOW model for the held-out rows
y_pred = model.predict(X=X_test)

In [None]:
#calculating the metrics for the model
#every metric is called as (y_true, y_pred); f1_score previously received the
#two arguments reversed, unlike the other three calls. For binary F1 the value
#is unchanged (the swap only exchanges precision and recall), but the call now
#matches the documented signature.
#NOTE(review): with the default pos_label=1 the F1 score is computed for the
#class encoded as 1 (ham), not for spam - confirm this is the intended class
F1_score=f1_score(y_test,y_pred)
Accuracy=accuracy_score(y_test,y_pred)
Classification_report=classification_report(y_test,y_pred)
Confusion_matrix=confusion_matrix(y_test,y_pred)

In [None]:
#Reporting the evaluation metrics of the BOW model
print(f"The F1-Score of the model is {F1_score}")
print(f"The Accuracy of the model is {Accuracy}")
#heading and multi-line value are printed on separate lines via sep
print("The Classification_report of the model is",Classification_report,sep="\n")
print("The Confusion matrix of the model is",Confusion_matrix,sep="\n")

**Deduction:**

1) The F1 score of our model is quite good, at about 96%.

2) There are 276 false negatives; we need to reduce this number in order to make our model better.

**USING LEMMATIZER AND TF-IDF**

In [None]:
#taking the message column again for the lemmatizer / TF-IDF experiment;
#a copy is used so the cleaning loop below cannot write into df
#NOTE(review): if df['Message'] was already overwritten by the earlier stemming loop, these are not the raw messages - confirm
x=df['Message'].copy()

In [None]:
#WordNet lemmatizer instance, used below to lemmatize every token
ls = WordNetLemmatizer()

In [None]:
#cleaning the text
#a set gives constant-time stopword lookups inside the per-token filter
lemma_stopwords=set(stopword)

def lemmatize_message(message):
  """Return message lower-cased, stripped of non-letters and stopwords, with each remaining token lemmatized."""
  #every character that is not an ASCII letter becomes a space before tokenizing
  letters_only=re.sub('[^a-zA-Z]',' ',message).lower()
  kept=[ls.lemmatize(token) for token in word_tokenize(letters_only) if token not in lemma_stopwords]
  return ' '.join(kept)

#apply() builds a new Series instead of assigning element by element, so the
#DataFrame column x was taken from is not modified and the code no longer
#depends on the index being exactly 0..n-1
x=x.apply(lemmatize_message)

In [None]:
#TF-IDF vectorizer with default settings
T=TfidfVectorizer()
#learn the vocabulary from the cleaned messages and turn them into TF-IDF vectors
x_vector=T.fit_transform(raw_documents=x)

In [None]:
#splitting the data into train and test
#test_size=0.3 (70% train / 30% test) with the same random_state as the BOW
#experiment, so both models are trained and evaluated on the same split;
#train_size=0.3 had trained this model on only 30% of the rows
X_train,X_test,y_train,y_test=train_test_split(x_vector,y,test_size=0.3,random_state=10)

In [None]:
#a fresh multinomial naive bayes classifier trained on the TF-IDF training split
NB=MultinomialNB()
NB.fit(X_train,y_train)
#fit() returns the estimator itself, so model1 is the fitted NB object
model1=NB

In [None]:
#class predictions of the TF-IDF model for the held-out rows
y_pred = model1.predict(X=X_test)

In [None]:
#calculating the metrics for the TF-IDF model
#every metric is called as (y_true, y_pred); f1_score previously received the
#two arguments reversed, unlike the other three calls. For binary F1 the value
#is unchanged (the swap only exchanges precision and recall), but the call now
#matches the documented signature.
F1_score=f1_score(y_test,y_pred)
Accuracy=accuracy_score(y_test,y_pred)
Classification_report=classification_report(y_test,y_pred)
Confusion_matrix=confusion_matrix(y_test,y_pred)

In [None]:
#Reporting the evaluation metrics of the TF-IDF model
print(f"The F1-Score of the model is {F1_score}")
print(f"The Accuracy of the model is {Accuracy}")
#heading and multi-line value are printed on separate lines via sep
print("The Classification_report of the model is",Classification_report,sep="\n")
print("The Confusion matrix of the model is",Confusion_matrix,sep="\n")

**So we can see that BOW gave us a better result than TF-IDF for this use case.**