In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [2]:
# Loading the dataset
# The file is parsed as tab-separated with no header row, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    '/content/spam.csv',
    sep='\t',
    header=None,
    names=['label', 'text'],
    encoding='latin-1',
)

In [3]:
# Exploring the data
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line after
# the report.
df.info()
print(df.describe())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
       label                    text
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [4]:
import nltk
# Fetch NLTK's 'stopwords' corpus; the pre-processing cell later in this
# notebook loads it through stopwords.words('english').
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from nltk.corpus import stopwords

# Pre-processing the data
# Module-level helpers used by clean_text(): the English stop-word set
# (built once) and a Porter stemmer instance.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Every character that is not an ASCII letter is replaced with a space, the
    result is lower-cased and split on whitespace, tokens found in the
    module-level ``stop_words`` collection are dropped, and each remaining
    token is passed through ``ps.stem``.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

# Run clean_text over every message.
# NOTE: this overwrites the 'text' column in place, so the raw messages are no
# longer available afterwards and re-running the cell cleans already-cleaned text.
df['text'] = df['text'].map(clean_text)

In [6]:
# Extracting features from the text
# NOTE(review): the vocabulary is learned from every row here, before the
# train/test split further down, so test messages influence the feature space.
cv = CountVectorizer()
term_counts = cv.fit_transform(df['text'])
X = term_counts.toarray()  # dense copy of the document-term count matrix
y = df['label'].values


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Training and evaluating the logistic regression model
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Score the predictions made for the held-out test rows
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(f"Accuracy of Logistic Regression: {acc_score *100:.2f} %")
print("Confusion Matrix:\n", conf_matrix)

Accuracy of Logistic Regression: 98.39 %
Confusion Matrix:
 [[965   1]
 [ 17 132]]


In [12]:
def predict_email(email):
    """Classify a single message with the fitted model and print the verdict.

    The message is cleaned with ``clean_text``, vectorised with the fitted
    ``cv`` and passed to ``lr.predict``.

    Args:
        email: Raw message text as a string.

    Returns:
        None. Prints "Not Spam" when the predicted label is 'ham' (or 0),
        otherwise prints "Spam".
    """
    cleaned_email = clean_text(email)
    features = cv.transform([cleaned_email]).toarray()
    prediction = lr.predict(features)[0]
    # The model is trained on the string labels from the 'label' column
    # ('ham' / 'spam'), so comparing the prediction with the integer 0 alone
    # is never true and every message would be reported as spam. 0 is still
    # accepted in case the labels are numerically encoded.
    if prediction in ('ham', 0):
        print("Not Spam")
    else:
        print("Spam")

# Try the classifier on one hand-written message
sample_email = "Congratulations! You have been selected to win a prize!"
predict_email(sample_email)

Spam


Next, we check the accuracy of two other models, Support Vector Machine and K-Nearest Neighbours, on the same train/test split.

In [13]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import math



# Fit a polynomial-kernel SVM on the training split and score it on the test split
model = SVC(kernel='poly').fit(X_train, y_train)
predicted = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predicted)
print(f"Accuracy of Support Vector Machine: {acc* 100:.2f} %")

Accuracy of Support Vector Machine: 94.89 %


In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (constructor defaults) on the training split
model_KNN = KNeighborsClassifier().fit(X_train, y_train)

# Predict labels for the test rows and measure how many match
y_pred_KNN = model_KNN.predict(X_test)
accuracy_KNN = accuracy_score(y_true=y_test, y_pred=y_pred_KNN)

# Report the KNN accuracy as a percentage
print( f"Accuracy of KNN model:  {accuracy_KNN*100:.2f} %")

Accuracy of KNN model:  92.65 %


Therefore, Logistic Regression has the highest test accuracy of the three models (98.39 %, versus 94.89 % for the Support Vector Machine and 92.65 % for K-Nearest Neighbours).