# EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [9]:
data = pd.read_csv("C:/B.Tech/V SEM/OASIS internship/Task-4/archive (11)/spam.csv", encoding='latin1')  # Replace 'your_dataset.csv' with the actual dataset file path
# Combine all the data from 'Unnamed' columns into the 'v2' column
data['v2'] = data[['v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [10]:
# Select only the 'v1' and 'v2' columns and ignore the rest
data = data[['v1', 'v2']]

In [11]:
data.head()


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
data.iloc[4]

v1                                                  ham
v2    Nah I don't think he goes to usf, he lives aro...
Name: 4, dtype: object

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [15]:
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [14]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features as needed
X = tfidf_vectorizer.fit_transform(data['v2'])
y = data['v1']


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)


In [18]:
y_pred = classifier.predict(X_test)


In [20]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)


In [21]:
print(f"Accuracy: {accuracy}")

Accuracy: 0.9704035874439462


In [22]:
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[965   0]
 [ 33 117]]


In [23]:
print("Classification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       1.00      0.78      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

