In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn import svm 
from sklearn import ensemble

In [2]:
# Load the labelled mail dataset (expects 'Category' and 'Message' columns).
mail_data = pd.read_csv('spam.csv')

# Encode the labels numerically: spam -> 0, ham -> 1.
label_codes = mail_data['Category'].map({'spam': 0, 'ham': 1})

# Series.map yields NaN for any value that is not a key of the mapping.
# Surface such rows here with the offending values instead of letting the
# later integer cast of the labels fail with a less helpful error.
unknown_labels = mail_data.loc[label_codes.isna(), 'Category'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Category' column: {unknown_labels.tolist()}"
    )

mail_data['Category'] = label_codes

In [3]:
# Separate the message text (model input) from the encoded category (target).
X, Y = mail_data['Message'], mail_data['Category']

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)


In [5]:
# Make sure both label vectors are stored as integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# TF-IDF vectorizer: lowercases the text and drops English stop words;
# min_df=1 keeps every term that occurs in at least one document.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [6]:
# Recover the raw message text for each split from the original Series X,
# using the row labels carried by Y_train / Y_test. Reading from X instead of
# X_train / X_test keeps this cell idempotent: X_train and X_test are
# overwritten with sparse matrices below, so a second run of the original cell
# would feed already-vectorized data back into the vectorizer.
train_text = X.loc[Y_train.index]
test_text = X.loc[Y_test.index]

# Learn the vocabulary and IDF weights on the training text only, then apply
# the same fitted transformation to the test text.
X_train_vectorizer = tfidf_vectorizer.fit_transform(train_text)
X_test_vectorizer = tfidf_vectorizer.transform(test_text)

# Later cells read the features under the names X_train / X_test.
X_train = X_train_vectorizer
X_test = X_test_vectorizer

In [7]:
# Support-vector classifier with a linear kernel, trained on the TF-IDF
# features of the training split.
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X=X_train, y=Y_train)


In [8]:

# Score the SVM on the data it was trained on.
svm_prediction_training_data = svm_model.predict(X_train)
accuracy_score_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=svm_prediction_training_data,
)
print('Accuracy on training data: ', accuracy_score_training_data)

Accuracy on training data:  0.9961857751851021


In [9]:
# Score the SVM on the held-out test split.
svm_prediction_test_data = svm_model.predict(X_test)
accuracy_score_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=svm_prediction_test_data,
)
print('Accuracy on test data: ', accuracy_score_test_data)

Accuracy on test data:  0.9838565022421525


In [10]:
# Initialize the Random Forest model.
# The forest is built with randomness (bootstrap samples, feature sub-sampling),
# so random_state is pinned -- to the same seed used for the train/test split --
# to make the accuracies reported below reproducible across kernel restarts.
rf_model = ensemble.RandomForestClassifier(random_state=4)
rf_model.fit(X_train, Y_train)

In [11]:
# Score the Random Forest on the data it was trained on.
rf_prediction_training_data = rf_model.predict(X_train)
accuracy_score_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=rf_prediction_training_data,
)
print('Accuracy on training data: ', accuracy_score_training_data)

Accuracy on training data:  1.0


In [12]:
# Score the Random Forest on the held-out test split.
rf_prediction_test_data = rf_model.predict(X_test)
accuracy_score_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=rf_prediction_test_data,
)
print('Accuracy on test data: ', accuracy_score_test_data)

Accuracy on test data:  0.9730941704035875


In [13]:
# Predict whether a new mail is spam or ham (uses the SVM model by default).
def classify_email(email, model=None, vectorizer=None):
    """Classify a single e-mail text as ``'ham'`` or ``'spam'``.

    Parameters
    ----------
    email : str
        Raw message text to classify.
    model : object, optional
        Fitted classifier exposing ``predict``. Defaults to the notebook's
        ``svm_model``.
    vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. Defaults to the notebook's
        ``tfidf_vectorizer``.

    Returns
    -------
    str
        ``'ham'`` when the predicted label equals 1, otherwise ``'spam'``.
    """
    if model is None:
        model = svm_model
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    # transform() takes a collection of documents, so wrap the single text.
    features = vectorizer.transform([email])

    # predict() returns one label per input row. Pick the single label
    # explicitly instead of comparing the whole array to 1 and relying on the
    # truthiness of a one-element array.
    predicted_label = model.predict(features)[0]
    return 'ham' if predicted_label == 1 else 'spam'


In [14]:
# Try the classifier on a short sample message.
email1 = "Hii I am osdnj"
email1_label = classify_email(email1)
print('The new email is classified as:', email1_label)


The new email is classified as: ham


In [15]:
# Try the classifier on a prize-offer style sample message.
email2 = "Congratulations! You've won a $1000 cash prize! Claim it now by clicking the link below:https://example.com/claim-prize"
email2_label = classify_email(email2)
print('The new email is classified as:', email2_label)


The new email is classified as: spam
