In [1]:
# Implementing the Random Forest Classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#path to dataset which is downloaded from kaggle: https://www.kaggle.com/datasets/venky73/spam-mails-dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/email_spam_detection/spam_ham_dataset.csv') #path to dataset which is downloaded from kaggle


In [3]:
# splitting the dataset for trainning and testing
X = df['text']
y = df['label_num']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25, random_state=42)

In [4]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
# Train the classifier
rf_classifier.fit(X_train_tfidf, y_train)


In [7]:
# Predict labels for the test set
y_pred = rf_classifier.predict(X_test_tfidf)

In [8]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.97138437741686

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       930
           1       0.97      0.92      0.95       363

    accuracy                           0.97      1293
   macro avg       0.97      0.96      0.96      1293
weighted avg       0.97      0.97      0.97      1293



In [9]:
# funtion to check your own mail
def check(text):
  new_email = text
  new_email_tfidf = vectorizer.transform(new_email)
  prediction = rf_classifier.predict(new_email_tfidf)
  print("\nPredicted label for the new email:", prediction[0])


In [10]:
# Example of predicting a new email where 1 means spam of phishing threat and 0 means not a spam
new_email = ["When you are borrowing money, depending upon your need, you can choose different types of loans and credit. Having a mix of different credit accounts, which is called as a Credit Mix, is good for the health of your credit profile, because lenders and creditors generally check how you have managed different types of accounts in the past. A good Credit Mix adds a small portion to your Experian credit score. But if you have varied accounts and paying them on time, it makes lenders think that you are being responsible in handling your credits. Check your Experian credit report today to know your Credit Mix. Planning to apply for a loan or credit card? Find out which credit type is best for you.	Automatically renewed as you pay debt	Interest is fixed, which you pay in EMIs	Borrower can get access to loan amount once approved Example: Credit cards, retail store cards. *Generally unsecured and open credit	Automatically renewed as you pay debt	Interest is fixed, which you pay in EMIs Borrower can get access to loan amount once approved Example: Home loan, automobile loan, student loans.	*Generally secured and closed credit Automatically renewed as you pay debt Interest is fixed, which you pay in EMIs Borrower can get access to loan amount once approved Example: Personal loan. *It can be secured or unsecured Check Free Experian Score www.experian.in	| Experian and the marks used herein are service marks or registered trademark of Experian Information Solutions, Inc. Other product and company names mentioned herein may be the trademarks of their respective owners. The contents and updates in this e-mail are for informational purposes only and the contents herein do not constitute any advice whatsoever. Use of any of the information from this e-mail shall be at the sole discretion of the client. Whilst making all reasonable efforts to provide correct information, Experian cannot and does not warrant or guarantee, whether expressly or impliedly, that the data provided in this e-mail will be accurate. Experian shall have no liability or responsibility, whatsoever, in contract, tort or any other legal ground for any inaccuracy, incompleteness, omission, lack of timeliness or any other error of the content and in no event will Experian be liable for any damages whatsoever arising out of the use of or reliance on the contents of this e-mail communication. Please refer Experian's Privacy Policy for additional details. . © Experian 2023/04/20_COMEX_EM --Click Here to unsubscribe from this newsletter. 9 Attachments Scanned by Gmail"]
new_email_tfidf = vectorizer.transform(new_email)
prediction = rf_classifier.predict(new_email_tfidf)
print("\nPredicted label for the new email:", prediction[0])


Predicted label for the new email: 0
