In [10]:
#Main code for the step
import base64
import re
import os
import pandas as pd
import nltk
from fastapi import FastAPI
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from plyer import notification
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler
from joblib import dump
nltk.download('stopwords')
# Load and train the model
def clean_text(text):
    if isinstance(text, float):
        text = str(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))
    return text

data = pd.read_csv('Book1.csv')
data['text'] = data['text'].fillna('').apply(clean_text)
data = data[data['label'].isin(['legitimate', 'phishing'])]
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(data['text']).toarray()
y = data['label'].map({'legitimate': 0, 'phishing': 1}).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

dump(model, 'model.joblib')
dump(tfidf, 'tfidf_vectorizer.joblib') # saving the code for not to train again

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sidda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       382
           1       0.93      0.96      0.94       205

    accuracy                           0.96       587
   macro avg       0.95      0.96      0.96       587
weighted avg       0.96      0.96      0.96       587



['tfidf_vectorizer.joblib']

In [None]:
#This code is for after saving the above code to access the saved code this is the code
#Note: After excuting the first code you directly run the third code
# second is for just to show how to run the saved model
import base64
import re
import os
import pandas as pd
import nltk
from fastapi import FastAPI
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from plyer import notification
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler
from joblib import load

nltk.download('stopwords')

# Load the trained model and TF-IDF vectorizer
model = load('model.joblib')
tfidf = load('tfidf_vectorizer.joblib')

def clean_text(text):
    if isinstance(text, float):
        text = str(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))
    return text

In [6]:
import os.path
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

creds = None

if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)
    with open('token.json', 'w') as token:
        token.write(creds.to_json())

service = build('gmail', 'v1', credentials=creds) #after running this code click on the below link and sign in to your gmail account
#and to sign in to the gmail account get the gmail api creditionals

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=559093843797-4usqlmepggg7qafq1n0d39i5moas14a9.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A65506%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.modify&state=WnO84GNNk1Hd8vWEPHu0rgCvE6qfHv&access_type=offline


In [7]:
#after running the above two codes run this code to get the output
from datetime import datetime, timedelta
# Email classification function
def classify_email(email_content, model, tfidf):
    email_content_clean = clean_text(email_content)
    email_features = tfidf.transform([email_content_clean]).toarray()
    return 'phishing' if model.predict(email_features)[0] else 'legitimate'

def fetch_today_emails(service, model, tfidf):
    # Get today's date
    today = datetime.now().date()

    # Specify the list of label IDs to fetch emails from
    label_ids = ['INBOX', 'SPAM', 'SENT', 'IMPORTANT', 'TRASH', 'DRAFT', 'CATEGORY_UPDATES', 'CATEGORY_FORUMS', 'CATEGORY_PROMOTIONS', 'CATEGORY_SOCIAL']

    for label_id in label_ids:
        # Call the Gmail API to fetch emails from the current label
        results = service.users().messages().list(userId='me', labelIds=[label_id], maxResults=10).execute()
        messages = results.get('messages', [])

        if not messages:
            print(f'No messages found in label {label_id}.')
        else:
            for message in messages:
                msg = service.users().messages().get(userId='me', id=message['id']).execute()
                payload = msg['payload']
                data = ''

                if 'parts' in payload:
                    for part in payload['parts']:
                        if part.get('mimeType') == 'text/plain' and 'data' in part['body']:
                            data = part['body']['data']
                            break
                elif 'body' in payload and 'data' in payload['body']:
                    data = payload['body']['data']

                if data:
                    # Get the date of the email
                    date_str = msg['internalDate']
                    date = datetime.fromtimestamp(int(date_str) / 1000).date()

                    # Check if the email is from today
                    if date == today:
                        msg_str = base64.urlsafe_b64decode(data.encode('ASCII')).decode('utf-8')
                        classification = classify_email(msg_str, model, tfidf)  # Pass model and tfidf arguments
                        print(f"Message snippet: {msg['snippet']}")
                        print(f"Classification: {classification}")
                        if classification == 'phishing' and label_id != 'SPAM':  # Add condition to avoid modifying 'SPAM' label
                            # Move to spam
                            service.users().messages().modify(userId='me', id=message['id'], body={'addLabelIds': ['SPAM']}).execute()

# Fetch and classify emails received today
fetch_today_emails(service, model, tfidf)


Message snippet: LIVE WEBINAR 04 Jul 2024 12:00 PM (GMT+5.5) Hosted by: Sanjeev Azad - GlobalLogic, Amit Kumar Shrivastava - Fujitsu India, Jaya Kishore Reddy - Yellow.ai, Jayachandran Ramachandran - C5i, Ramesh
Classification: legitimate
Message snippet: Daily Newsletter Monday, 1st July 2024 DXC Technology in trouble over delayed onboarding of 4800 freshers It&#39;s raining jobs at Hexaware Technologies for IT professionals; check jobs here From
Classification: phishing
Message snippet: Srinu, join groups to connect with people who share your interests. Join groups to connect with people who share your interests. RECOMMENDED FOR YOU Cote De Pablo COTE DE PABLO FANS (NCIS) Visit Group
Classification: phishing
Message snippet: Daily Newsletter Monday, 1st July 2024 Goldman Sachs hiring freshers for Summer Analyst Programme; CS and IT grads apply now Zig emerges as highest-paying programming language in 2024 Loyalty Programs
Classification: legitimate
Message snippet: Daily Newsletter M