# Clean up existing installations and

**Clean Reinstallation of Required Libraries**

In [None]:
!pip uninstall -y transformers tokenizers huggingface-hub accelerate datasets


Found existing installation: transformers 4.30.2
Uninstalling transformers-4.30.2:
  Successfully uninstalled transformers-4.30.2
Found existing installation: tokenizers 0.13.3
Uninstalling tokenizers-0.13.3:
  Successfully uninstalled tokenizers-0.13.3
Found existing installation: huggingface-hub 0.16.4
Uninstalling huggingface-hub-0.16.4:
  Successfully uninstalled huggingface-hub-0.16.4
Found existing installation: accelerate 0.20.3
Uninstalling accelerate-0.20.3:
  Successfully uninstalled accelerate-0.20.3
Found existing installation: datasets 2.13.1
Uninstalling datasets-2.13.1:
  Successfully uninstalled datasets-2.13.1


In [None]:
import transformers
print(transformers.__version__)


4.57.1


**Environment Setup and Dependencies Installation**

In [None]:
# Install all required packages at once
!pip install datasets transformers torch accelerate scikit-learn pandas

Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Using cached torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014

In [None]:
import torch, transformers, datasets
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)


torch: 2.9.0+cu128
transformers: 4.57.1
datasets: 4.3.0


**Build and Export Synthetic Email Classification Dataset**

Preparing a balanced dataset by augmenting sample email texts and saving them as email_classifier_dataset.csv.

In [3]:
import pandas as pd
import random

career = [
    "Your application for the Data Analyst position has been received",
    "Congratulations! You have been shortlisted for interview at Infosys",
    "We are hiring software engineers for 2025 intake",
    "Job vacancy: Junior Civil Engineer",
    "Internship opportunity for college students",
    "Invitation to campus placement drive",
    "Offer letter from TCS has been issued",
    "Career opportunities at Deloitte",
    "Recruitment drive this weekend",
    "New job role available at Accenture",
]


promotion = [
    "Limited time sale on electronics at Amazon",
    "50% discount on fashion items this week",
    "Your Flipkart Big Billion Days offer is live",
    "Cashback rewards on your next purchase",
    "Hurry! Offer ends soon on beauty products",
    "Exclusive Diwali deals on gadgets",
    "Special offer: Buy one get one free",
    "Flash sale starting at midnight",
    "Sale alert! Discounts on smartphones",
    "Celebrate with new year offers from Nykaa",
]

personal = [
    "Hey, how are you doing today?",
    "Happy birthday! Wishing you a great year ahead",
    "Let‚Äôs plan a trip this weekend",
    "Dinner tonight at our usual place?",
    "Thanks for your help yesterday!",
    "Hope you‚Äôre doing well",
    "Long time no see! Let‚Äôs catch up soon",
    "Congratulations on your graduation!",
    "Good morning! Have a nice day",
    "Take care and stay safe",
]

# Duplicate with slight variations
def augment(lst):
    aug = []
    for text in lst:
        aug.append(text)
        aug.append(text.replace("!", "").replace(".", ""))
        aug.append(text.lower())
        aug.append("Re: " + text)
        aug.append("Fwd: " + text)
    return aug

data = {
    "text": augment(career) + augment(promotion) + augment(personal),
    "label": (
        ["career"] * len(augment(career))
        + ["promotion"] * len(augment(promotion))
        + ["personal"] * len(augment(personal))
    )
}

df = pd.DataFrame(data)
df.to_csv("email_classifier_dataset.csv", index=False)
print(" Dataset created:", df.shape)


 Dataset created: (150, 2)


**Train and Evaluate BERT Model for Email Classification**

Loading the dataset, tokenizing text, encoding labels, and fine-tuning a pre-trained BERT model to classify emails into Career, Promotion, and Personal categories.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.modeling_utils import PreTrainedModel
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load dataset
dataset = load_dataset("csv", data_files="email_classifier_dataset.csv")["train"].train_test_split(test_size=0.2)
train_ds, test_ds = dataset["train"], dataset["test"]

label2id = {"career":0, "scheduled":1, "promotion":2, "personal":3}
id2label = {v:k for k,v in label2id.items()}

# Tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

def encode_labels(batch):
    batch["label"] = [label2id[l] for l in batch["label"]]
    return batch

train_ds = train_ds.map(encode_labels, batched=True)
test_ds = test_ds.map(encode_labels, batched=True)

train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

args = TrainingArguments(
    output_dir="./email_ai_agent",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()
trainer.evaluate()

model.save_pretrained("./email_ai_agent")
tokenizer.save_pretrained("./email_ai_agent")

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

In [None]:
model.save_pretrained("./email_career_classifier")
tokenizer.save_pretrained("./email_career_classifier")
print(" Model saved to ./email_career_classifier")


 Model saved to ./email_career_classifier


**Model Inference and Prediction Pipeline**

In [None]:
from transformers import pipeline

In [None]:

model_path = "./email_career_classifier"
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)


Device set to use cpu


In [None]:
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client transformers torch


In [None]:
from google.colab import files
files.upload()


Saving credentials.json to credentials (2).json


{'credentials (2).json': b'{"web":{"client_id":"1099353314869-31ckl4r62qpqnrpkssfqo06gdiucd32g.apps.googleusercontent.com","project_id":"emailautomationagent-476713","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["http://localhost:8080","http://127.0.0.1:8080","http://localhost:3000"]}}'}

**Configure Gmail API Authentication Credentials**

Saving the provided OAuth 2.0 client credentials as a credentials.json file to enable Gmail API authentication.

In [None]:
import json

credentials_data = """
{
  "installed": {
    "client_id": "1099353314869-4tsefcqrgegb5trs87fdbfksr8cvlmor.apps.googleusercontent.com",
    "project_id": "emailautomationagent",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_secret": "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",
    "redirect_uris": ["http://localhost"]
  }
}
"""

with open("credentials.json", "w") as f:
    f.write(credentials_data)

print(" credentials.json file created successfully!")


 credentials.json file created successfully!


**Install Google API Client Libraries**

In [None]:
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client


Collecting google-auth
  Downloading google_auth-2.42.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.2.3-py3-none-any.whl.metadata (3.1 kB)
Collecting google-auth-httplib2
  Downloading google_auth_httplib2-0.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client
  Downloading google_api_python_client-2.186.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth
  Downloading google_auth-2.41.1-py2.py3-none-any.whl.metadata (6.6 kB)
Downloading google_auth_oauthlib-1.2.3-py3-none-any.whl (19 kB)
Downloading google_auth-2.41.1-py2.py3-none-any.whl (221 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m221.3/221.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_auth_httplib2-0.2.1-py3-none-any.whl (9.5 kB)
Downloading google_api_python_client-2.186.0-py3-none-any.whl (14.5 MB)
[2K   

**Authenticate and Connect to Gmail API**

In [None]:
import google_auth_oauthlib.flow
print(google_auth_oauthlib.flow.__file__)

from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Gmail API scope (modify = read, star, archive, etc.)
SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

# Path to your downloaded credentials.json (web type)
CLIENT_SECRET_FILE = 'credentials.json'

# Initialize OAuth flow
flow = InstalledAppFlow.from_client_secrets_file(
    CLIENT_SECRET_FILE,
    SCOPES,
    redirect_uri='https://developers.google.com/oauthplayground'  # Web redirect URI
)

# Step 1: Generate authorization URL
auth_url, _ = flow.authorization_url(prompt='consent')
print(" Go to this URL and authorize access:")
print(auth_url)

# Step 2: After authorization, paste the code from the browser
code = input("\n Enter the authorization code here: ")

# Step 3: Fetch the access token using the code
flow.fetch_token(code=code)
creds = flow.credentials

# Step 4: Build Gmail API service
service = build('gmail', 'v1', credentials=creds)
print(" Gmail API connected successfully!")


/usr/local/lib/python3.12/dist-packages/google_auth_oauthlib/flow.py
 Go to this URL and authorize access:
https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=1099353314869-31ckl4r62qpqnrpkssfqo06gdiucd32g.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fdevelopers.google.com%2Foauthplayground&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.modify&state=IAdZfVbPaSlfvG7kNEqQCqQ3KXwbvL&prompt=consent&access_type=offline

 Enter the authorization code here: 4/0Ab32j92SnMknfdDzuczxt1vu2-h5wjOiCrCmetVoEHGp2KjcX3dwcrYvPT5tiVHKN0Hb3A
 Gmail API connected successfully!


In [None]:
results = service.users().messages().list(userId='me', labelIds=['INBOX'], q="is:unread").execute()
messages = results.get('messages', [])

print(f" Found {len(messages)} unread messages.")


 Found 100 unread messages.


**Email Classification and Automation Agent**

Fetches emails using the Gmail API, classifies them using the fine-tuned BERT model, and performs automated actions

In [None]:
from transformers import pipeline
import base64
import re

# ---- Load your fine-tuned model ----
classifier = pipeline(
    "text-classification",
    model="./email_ai_agent",
    tokenizer="./email_ai_agent"
)

# ---------- Helper to get recent emails ----------
def get_recent_emails(service, max_results=20):
    try:
        results = service.users().messages().list(userId='me', maxResults=max_results).execute()
        messages = results.get('messages', [])
        emails = []

        for msg in messages:
            try:
                msg_data = service.users().messages().get(userId='me', id=msg['id'], format='full').execute()
                payload = msg_data['payload']
                headers = payload.get("headers", [])
                subject = next((h['value'] for h in headers if h['name'] == 'Subject'), "(No Subject)")

                # Extract sender information
                sender = next((h['value'] for h in headers if h['name'] == 'From'), "")

                # Extract body text more reliably
                body = extract_email_body(payload)

                # Clean the body text
                body = clean_text(body)

                emails.append({
                    'id': msg['id'],
                    'subject': subject,
                    'sender': sender,
                    'text': body
                })
            except Exception as e:
                print(f"Error processing email {msg['id']}: {str(e)}")
                continue

        return emails
    except Exception as e:
        print(f"Error fetching emails: {str(e)}")
        return []

def extract_email_body(payload):
    """Extract email body from payload"""
    body = ""

    if 'parts' in payload:
        # Multipart email
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                if 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
            elif part['mimeType'] == 'text/html' and not body:
                # Fallback to HTML if plain text not available
                if 'data' in part['body']:
                    html_content = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore')
                    # Simple HTML tag removal
                    body = re.sub('<[^<]+?>', '', html_content)
    elif 'body' in payload and 'data' in payload['body']:
        # Single part email
        body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore')

    return body

def clean_text(text):
    """Clean and normalize text"""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?@-]', '', text)
    return text.strip()

# ---------- Action based on prediction ----------
def process_email(service, msg_id, label, confidence, subject, sender):
    label = label.lower()
    print(f"Processing: {label} (confidence: {confidence:.2f})")

    try:
        #  Star career-related mails
        if "career" in label or "job" in label or "recruitment" in label:
            service.users().messages().modify(
                userId='me',
                id=msg_id,
                body={'addLabelIds': ['STARRED']}
            ).execute()
            print(" Career mail starred!")


        #  Move promotions/spam to Trash
        elif any(term in label for term in ["promotion", "spam", "advertisement", "newsletter"]):
            # Additional check for common promotional senders
            promotional_terms = ["amazon", "flipkart", "canva", "shopping", "deal", "offer", "discount"]
            subject_lower = subject.lower()
            sender_lower = sender.lower()

            if any(term in subject_lower or term in sender_lower for term in promotional_terms):
                service.users().messages().trash(userId='me', id=msg_id).execute()
                print(" Promotional mail moved to Trash!")
            else:
                print(" Suspicious mail kept for review")

        #  Keep personal mails as-is
        elif "personal" in label or "family" in label or "friend" in label:
            print(" Personal mail kept in Inbox!")

        #  Default action for unclassified emails
        else:
            print(" Mail kept in Inbox (default)")

    except Exception as e:
        print(f" Error processing email {msg_id}: {str(e)}")

# ---------- Main Agent ----------
def run_email_agent(service):
    emails = get_recent_emails(service, max_results=20)
    print(f" Checking {len(emails)} recent emails...\n")

    if not emails:
        print("No emails found or error fetching emails.")
        return

    for mail in emails:
        try:
            # Combine subject and first 500 chars of body for classification
            text_for_classification = f"{mail['subject']} {mail['text'][:500]}".strip()

            if not text_for_classification.strip():
                print(f" Skipping empty email: {mail['subject']}")
                continue

            # Get prediction
            prediction = classifier(text_for_classification, truncation=True, max_length=512)[0]
            label = prediction['label']
            confidence = prediction['score']

            print(f"\n{'='*50}")
            print(f" Subject: {mail['subject']}")
            print(f" From: {mail['sender']}")
            print(f" Predicted: {label} (confidence: {confidence:.2f})")
            print(f"{'='*50}")

            # Process the email
            process_email(service, mail['id'], label, confidence, mail['subject'], mail['sender'])

        except Exception as e:
            print(f" Error classifying email '{mail['subject']}': {str(e)}")
            continue

    print(f"\n Processed {len(emails)} emails successfully!")

# Usage
if __name__ == "__main__":
    # Make sure you have the 'service' object from your authentication
    run_email_agent(service)

Device set to use cpu


 Checking 20 recent emails...


 Subject: [GitHub] A Google identity was just linked to your GitHub account.
 From: GitHub <noreply@github.com>
 Predicted: LABEL_2 (confidence: 0.50)
Processing: label_2 (confidence: 0.50)
 Mail kept in Inbox (default)

 Subject: [GitHub] Please verify your device
 From: GitHub <noreply@github.com>
 Predicted: LABEL_0 (confidence: 0.41)
Processing: label_0 (confidence: 0.41)
 Mail kept in Inbox (default)

 Subject: ü§ñ Meet your AI agent, ready to help 24/7
 From: Text Team <support@text.com>
 Predicted: LABEL_2 (confidence: 0.56)
Processing: label_2 (confidence: 0.56)
 Mail kept in Inbox (default)

 Subject: Organise and uncover your team's big ideas
 From: Canva <marketing@engage.canva.com>
 Predicted: LABEL_2 (confidence: 0.47)
Processing: label_2 (confidence: 0.47)
 Mail kept in Inbox (default)

 Subject: 
 From: Jamalpur Sai pranathy <saipranathyjamalpur@gmail.com>
 Predicted: LABEL_2 (confidence: 0.41)
Processing: label_2 (confidence: 0.41)
 Mail