In [2]:
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import base64
import pandas as pd
import time
import os
import json
from openai import AzureOpenAI
import spacy
import openai

In [3]:
def mailContent():
    
    ''' Function to Extract the Email content for the Last 12 hours'''
    
    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open("token.json", "w") as token:
            token.write(creds.to_json())

    try:
        df = pd.DataFrame()
        service = build("gmail", "v1", credentials=creds) # Call the Gmail API
        twelve_hours_ago = int(time.time()) - 12 * 3600  # Filter emails from last 12 hours
        query = f"after:{twelve_hours_ago}"
        results = (
            service.users().messages().list(userId="me", labelIds=["INBOX"], q=query).execute() 
        )  # , "CATEGORY_PERSONAL"
        messages = results.get("messages", [])

        if not messages:
            print("No messages found.")
            return
        
        i=0
        for message in messages:
            df.loc[i,'MessageId'] = message["id"] # Email Message Id
            msg = (
                service.users().messages().get(userId="me", id=message["id"]).execute()
            )
            payload = msg['payload']
            headers = payload['headers']
            df.loc[i,"Date"] = next((header['value'] for header in headers if header['name'] == 'Date'), None) # The first Email Received Date
            df.loc[i,'Subject'] = next((header['value'] for header in headers if header['name'] == 'Subject'), None) # The first Mail Subject
            df.loc[i,'sender'] = next((header['value'] for header in headers if header['name'] == 'From'), None) # Sender Email Address
            
            #Retrieve Email Body
            parts = payload.get('parts')
            if parts and 'data' in parts[0].get('body', {}):
                data = parts[0]['body']['data']
                df.loc[i,'body'] = base64.urlsafe_b64decode(data).decode('utf-8')
            else:
                body = payload['body'].get('data')
                if body:
                    df.loc[i,'body'] = base64.urlsafe_b64decode(body).decode('utf-8')
                else:
                    df.loc[i,'body'] = ''
            i=i+1
        return df
    except HttpError as error:
        print(f"An error occurred: {error}")
        return



In [4]:
result = mailContent()
result

Unnamed: 0,MessageId,Date,Subject,sender,body
0,1991126c5f350193,"Wed, 03 Sep 2025 19:56:17 +0000","Jobs like ""Data Analytics Intern (SUMMER 2024)...",Jobs beBee <alert@notification.bebee.com>,"<html><head><meta http-equiv=""Content-Type"" co..."
1,199111cc730ab50b,"Wed, 03 Sep 2025 19:45:21 +0000 (UTC)",[Remote] Trunk Tools - Customer Success ($67.3...,Elizabeth <elizabeth@bandana.com>,This role offers equity!\r\n\r\nHi there!\r\n\...
2,19911139a85320e4,"Wed, 03 Sep 2025 19:35:20 +0000 (UTC)",Hudson River Trading Internship - $145/HR,Rudy <rudy@bandana.com>,Hey!\r\n\r\n*Hudson River Trading* are hiring ...
3,1991112fda8f79c4,"Wed, 03 Sep 2025 19:34:40 +0000",Hot job at Precision Pulley & Idler—worth a lo...,Handshake <handshake@notifications.joinhandsha...,Precision Pulley & Idler is hiring—this could ...
4,19910fb3fd0fe1d8,"Wed, 03 Sep 2025 19:08:44 +0000 (UTC)",RippleMatch - 🎨🚀 Market the Future of GIS – Pr...,The RippleMatch Team <jobmatches@ripplematch.com>,"<p>Hi Shruti,</p><p>This summer, join <strong>..."
5,19910f1068977d0e,"Wed, 03 Sep 2025 18:56:17 +0000",New jobs posted from Gallo,ejgallo-jobnotification@noreply.jobs2web.com,"<span style=""margin-bottom:11.0px; font-size:1..."
6,19910eaca6bbdb9c,"Wed, 03 Sep 2025 18:50:46 +0000 (UTC)",[Remote] Virta Health - Customer Specialist,Imani <imani@bandana.com>,Hi there!\r\n\r\n*Virta Health* is hiring for ...
7,19910e05e78bea55,"Wed, 03 Sep 2025 18:39:23 +0000","Shruti: Rutgers University, Stay At Home Stuff...","""New Brunswick Jobs (via Jobcase)"" <email@umai...",https://obct.jobcase.com/t/?d=fVmu4auEcxi1548k...
8,19910df2fde3348b,"Wed, 03 Sep 2025 18:38:04 +0000","Shruti, Incedo is hiring.","""Incedo (via Jobcase)"" <email@umail.jobcase.com>",Job Alert!\r\n\r\n Data Analyst\r\n NEW BR...
9,19910c8d0c66ad7e,"Wed, 3 Sep 2025 18:13:38 +0000",GEMMA MORAN (2025FA - STATS LEARN DS 16:954:53...,GEMMA MORAN <notifications@instructure.com>,\r\n Subject: 2025FA - STATS LEARN DS 16:954:...


In [6]:
nlp = spacy.load("en_core_web_trf")

def extract_companies(text):
    
    ''' Function to get Company Name'''
    
    doc = nlp(text)
    return [entity.text for entity in doc.ents if entity.label_ == "ORG"]


In [7]:
def extract_role(email_body):
    ''' Function to extract role and the applied and it's Status'''
    
    #User Prompt
    prompt = f'''
    Below is an email received by a user.

    Body:
    "{email_body[:15556]}"

    Your task is to determine whether this email is related to a job application. Specifically, check if it includes:

    - A confirmation of a job application submission
    - A status update regarding the application
    - A rejection notice
    - An interview schedule notification

    If it is related, return ONLY the following **raw JSON** object—without markdown, text, or any extra formatting:

    {{
        "Role": "The job title or role the user applied for",
        "Status": "Current status of the job application"
    }}

    **Valid values for "Status" are:**
    - "Applied"
    - "Interview Scheduled"
    - "Rejected"
    
    If the email is not related to a job application, return NOTHING (not even an empty string, explanation, or JSON).
'''
    client = AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), 
        api_key=os.getenv("AZURE_OPENAI_API_KEY"), 
        api_version="2025-01-01-preview",
    )
    response = client.chat.completions.create(
        model="gpt-4o",  
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    content = response.choices[0].message.content.strip()
    
    # Basic validation before parsing
    if not content.startswith("{"):
        print("Model returned non-JSON output:", content)
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        print(" JSON parsing error:", e)
        print("Returned content:", content)
        return None
    
    
#print(extract_role(result.loc[0, 'body']))


In [144]:
def JobTracker(df):    
    result = mailContent()
    if df.empty:
        df = pd.DataFrame({'AppliedDate': pd.Series(dtype='str'),'CompanyName': pd.Series(dtype='str'),'Role': pd.Series(dtype='str'),
                        'Status': pd.Series(dtype='str'),'LastUpdated': pd.Series(dtype='str')})
    for ind in range(len(result)): #len(result)
        print(f'{ind+1}/{len(result)}')
        comp=extract_companies(result.loc[ind, 'sender'])
        comp= ', '.join(comp)
        #user_prompt = get_user_prompt(result.loc[ind, 'body'])
        response = extract_role(result.loc[ind, 'body'])
        print(response)
        #print(response.get('Role'))
        if comp == '':
            domain= result.loc[ind, 'sender'].split('@')[1].strip()
            if len(domain.split('.')) >= 2:
                comp = domain.split('.')[-2]
            else:
                comp = domain
        if (response.get('Role') is None) or (response.get('Role')==''):
            continue
        mask = (df["CompanyName"] == comp) & (df["Role"] == response.get('role'))

        if df[mask].empty:
            # Add a new row
            new_row = pd.DataFrame([{"AppliedDate": result.loc[ind, "Date"],"CompanyName": comp,"Role": response.get('Role'),"Status": response.get('Status'),
                "LastUpdated": result.loc[ind, "Date"]}])
            df = pd.concat([df, new_row], ignore_index=True)
        else:
            # Update the existing row
            df.loc[mask, ['Status', 'LastUpdated']] = [
                response.get('Status'), result.loc[ind, 'Date']
            ]
            time.sleep(1)
    df.to_csv('TrackerUpdate.csv')

In [145]:
df=pd.read_csv('TrackerUpdate.csv', index_col=False)
JobTracker(df)

1/10
{'Role': 'Data Intern', 'Status': 'Rejected'}
2/10
{'Role': 'Tech Apprenticeships', 'Status': 'Applied'}
3/10
{'Role': 'Not provided', 'Status': 'Applied'}
4/10
{'Role': 'Not provided', 'Status': 'Applied'}
5/10
{'Role': None, 'Status': None}
6/10
{'Role': 'Analytics Engineer', 'Status': 'Rejected'}
7/10
{'Role': '', 'Status': ''}
8/10
{'Role': None, 'Status': None}
9/10
{'Role': '', 'Status': ''}
10/10
{'Role': '', 'Status': ''}
