In [56]:
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import base64
import pandas as pd
import time
import os
import json
from openai import AzureOpenAI
import spacy
import openai
from dotenv import load_dotenv

In [57]:
def mailContent():
    
    ''' Function to Extract the Email content for the Last 12 hours'''
    
    SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open("token.json", "w") as token:
            token.write(creds.to_json())

    try:
        df = pd.DataFrame()
        service = build("gmail", "v1", credentials=creds) # Call the Gmail API
        twelve_hours_ago = int(time.time()) - 12 * 3600  # Filter emails from last 12 hours
        query = f"after:{twelve_hours_ago}"
        results = (
            service.users().messages().list(userId="me", labelIds=["INBOX"], q=query).execute() 
        )  # , "CATEGORY_PERSONAL"
        messages = results.get("messages", [])

        if not messages:
            print("No messages found.")
            return
        
        i=0
        for message in messages:
            df.loc[i,'MessageId'] = message["id"] # Email Message Id
            msg = (
                service.users().messages().get(userId="me", id=message["id"]).execute()
            )
            payload = msg['payload']
            headers = payload['headers']
            df.loc[i,"Date"] = next((header['value'] for header in headers if header['name'] == 'Date'), None) # The first Email Received Date
            df.loc[i,'Subject'] = next((header['value'] for header in headers if header['name'] == 'Subject'), None) # The first Mail Subject
            df.loc[i,'sender'] = next((header['value'] for header in headers if header['name'] == 'From'), None) # Sender Email Address
            
            #Retrieve Email Body
            parts = payload.get('parts')
            if parts and 'data' in parts[0].get('body', {}):
                data = parts[0]['body']['data']
                df.loc[i,'body'] = base64.urlsafe_b64decode(data).decode('utf-8')
            else:
                body = payload['body'].get('data')
                if body:
                    df.loc[i,'body'] = base64.urlsafe_b64decode(body).decode('utf-8')
                else:
                    df.loc[i,'body'] = ''
            i=i+1
        return df
    except HttpError as error:
        print(f"An error occurred: {error}")
        return



In [76]:
result = mailContent()
result

Unnamed: 0,MessageId,Date,Subject,sender,body
0,19910c138642f6c8,"Wed, 03 Sep 2025 18:05:21 +0000 (UTC)",[REMOTE] Go-to-Market Team Member - Zapier ($7...,Elizabeth <elizabeth@bandana.com>,Hi there!\r\n\r\n*Zapier* has opened up hiring...
1,19910b324236b8d5,"Wed, 3 Sep 2025 17:49:58 +0000",Statistics Department Grader Position: MSDS In...,MSDS Intranet <notifications@instructure.com>,*************************************\r\nStati...
2,19910b10a2bcbc96,"Wed, 3 Sep 2025 17:32:44 +0000",Rutgers Health Service Corps – Accepting Appli...,Rutgers Health Service Corps <rhsc@rbhs.rutger...,
3,199108a9a37a7463,"Wed, 03 Sep 2025 17:05:41 +0000",Thank you for applying with The Longwood Colle...,longwoodcollective@theapplicantmanager.com,"Dear Shruti,\r\n\r\nThank you&nbsp;for your in..."
4,1991077f8fcfbb59,"Wed, 03 Sep 2025 16:45:20 +0000 (UTC)",Google - Tech Apprenticeships,Imani <imani@bandana.com>,18 Month Free Training\r\n\r\nHi there!\r\n\r\...
5,199104beec2fec82,"Wed, 03 Sep 2025 15:57:15 +0000",Thank you for your interest and application to...,2nd Order Solutions <no-reply@hire.lever.co>,"<div>\r\n<div>Hi Shruti,</div>\r\n<div>&nbsp;<..."
6,1991045009c82211,"Wed, 03 Sep 2025 15:49:41 +0000",Thank you for your interest and application to...,2nd Order Solutions <no-reply@hire.lever.co>,"<div>\r\n<div>Hi Shruti,</div>\r\n<div>&nbsp;<..."
7,199103425c5426a9,"Wed, 03 Sep 2025 15:31:16 +0000 (UTC)",175+ Paid Internships — $150K & $62/hr Roles C...,CareerCamper <careercamper@mail.beehiiv.com>,# Welcome to this week’s CareerCamper Internsh...
8,199100050910adf2,"Wed, 03 Sep 2025 14:34:39 +0000",Gemini | Thanks for Your Interest,careers@gemini.com,"Hi Shruti,\r\n\r\nThank you for applying to th..."
9,1990f7c0f698b3e1,"Wed, 03 Sep 2025 12:10:10 +0000",Grammarly Pro has a new vibe—get 50% off today 🔥,Grammarly <hello@mail.grammarly.com>,Grammarly Discover a smarter surface for schoo...


In [77]:
nlp = spacy.load("en_core_web_trf")

def extract_companies(text):
    
    ''' Function to get Company Name'''
    
    doc = nlp(text['sender'])
    comp = [entity.text for entity in doc.ents if entity.label_ == "ORG"]
    # If the company Name isn't present in the Sender ID
    if  comp== []:
        doc = nlp(text['Subject'])
        comp = [entity.text for entity in doc.ents if entity.label_ == "ORG"]
        # If the company Name isn't present in the Email Subject
        if comp == []:
            domain= text['sender'].split('@')[1].strip()
            if len(domain.split('.')) >= 2:
                return domain.split('.')[-2]
            else:
                return domain
        else:
            return ', '.join(comp)
    else:
        return ', '.join(comp)


In [78]:
def extract_role(email_body):
    
    ''' Function to extract role and the applied and it's Status'''
    
    #User Prompt
    prompt = f'''
    Below is an email received by a user.

    Body:
    "{email_body[:15556]}"

    Your task is to determine whether this email is related to a job application. Specifically, check if it includes:

    - A confirmation of a job application submission
    - A status update regarding the application
    - A rejection notice
    - An interview schedule notification

    If it is related, return ONLY the following **raw JSON** object—without markdown, text, or any extra formatting:

    {{
        "Role": "The job title or role the user applied for",
        "Status": "Current status of the job application"
    }}

    **Valid values for "Status" are:**
    - "Applied"
    - "Interview Scheduled"
    - "Rejected"
    
    If the email is not related to a job application, return NOTHING (not even an empty string, explanation, or JSON).
'''
    client = AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), 
        api_key=os.getenv("AZURE_OPENAI_API_KEY"), 
        api_version="2025-01-01-preview",
    )
    response = client.chat.completions.create(
        model="gpt-4o",  
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    content = response.choices[0].message.content.strip()
    
    # Basic validation before parsing
    if not content.startswith("{"):
        print("Model returned non-JSON output:", content)
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        print(" JSON parsing error:", e)
        print("Returned content:", content)
        return None
    
    
#print(extract_role(result.loc[1, 'body']))


{'Role': 'Statistics Department Grader', 'Status': 'Applied'}


In [94]:
def JobTracker(df):    
    result = mailContent()
    
    for ind in range(len(result)): 
        print(f'{ind+1}/{len(result)}')
        #Extract Company Name
        comp=extract_companies(result.loc[ind])
        #Extract Role and Status of the Job Application
        response = extract_role(result.loc[ind, 'body'])
        #If response is None ignore email(Email not related to Job Application)
        if response is None:
            continue
        mask = (df["CompanyName"] == comp) & (df["Role"] == response.get('Role'))
        if df[mask].empty:
            # Add a new row
            new_row = pd.DataFrame([{"AppliedDate": result.loc[ind, "Date"],"CompanyName": comp,"Role": response.get('Role'),"Status": response.get('Status'),
                "LastUpdated": result.loc[ind, "Date"]}])
            df = pd.concat([df, new_row], ignore_index=True)
        else:
            # Update the existing row
            df.loc[mask, ['Status', 'LastUpdated']] = [
                response.get('Status'), result.loc[ind, 'Date']
            ]
            time.sleep(1)
    df.to_csv('TrackerUpdate.csv',index=False)

In [97]:
file_path = "TrackerUpdate.csv"

if os.path.exists(file_path):
    #Update Job Tracker
    df = pd.read_csv(file_path)
    JobTracker(df)
else:
    #Create New CSV if its run the first time
    df = pd.DataFrame({'AppliedDate': pd.Series(dtype='str'),'CompanyName': pd.Series(dtype='str'),'Role': pd.Series(dtype='str'),
                        'Status': pd.Series(dtype='str'),'LastUpdated': pd.Series(dtype='str')})
    JobTracker(df)

1/14
Model returned non-JSON output: 
2/14
Model returned non-JSON output: 
3/14
4/14
Model returned non-JSON output: 
5/14
6/14
Model returned non-JSON output: 
7/14
8/14
9/14
Model returned non-JSON output: 
10/14
11/14
Model returned non-JSON output: 
12/14
Model returned non-JSON output: 
13/14
Model returned non-JSON output: 
14/14
Model returned non-JSON output: 
