## An automated visa application processing system using AI

##### This project uses multiple AI models to assist a visually impaired person in filling out a visa application. It captures the applicant's photo and then turns the collected details into an E-Visa application.


In [18]:
import os
import json
import base64
from io import BytesIO
from PIL import Image
from IPython.display import Audio, display
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
from transformers import pipeline



Collecting SpeechRecognition
  Using cached speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Using cached speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3


In [6]:
# Initialization: load variables from .env, report whether an OpenAI key was
# found (only its first 8 characters are shown), then create the client.

load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

MODEL = "gpt-4o-mini"
openai = OpenAI()

OpenAI API Key exists and begins sk-proj-


In [7]:
#

from pydub import AudioSegment
from pydub.playback import play

def talker(message):
    """Speak `message` aloud.

    Requests speech audio from the OpenAI "tts-1" model with the "onyx"
    voice, decodes the returned bytes as MP3 and plays them with pydub.
    """
    speech = openai.audio.speech.create(
        model="tts-1",
        voice="onyx",    # "alloy" is another voice worth trying
        input=message,
    )
    mp3_buffer = BytesIO(speech.content)
    play(AudioSegment.from_file(mp3_buffer, format="mp3"))

In [8]:
# System prompt for the Eazee assistant, assembled from its three sentences.
system_message = (
    "You are a helpful assistant for an E Visa application called Eazee. "
    "Give short, courteous answers, no more than 1 sentence. "
    "Always be accurate. If you don't know the answer, say so."
)

In [9]:
import os
import json
import base64
from io import BytesIO
from dotenv import load_dotenv
from openai import OpenAI
from fpdf import FPDF
import cv2
import gradio as gr
from pydub import AudioSegment
from pydub.playback import play
import time

# --- Initialization ---
# Read .env (overriding any values already in the environment), then build the
# OpenAI client with the key taken from OPENAI_API_KEY.
load_dotenv(override=True)
MODEL = "gpt-4o-mini"
openai_api_key = os.getenv('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

def talker(message):
    """Convert `message` to speech (OpenAI "tts-1", voice "onyx") and play it."""
    tts_result = openai.audio.speech.create(model="tts-1", voice="onyx", input=message)
    clip = AudioSegment.from_file(BytesIO(tts_result.content), format="mp3")
    play(clip)

def auto_capture_photo(img_path="visa_photo.jpg"):
    """Grab one frame from camera 0, save it to `img_path` and return it base64-encoded.

    Args:
        img_path: Destination file for the captured frame.

    Returns:
        A tuple ``(img_path, photo_b64)`` where ``photo_b64`` is the base64
        text of the saved image file.

    Raises:
        RuntimeError: If the camera cannot be opened, no frame could be read,
            or the frame could not be written to `img_path`.
    """
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open camera")
        ret, frame = cap.read()
    finally:
        # Always give the device back, including on the failure paths above.
        cap.release()
    if not ret:
        raise RuntimeError("Failed to capture image")
    # cv2.imwrite reports failure through its return value rather than by raising.
    if not cv2.imwrite(img_path, frame):
        raise RuntimeError(f"Failed to save image to {img_path}")
    # Encode the saved file (not the raw frame) so the base64 matches what is on disk.
    with open(img_path, "rb") as f:
        photo_b64 = base64.b64encode(f.read()).decode()
    return img_path, photo_b64

def create_visa_application_pdf(firstname, lastname, address, gender, img_path):
    """Write the applicant's details (and photo, if the file exists) to a one-page PDF.

    Args:
        firstname, lastname, address, gender: Values printed on the form.
        img_path: Path of the applicant photo; skipped when the file is absent.

    Returns:
        The path of the generated file, always "visa_application.pdf".
    """
    form = FPDF()
    form.add_page()
    form.set_font("Arial", size=14)

    # Centered title followed by a blank line.
    form.cell(200, 10, "E-Visa Application Form", ln=True, align="C")
    form.ln(10)

    # Single-line fields, in the order they appear on the form.
    single_line_fields = (
        f"First Name: {firstname}",
        f"Last Name: {lastname}",
        f"Gender: {gender}",
    )
    for line in single_line_fields:
        form.cell(50, 10, line, ln=True)

    # The address may wrap, so it goes in a multi-line cell.
    form.multi_cell(0, 10, f"Address: {address}", align="L")
    form.ln(10)

    form.cell(50, 10, "Applicant Photo:", ln=True)
    photo_available = os.path.exists(img_path)
    if photo_available:
        form.image(img_path, x=form.get_x(), y=form.get_y(), w=40)

    pdf_path = "visa_application.pdf"
    form.output(pdf_path)
    return pdf_path

# Tool schema advertised to the chat model for generating the visa PDF.
# Every parameter is a plain string; all five are marked as required.
Visa_function = {
    "name": "create_visa_application_pdf",
    "description": (
        "Generate a visa application PDF. "
        "Call this whenever you need to create a visa application, for example when a customer asks 'Can you help with my visa application?'"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            field: {"type": "string", "description": text}
            for field, text in (
                ("firstname", "The applicant's first name"),
                ("lastname", "The applicant's last name"),
                ("address", "The applicant's address"),
                ("gender", "The applicant's gender"),
                ("photo", "Base64-encoded image of the applicant's photo"),
            )
        },
        "required": ["firstname", "lastname", "address", "gender", "photo"],
        "additionalProperties": False,
    },
}
tools = [dict(type="function", function=Visa_function)]

def handle_tool_call(tool_call):
    """Run the `create_visa_application_pdf` tool call requested by the model.

    Args:
        tool_call: Object whose ``function.arguments`` attribute is a JSON
            string with ``firstname``, ``lastname``, ``address``, ``gender``
            and a base64-encoded ``photo``.

    Returns:
        Path of the generated PDF.

    Raises:
        ValueError: If the photo is missing or is not valid base64.
    """
    args = json.loads(tool_call.function.arguments)

    photo_b64 = args.get("photo")
    if not photo_b64:
        raise ValueError("Tool call is missing the applicant photo.")

    # Decode BEFORE opening the file: open(..., "wb") truncates an existing
    # photo, so a bad payload must be rejected first.
    try:
        photo_bytes = base64.b64decode(photo_b64)
    except (ValueError, TypeError) as exc:
        raise ValueError("Applicant photo is not valid base64 data.") from exc

    img_path = "visa_photo.jpg"
    with open(img_path, "wb") as f:
        f.write(photo_bytes)

    return create_visa_application_pdf(
        args.get("firstname"),
        args.get("lastname"),
        args.get("address"),
        args.get("gender"),
        img_path,
    )

def chat_with_openai(audio_input, history, photo_b64):
    """Handle one voice turn: transcribe, ask the model, and build the PDF if requested.

    Args:
        audio_input: Path to the recorded audio file, or None.
        history: Prior conversation as a list of ``{"role", "content"}`` dicts
            (without the system message).
        photo_b64: Base64 photo captured earlier in the session, or None.

    Returns:
        A 5-tuple ``(history, photo_b64, pdf_path, text, audio_update)``:
        the updated history, the (possibly newly captured) photo, the PDF path
        or None, the text to display, and ``gr.update(value=None)``.
    """
    # Local import: used only to hand patched arguments to handle_tool_call.
    from types import SimpleNamespace

    user_text = ""
    if audio_input is not None:
        import speech_recognition as sr
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_input) as source:
            audio = recognizer.record(source)
        try:
            user_text = recognizer.recognize_google(audio)
            print(user_text)
        except Exception as exc:
            # Best effort: an unintelligible clip is treated as silence, but say why.
            print(f"Speech recognition failed: {exc}")
            user_text = ""
    if not user_text:
        user_text = " "  # Avoid empty input

    system_prompt = {
        "role": "system",
        "content": "You are a helpful assistant for E-Visa applications. Collect all required info and call the function when ready.",
    }
    # Keep the system message out of the stored history; otherwise it would be
    # prepended again (and so duplicated) on every turn.
    conversation = list(history) + [{"role": "user", "content": user_text}]

    response = openai.chat.completions.create(
        model=MODEL,
        messages=[system_prompt] + conversation,
        tools=tools,
        tool_choice="auto"
    )
    reply = response.choices[0].message

    # Speak the assistant's reply
    if reply.content:
        talker(reply.content)

    if not reply.tool_calls:
        if reply.content:
            conversation.append({"role": "assistant", "content": reply.content})
        return conversation, photo_b64, None, reply.content, gr.update(value=None)

    # The model wants to create the PDF.
    tool_call = reply.tool_calls[0]
    args = json.loads(tool_call.function.arguments)
    if not args.get("photo"):
        if not photo_b64:
            talker("I will now take your photo. Please look at the camera.")
            _, photo_b64 = auto_capture_photo()
        args["photo"] = photo_b64

    # handle_tool_call re-parses `function.arguments`, so the photo must travel
    # inside the arguments JSON; editing the local `args` dict alone is lost.
    patched_call = SimpleNamespace(function=SimpleNamespace(arguments=json.dumps(args)))
    pdf_path = handle_tool_call(patched_call)

    done_message = "Your visa application is ready. Download below."
    talker("Your visa application is ready. You can download it now.")
    conversation.append({"role": "assistant", "content": done_message})
    return conversation, photo_b64, pdf_path, done_message, gr.update(value=None)

# Voice-driven UI: components are created in display order inside the Blocks
# context, and a change of the audio component drives one chat turn.
with gr.Blocks() as demo:
    gr.Markdown("## Eazee: Voice-Driven E-Visa Assistant for the Visually Impaired")
    # Per-session state passed into and returned from every chat turn.
    history = gr.State([])
    photo_b64 = gr.State(None)
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak to the assistant", autoplay=True, streaming=True)
    output_text = gr.Textbox(label="Assistant", interactive=False)
    pdf_file = gr.File(label="Download Visa Application PDF")

    def auto_submit(audio, history, photo_b64):
        """Pause for 1.5 seconds, then run one chat turn on the recorded audio."""
        # Fixed delay before processing the recording
        time.sleep(1.5)
        return chat_with_openai(audio, history, photo_b64)

    # chat_with_openai returns five values matching `outputs`; the last one is
    # gr.update(value=None), which is applied to the audio component.
    audio_input.change(
        auto_submit,
        inputs=[audio_input, history, photo_b64],
        outputs=[history, photo_b64, pdf_file, output_text, audio_input]
    )

    # No manual submit button is defined here; this hint tells the user to record again
    gr.Markdown("If you need to repeat, just speak again.")

demo.launch()

* Running on local URL:  http://127.0.0.1:7876
* To create a public link, set `share=True` in `launch()`.




In [21]:
import os
import json
import base64
from io import BytesIO
from dotenv import load_dotenv
from openai import OpenAI
from fpdf import FPDF
import cv2
import gradio as gr
from pydub import AudioSegment
from pydub.playback import play
import time
import logging
import numpy as np
import soundfile as sf
from transformers import pipeline

# --- Logging Setup ---
# All log records at DEBUG and above go to debug.log in the working directory.
logging.basicConfig(filename='debug.log', level=logging.DEBUG, format='%(asctime)s %(levelname)s:%(message)s')

# --- Initialization ---
MODEL = "gpt-4o-mini"
try:
    load_dotenv(override=True)
    openai_api_key = os.getenv('OPENAI_API_KEY')
    openai = OpenAI(api_key=openai_api_key)
    logging.info("Initialization successful.")
except Exception as e:
    logging.error(f"Initialization failed: {e}")
    # Re-raise: continuing would leave `openai` undefined (or stale from an
    # earlier cell) and every later call would fail far from the real cause.
    raise


def transcribeMain(audio):
    """Transcribe an audio file with Google Speech Recognition.

    Args:
        audio: Path to the audio file, or None.

    Returns:
        The recognized text, or a string starting with "Error" describing why
        transcription failed (errors are returned, not raised).
    """
    if audio is None:
        return "Error: No audio input received."

    # Imported and created here so this function does not depend on another
    # notebook cell having defined `sr` and `recognizer` first.
    import speech_recognition as sr

    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(audio) as source:
            # No adjust_for_ambient_noise() here: it reads from the start of
            # the source, so that part of the clip would never reach record().
            audio_data = recognizer.record(source)  # Capture the entire audio file

        # Perform speech recognition using Google Speech Recognition
        text = recognizer.recognize_google(audio_data)
        logging.info(text)
        return text

    except sr.UnknownValueError:
        return "Error: Could not understand the audio."
    except sr.RequestError:
        return "Error: Could not request results from the speech recognition service."
    except Exception as e:
        return f"Error processing audio: {str(e)}"


# Speech-to-text pipeline backed by the "openai/whisper-base.en" checkpoint;
# it is created once, when this cell runs.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)

# Define the function to process audio input
def transcribe(audio):
    """Transcribe the audio file at `audio` with the Whisper pipeline.

    Returns the recognized text, or a string beginning with "Error" when no
    audio was given or processing failed.
    """
    if audio is None:
        return "Error: No audio input received."

    try:
        # Reading the file first makes unreadable audio fail inside this try.
        samples, _sample_rate = sf.read(audio)
        if isinstance(samples, np.ndarray):
            result = asr_pipeline(audio)
            return result["text"]
        return "Error: Invalid audio format."
    except Exception as exc:
        return f"Error processing audio: {str(exc)}"
    
def talker(message):
    """Speak `message` via OpenAI text-to-speech; failures are logged, not raised."""
    try:
        speech = openai.audio.speech.create(model="tts-1", voice="onyx", input=message)
        clip = AudioSegment.from_file(BytesIO(speech.content), format="mp3")
        play(clip)
        logging.info("Spoke message successfully.")
    except Exception as err:
        logging.error(f"Talker error: {err}")

def auto_capture_photo(img_path="visa_photo.jpg"):
    """Grab one frame from camera 0, save it to `img_path` and return it base64-encoded.

    Args:
        img_path: Destination file for the captured frame.

    Returns:
        A tuple ``(img_path, photo_b64)`` where ``photo_b64`` is the base64
        text of the saved image file.

    Raises:
        RuntimeError: If the camera cannot be opened, no frame could be read,
            or the frame could not be written. Any error is logged and re-raised.
    """
    try:
        cap = cv2.VideoCapture(0)
        try:
            if not cap.isOpened():
                raise RuntimeError("Could not open camera")
            ret, frame = cap.read()
        finally:
            # Always give the device back, including on the failure paths above.
            cap.release()
        if not ret:
            raise RuntimeError("Failed to capture image")
        # cv2.imwrite reports failure through its return value rather than by raising.
        if not cv2.imwrite(img_path, frame):
            raise RuntimeError(f"Failed to save image to {img_path}")
        with open(img_path, "rb") as f:
            photo_b64 = base64.b64encode(f.read()).decode()
        logging.info("Photo captured and encoded.")
        return img_path, photo_b64
    except Exception as e:
        logging.error(f"Photo capture error: {e}")
        raise

def create_visa_application_pdf(firstname, lastname, address, gender, img_path):
    """Render the visa application as "visa_application.pdf" and return that path.

    The form lists first name, last name, gender and address, then embeds the
    photo at `img_path` when that file exists. Any error is logged and re-raised.
    """
    try:
        doc = FPDF()
        doc.add_page()
        doc.set_font("Arial", size=14)

        doc.cell(200, 10, "E-Visa Application Form", ln=True, align="C")
        doc.ln(10)

        labelled_values = (
            ("First Name", firstname),
            ("Last Name", lastname),
            ("Gender", gender),
        )
        for label, value in labelled_values:
            doc.cell(50, 10, f"{label}: {value}", ln=True)
        # Address can span several lines, hence multi_cell.
        doc.multi_cell(0, 10, f"Address: {address}", align="L")
        doc.ln(10)

        doc.cell(50, 10, "Applicant Photo:", ln=True)
        if os.path.exists(img_path):
            doc.image(img_path, x=doc.get_x(), y=doc.get_y(), w=40)

        pdf_path = "visa_application.pdf"
        doc.output(pdf_path)
        logging.info("PDF created successfully.")
        return pdf_path
    except Exception as err:
        logging.error(f"PDF creation error: {err}")
        raise

# Function-calling schema for the PDF generator: five string parameters,
# all of them required, no extra properties allowed.
Visa_function = dict(
    name="create_visa_application_pdf",
    description=(
        "Generate a visa application PDF. "
        "Call this whenever you need to create a visa application, for example when a customer asks 'Can you help with my visa application?'"
    ),
    parameters=dict(
        type="object",
        properties=dict(
            firstname=dict(type="string", description="The applicant's first name"),
            lastname=dict(type="string", description="The applicant's last name"),
            address=dict(type="string", description="The applicant's address"),
            gender=dict(type="string", description="The applicant's gender"),
            photo=dict(type="string", description="Base64-encoded image of the applicant's photo"),
        ),
        required=["firstname", "lastname", "address", "gender", "photo"],
        additionalProperties=False,
    ),
)
tools = [{"type": "function", "function": Visa_function}]

def handle_tool_call(tool_call):
    """Run the `create_visa_application_pdf` tool call requested by the model.

    Args:
        tool_call: Object whose ``function.arguments`` attribute is a JSON
            string with ``firstname``, ``lastname``, ``address``, ``gender``
            and a base64-encoded ``photo``.

    Returns:
        Path of the generated PDF.

    Raises:
        ValueError: If the photo is missing or is not valid base64.
        Any other error from parsing or PDF creation is logged and re-raised.
    """
    try:
        args = json.loads(tool_call.function.arguments)

        photo_b64 = args.get("photo")
        if not photo_b64:
            raise ValueError("Tool call is missing the applicant photo.")

        # Decode BEFORE opening the file: open(..., "wb") truncates an existing
        # photo, so a bad payload must be rejected first.
        try:
            photo_bytes = base64.b64decode(photo_b64)
        except (ValueError, TypeError) as exc:
            raise ValueError("Applicant photo is not valid base64 data.") from exc

        img_path = "visa_photo.jpg"
        with open(img_path, "wb") as f:
            f.write(photo_bytes)

        pdf_path = create_visa_application_pdf(
            args.get("firstname"),
            args.get("lastname"),
            args.get("address"),
            args.get("gender"),
            img_path,
        )
        logging.info("Tool call handled successfully.")
        return pdf_path
    except Exception as e:
        logging.error(f"Tool call error: {e}")
        raise

# --- Main Chat Logic ---
# (state key, spoken keyword) pairs, in the order the form fields are matched.
_VISA_FIELDS = (
    ("firstname", "first name"),
    ("lastname", "last name"),
    ("address", "address"),
    ("gender", "gender"),
)

# Prefixes of the failure strings that transcribeMain returns instead of raising.
_TRANSCRIPTION_ERROR_PREFIXES = ("Error: ", "Error processing audio: ")


def _value_after_keyword(user_text, keyword):
    """Return the text following the last case-insensitive occurrence of `keyword`."""
    import re  # local import: only this helper needs it

    tail = re.split(re.escape(keyword), user_text, flags=re.IGNORECASE)[-1]
    tail = tail.strip(":,. ")
    # "my first name is John" -> "John": drop a leading standalone "is".
    tail = re.sub(r"^is\b[\s:,.]*", "", tail, flags=re.IGNORECASE)
    return tail.strip(":,. ")


def _update_collected(user_text, history, collected):
    """Store the user's answer in `collected`, by keyword or by the last question asked."""
    lowered = user_text.lower()
    # 1) The user named the field explicitly ("my last name is ...").
    for field, keyword in _VISA_FIELDS:
        if not collected[field] and keyword in lowered:
            collected[field] = _value_after_keyword(user_text, keyword)
            logging.info("Captured %s from keyword.", field)
            return
    # 2) Otherwise treat the whole utterance as the answer to the assistant's last question.
    if history:
        last_turn = history[-1]
        last_assistant = last_turn[1].lower() if isinstance(last_turn, (list, tuple)) else ""
        for field, keyword in _VISA_FIELDS:
            if keyword in last_assistant and not collected[field]:
                collected[field] = user_text.strip()
                logging.info("Captured %s from conversation context.", field)
                return


def chat_with_openai(audio_input, history, photo_b64, collected):
    """Handle one turn of the form-filling conversation.

    Args:
        audio_input: Path to the recorded audio file, or None.
        history: List of ``[user_text, assistant_text]`` pairs from earlier turns.
        photo_b64: Base64 photo from an earlier capture, or None.
        collected: Dict of the fields gathered so far; an empty or None value
            starts a fresh form.

    Returns:
        A 5-tuple ``(history, photo_b64, pdf_path, text, collected)``.
        ``pdf_path`` is None until every field has been collected. Once the
        PDF is produced, history and collected are both reset for the next
        applicant. On any exception the error text is returned in ``text``.
    """
    try:
        user_text = transcribeMain(audio_input)
        logging.info("Transcription finished.")

        if not collected:
            collected = {"firstname": None, "lastname": None, "address": None, "gender": None}

        # transcribeMain reports failures as "Error..." strings; those must
        # never be stored as the applicant's name, address or gender.
        transcription_failed = user_text.startswith(_TRANSCRIPTION_ERROR_PREFIXES)
        if not transcription_failed:
            _update_collected(user_text, history, collected)

        if all(collected.values()):
            talker("Thank you. I will now take your photo. Please look at the camera.")
            logging.info("All fields collected; capturing photo.")
            img_path, photo_b64 = auto_capture_photo()
            pdf_path = create_visa_application_pdf(
                collected["firstname"], collected["lastname"], collected["address"], collected["gender"], img_path
            )
            talker("Your visa application is ready. You can download it now.")
            logging.info("All fields collected and PDF generated.")
            # Reset the collected fields together with the history; otherwise
            # the next click would immediately retake the photo.
            return [], photo_b64, pdf_path, "Your visa application is ready. Download below.", {}

        if not history:
            greeting = "Welcome to Eazee! I will help you fill out your visa application. What is your first name?"
            talker(greeting)
            logging.info("Started new conversation.")
            return [[user_text, greeting]], photo_b64, None, greeting, collected

        if transcription_failed:
            talker("Sorry, I did not catch that. Please try again.")
            logging.info("Transcription failed; asked the user to repeat.")
            return history, photo_b64, None, user_text, collected

        missing = [k for k, v in collected.items() if not v]
        system_prompt = (
            "You are a helpful assistant for E-Visa applications. "
            "Collect only first name, last name, address, and gender. "
            "Do not ask for or mention photo. "
            "Ask for only one missing field at a time. "
            f"Fields still missing: {', '.join(missing)}."
        )
        messages = [{"role": "system", "content": system_prompt}]
        for h in history:
            if isinstance(h, (list, tuple)) and len(h) == 2:
                messages.append({"role": "user", "content": h[0]})
                messages.append({"role": "assistant", "content": h[1]})
        messages.append({"role": "user", "content": user_text})

        response = openai.chat.completions.create(
            model=MODEL,
            messages=messages
        )
        reply = response.choices[0].message.content
        talker(reply)
        history.append([user_text, reply])
        logging.info("Assistant replied.")
        return history, photo_b64, None, reply, collected
    except Exception as e:
        logging.error(f"Chat error: {e}")
        return history, photo_b64, None, f"Error: {e}", collected

# --- Gradio UI ---
# Components are created in display order inside the Blocks context; each click
# on the button runs one chat_with_openai turn.
with gr.Blocks() as demo:
    gr.Markdown("## Eazee: Voice-Driven E-Visa Assistant for the Visually Impaired")
    # Per-session state passed into and returned from every chat turn.
    history = gr.State([])
    photo_b64 = gr.State(None)
    collected = gr.State({})
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak to the assistant")
    
    output_text = gr.Textbox(label="Assistant", interactive=False)
    pdf_file = gr.File(label="Download Visa Application PDF")
    submit = gr.Button("Submit/Continue")
    # chat_with_openai returns five values, mapped in order onto `outputs`.
    submit.click(
        chat_with_openai,
        inputs=[audio_input, history, photo_b64, collected],
        outputs=[history, photo_b64, pdf_file, output_text, collected]
    )

demo.launch()



Device set to use mps:0


* Running on local URL:  http://127.0.0.1:7869
* To create a public link, set `share=True` in `launch()`.




In [19]:

import gradio as gr
import numpy as np
import soundfile as sf
from transformers import pipeline

# Build the speech-to-text pipeline from the "openai/whisper-base.en" checkpoint
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)

# Gradio callback: audio file path in, transcript (or error text) out
def transcribe(audio):
    """Return the Whisper transcript of `audio`, or an "Error..." message string."""
    if audio is not None:
        try:
            # Load the file up front so unreadable audio is reported from here.
            waveform, _rate = sf.read(audio)
            if not isinstance(waveform, np.ndarray):
                return "Error: Invalid audio format."
            transcription = asr_pipeline(audio)
            return transcription["text"]
        except Exception as err:
            return f"Error processing audio: {str(err)}"
    return "Error: No audio input received."

# Gradio interface: audio comes from the microphone or an uploaded file and is
# handed to transcribe() as a file path; the result is shown as text.
audio_input = gr.Audio(
    label="Speak or upload an audio file",
    type="filepath",
    sources=["microphone", "upload"],
)
interface = gr.Interface(transcribe, audio_input, "text")

# Start the app
interface.launch()


Device set to use mps:0


* Running on local URL:  http://127.0.0.1:7867
* To create a public link, set `share=True` in `launch()`.




In [20]:
import gradio as gr
import speech_recognition as sr

# Module-level SpeechRecognition recognizer; transcribe() below uses it for
# both reading the audio file and calling the recognition service.
recognizer = sr.Recognizer()

# Function to transcribe audio
def transcribe(audio):
    """Transcribe an audio file with Google Speech Recognition.

    Args:
        audio: Path to the audio file, or None.

    Returns:
        The recognized text, or a string starting with "Error" describing why
        transcription failed (errors are returned, not raised).
    """
    if audio is None:
        return "Error: No audio input received."

    try:
        # Load the audio file
        with sr.AudioFile(audio) as source:
            # No adjust_for_ambient_noise() here: it reads from the start of
            # the source, so that part of the clip would never reach record().
            audio_data = recognizer.record(source)  # Capture the entire audio file

        # Perform speech recognition using Google Speech Recognition
        return recognizer.recognize_google(audio_data)

    except sr.UnknownValueError:
        return "Error: Could not understand the audio."
    except sr.RequestError:
        return "Error: Could not request results from the speech recognition service."
    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Wire transcribe() into a Gradio interface: audio (microphone or upload,
# delivered as a file path) in, plain text out.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Speak or upload an audio file",
)
interface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs="text",
)

# Start serving the app
interface.launch()


* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


