In [1]:
!pip install pdfplumber
!pip install docx
!pip install docx2txt
!pip install gradio
!pip install transformers
!pip install torch
!pip install accelerate
!pip install sentencepiece
!pip install torch
!pip install pypdf


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hDownloading

In [1]:
import gradio as gr
import pdfplumber
import docx2txt
import os
import re
import torch
import csv
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the model and tokenizer
model_name = "microsoft/Phi-3.5-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using pdfplumber."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file using docx2txt."""
    text = docx2txt.process(docx_path)
    return text

def extract_resume_info(text):
    """Extract key information from the resume text."""
    print("Extracting resume information...")
    text = text.lower()
    headers = {
        'name': 'NAME',
        'skills': 'SKILLS',
        'experience': 'EXPERIENCE',
        'education': 'EDUCATION'
    }
    first_line = text.split('\n', 1)[0].strip()
    name_match = re.match(r'^.*\s*name\s*[:\-]?\s*(.*)', first_line, re.IGNORECASE)
    name = name_match.group(1).strip() if name_match else first_line
    print(f"Extracted Name: {name}")

    for key, value in headers.items():
        text = re.sub(rf'\b{key}\b', value, text, flags=re.IGNORECASE)

    def extract_section(header):
        pattern = rf'\b{header}\b\s*[:\-]?\s*(.*?)(?=\n[A-Z]|$)'
        match = re.search(pattern, text, re.DOTALL)
        result = match.group(1).strip() if match else 'Not Found'
        print(f"Extracted {header}: {result}")
        return result

    skills = extract_section('SKILLS')
    experience = extract_section('EXPERIENCE')
    education = extract_section('EDUCATION')

    # Ensure experience is treated as an integer for comparison
    experience_present = 1 if experience != 'Not Found' else 0
    return name, skills, experience_present, education

def generate_response(prompt):
    """Generate a response using the language model."""
    print(f"Generating response for prompt: {prompt}")
    generation_args = {
        "max_new_tokens": 90,
        "return_full_text": True,
        "temperature": 0.7,
        "do_sample": True,
    }
    output = pipe(prompt, **generation_args)
    generated_text = output[0]['generated_text'].strip()

    # Find all questions that start with a number or with "Question:"
    questions = re.findall(r'(?:\d+\.|\b[qQ]uestion:).*?\?', generated_text)

    print(f"Generated Text: {generated_text}")

    # Return the last question if any, otherwise return a default message
    return questions[-1] if questions else "No relevant question generated."

def generate_response_for_df(df):
    """Generate model responses for each question in the DataFrame."""
    # List to store model responses
    model_responses = []
    
    for i, row in df.iterrows():
        question = row['question']
        print(f"Generating response for question: {question}")
        
        # Generate a response using the existing generate_response logic
        generation_args = {
            "max_new_tokens": 90,
            "return_full_text": True,
            "temperature": 0.7,
            "do_sample": True,
        }
        output = pipe(question, **generation_args)
        generated_text = output[0]['generated_text'].strip()
        
        # Directly use the generated response
        model_response = generated_text
        
        print(f"Model Response: {model_response}")
        model_responses.append(model_response)
    
    # Add the generated model responses to the DataFrame
    df['model_response'] = model_responses
    return df
def calculate_similarity(df):
    """Calculate similarity between user and model responses and return updated DataFrame."""
    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    
    similarity_scores = []
    
    for i, row in df.iterrows():
        user_response = row['user_response']
        model_response = row['model_response']
        
        # Compute TF-IDF for the user response and model response
        tfidf_matrix = vectorizer.fit_transform([user_response, model_response])
        
        # Calculate cosine similarity between user and model response
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        
        # Convert similarity to percentage
        similarity_percentage = round(similarity * 100, 2)
        
        print(f"Similarity score for row {i}: {similarity_percentage}%")
        similarity_scores.append(similarity_percentage)
    
    # Add the similarity scores to the DataFrame
    df['similarity'] = similarity_scores
    return df

def process_resume(file_path):
    """Process the resume to extract text and key information."""
    print(f"Processing resume file: {file_path}")
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == '.pdf':
        extracted_text = extract_text_from_pdf(file_path)
    elif file_extension == '.docx':
        extracted_text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format. Please upload a PDF or DOCX file.")
    print(f"Extracted Text: {extracted_text[:200]}...")  # Show a snippet of the extracted text
    return extracted_text

import pandas as pd

def run_interview(name, skills, experience, user_response=None, questions_and_answers=None, user=None, stage=0):
    """Conduct an interview based on the extracted resume data."""
    if questions_and_answers is None:
        questions_and_answers = []
    if user is None:
        user = []


    print(f"Stage: {stage}")
    print(f"User Response: {user_response}")
    print(questions_and_answers)
    print("User", user)

    if experience and skills:
        if stage == 0:
            print("step: 0")
            experience_question = "Can you describe a challenging project where you applied your experience?"
            questions_and_answers.append([experience_question, ""])
            print(f"Stage 0: Added experience question: {experience_question}")
            stage += 1

        elif stage == 1 and user_response:
            print("step: 1")
            user.append(user_response)
            print(user)
            prompt = f"You are a job interviewer. Please generate only 1 follow-up question starting with 'Question:' WITHOUT any additional text or answer...just the question.. based on this: {user_response}"
            follow_up_question = generate_response(prompt)
            questions_and_answers.append([follow_up_question, ""])
            print(f"Stage 1: Added follow-up question: {follow_up_question}")
            stage += 1

        elif stage == 2 and user_response:
            # Ensure questions_and_answers is empty at the beginning of stage 2
                print ("step2_1")
                user.append(user_response)
                prompt_for_questions = f"Generate 1 interview questions based on the following skills: {skills}. Please provide only the questions focused on the subject's bookish knowledge, with no headings or additional text."
                generated_question = generate_response(prompt_for_questions)
                questions_and_answers.append([generated_question,""])
                stage += 1
        elif stage == 3 and user_response:
                print ("step3")
                user.append(user_response)
                prompt_for_questions = f"Generate 1 interview questions based on the following skills: {skills} not this question {questions_and_answers[-1][0]}. Please provide only the questions focused on the subject's bookish knowledge, with no headings or additional text."
                generated_question = generate_response(prompt_for_questions)
                questions_and_answers.append([generated_question,""])
                stage += 1
        elif stage == 4 and user_response:
                print ("step4")
                user.append(user_response)
                prompt_for_questions = f"Generate 1 interview questions based on the following skills: {skills} not this question {questions_and_answers[-1][0]}. Please provide only the questions focused on the subject's bookish knowledge, with no headings or additional text."
                generated_question = generate_response(prompt_for_questions)
                questions_and_answers.append([generated_question,""])
                stage += 1
        elif stage == 5 and user_response:
                print ("step5")
                user.append(user_response)
                prompt_for_questions = f"Generate 1 interview questions based on the following skills: {skills} not this question {questions_and_answers[-1][0]}. Please provide only the questions focused on the subject's bookish knowledge, with no headings or additional text."
                generated_question = generate_response(prompt_for_questions)
                questions_and_answers.append([generated_question,""])
                stage += 1

        elif stage == 6 and user_response:
            print("Stage 6: Adding user responses and updating stage...")
            user.append(user_response)
            print("user response1", user)
            print("q nd a", questions_and_answers)
            print("user response2", user)
            questions_and_answers = [q[0] for q in questions_and_answers]

            # Create DataFrame
            df = pd.DataFrame({
                'question': questions_and_answers,
                'user_response': user
            })
            # df = pd.DataFrame(questions_and_answers, columns=["Question", "User_Response"])
            df = df.drop(index=[0, 1])
            df= generate_response_for_df(df)
            df=calculate_similarity(df)
            df.to_csv("skills_interview_results.csv", index=False)
            print(f"DataFrame saved to CSV.")
            questions_and_answers.append(["Ok, got it... Thanks for taking the interview!! Have a great day!! ", ""])
            # print("Exitting")
            # sys.exit("Stopping the script")
            stage += 1

        # elif stage == 7:
        #     print("Stage 7: Saving skills-related Q&A to CSV...")
        #     df = pd.DataFrame(questions_and_answers, columns=["Question", "User Response"])
        #     df.to_csv("skills_interview_results.csv", index=False)
        #     print(f"DataFrame saved to CSV.")

        #     # Resetting the state
        #     questions_and_answers = []
        #     print("Exitting")
        #     sys.exit("Stopping the script")
        #     stage += 1

    # Return only the most recent question for display and the updated stage
    recent_question = questions_and_answers[-1][0] if questions_and_answers else ""
    print(f"Returning recent question: {recent_question}")
    return recent_question, stage, questions_and_answers,user


def upload_resume(file_path):
    """Upload and process the resume to extract key information."""
    print(f"Uploading resume file: {file_path}")
    resume_text = process_resume(file_path)
    name, skills, experience, education = extract_resume_info(resume_text)
    return name, skills, experience, education, resume_text

def interview_interface(name, skills, experience, user_response=None,stage=0, questions_and_answers=None, user= None):
    recent_question, new_stage, updated_questions_and_answers, updated_user = run_interview(name, skills, experience, user_response, questions_and_answers, user, stage)
    return recent_question, new_stage, updated_questions_and_answers, updated_user

def start_interview_from_file(file):
    """Extract resume info and automatically start the interview."""
    name, skills, experience, education, _ = upload_resume(file.name)
    return interview_interface(name, skills, experience, stage=0, questions_and_answers=[], user=[])

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Upload Resume"):
            resume_file = gr.File(label="Upload your resume (PDF or DOCX):")
            name_output = gr.Textbox(label="Name", interactive=False)
            skills_output = gr.Textbox(label="Skills", interactive=False)
            experience_output = gr.Textbox(label="Experience", interactive=False)
            education_output = gr.Textbox(label="Education", interactive=False)
            process_button = gr.Button("Process Resume")
            process_button.click(upload_resume, inputs=resume_file, outputs=[name_output, skills_output, experience_output, education_output])

        with gr.TabItem("Interview"):
            recent_question_output = gr.Textbox(label="Colloquium says", interactive=False)
            user_response_input = gr.Textbox(label="What do you think")
            submit_response_button = gr.Button("Answer now")

            # Initialize interview state and questions list
            stage = gr.State(value=0)
            questions_and_answers = gr.State(value=[])
            user = gr.State(value=[])
            # Automatically start interview when the resume is processed
            resume_file.change(start_interview_from_file, inputs=resume_file, outputs=[recent_question_output, stage, questions_and_answers, user])

            # Capture user response, generate model response, and update stage
            submit_response_button.click(
                interview_interface,
                inputs=[name_output, skills_output, experience_output, user_response_input, stage, questions_and_answers, user],
                outputs=[recent_question_output, stage, questions_and_answers, user]
            )

demo.launch(debug ='True', share='True')


2024-09-08 22:05:18.260437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-08 22:05:18.273089: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-08 22:05:18.276911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-08 22:05:18.286459: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://e38af6ca49e6b18dea.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Uploading resume file: /tmp/gradio/19dda12e8ee13f1ac7412d03d23e363940269eb2/resume.pdf
Processing resume file: /tmp/gradio/19dda12e8ee13f1ac7412d03d23e363940269eb2/resume.pdf
Extracted Text: Kumar Pallav
   
EDUCATION
Indian Institute of Technology, Bombay
Mumbai, India
Bachelor of Computer Science and Engineering (with Hons.); Jun 2010 - May
2014
CGPA 8.6/10
EXPERIENCE
Microsoft
Redm...
Extracting resume information...
Extracted Name: kumar pallav
Extracted SKILLS: languages
c++ c\# javascript java c
technologies
nodejs uwp win32
 
Extracted EXPERIENCE: microsoft
redmond, wa
software engineer, onenote jun 2016 - present
modern hierarchy sync c++ nodejs
worked on a new sync platform which would load notebook structure
and hierarchy faster than the current state of the art. implemented client
side representation of this hierarchy, derived from existing
representation. added functionality to be able use the new
representation without the need of resetting cache or reloading the
not

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Stage: 1
User Response: dealing with missing values
[['Can you describe a challenging project where you applied your experience?', '']]
User []
step: 1
['dealing with missing values']
Generating response for prompt: You are a job interviewer. Please generate only 1 follow-up question starting with 'Question:' WITHOUT any additional text or answer...just the question.. based on this: dealing with missing values
Generated Text: You are a job interviewer. Please generate only 1 follow-up question starting with 'Question:' WITHOUT any additional text or answer...just the question.. based on this: dealing with missing values in dataset

Question: How do you typically handle missing values in a dataset during your analysis process?


### Answer:
What method do you prefer for imputing missing data in your datasets, and can you explain the rationale behind your choice?
Stage 1: Added follow-up question: Question: How do you typically handle missing values in a dataset during your analysis proc

ERROR:    Traceback (most recent call last):
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/base_events.py", line 641, in run_until_complete
    self.run_forever()
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
    self._run_once()
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
    handle._run()
  File "/root/miniconda3/envs/abhishek/lib/python3.11/asyncio/events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
  File "/home/sayantan/.local/lib/python3.11/site-packages/gradio/queueing.py", line 541, in proces

Model Response: 2. How do you handle exceptions and error handling when using async/await in C++, C#, and Java?

# Answer

When using async/await in C++, C#, and Java, handling exceptions and errors is critical to ensure robustness and proper error handling in asynchronous operations. Here's how you can handle exceptions and errors in each of these languages:

### C++

C++ doesn't have built-in support for async/await like C# or Java, but you can achieve similar behavior using libraries like
Similarity score for row 2: 35.84%
Similarity score for row 3: 24.61%
Similarity score for row 4: 23.15%
Similarity score for row 5: 37.85%
DataFrame saved to CSV.
Exitting
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://e38af6ca49e6b18dea.gradio.live


