In [None]:
!pip install sentence-transformers
!pip install mlflow
!pip install openai==0.28
!pip install fastapi
!pip install streamlit
!pip install pyngrok
!pip install transformers




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
import spacy
from sentence_transformers import SentenceTransformer, util

# Load spaCy model for Named Entity Recognition (NER)
nlp = spacy.load('en_core_web_sm')

# Load SentenceTransformer for semantic similarity
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Normalise free text before embedding: lemmatise and drop stop words / non-alphabetic tokens
def preprocess_text(text):
    """Return `text` reduced to a space-separated string of lemmas.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens that
    spaCy flags as stop words, or that are not purely alphabetic, are skipped;
    the lemma of every remaining token is kept in its original order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Cosine similarity between the sentence embeddings of a resume and a job description
def calculate_similarity(resume_text, jd_text):
    """Return the cosine similarity of the two texts as a Python float.

    Each text is encoded separately with the module-level `semantic_model`
    (tensor output), and the single similarity value is unwrapped with `.item()`.
    """
    embeddings = [
        semantic_model.encode(document, convert_to_tensor=True)
        for document in (resume_text, jd_text)
    ]
    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

# Inputs for the single-resume example.
# They are defined here so the cell runs on a fresh kernel: relying on names left
# behind by the ranking cell meant `resume` held a *file name* (the loop variable
# there), so the score compared the job description with a file name, not with
# the resume's content.
job_description = "Looking for a data scientist proficient in Python and deep learning."

# Same file and read settings that the ranking example uses for its resumes.
# NOTE(review): which resume this example should score is a guess -- adjust as needed.
with open('resume1.txt.pdf', 'r', encoding='latin-1') as resume_file:
    resume = resume_file.read()

# Preprocess and calculate similarity
processed_resume = preprocess_text(resume)
processed_jd = preprocess_text(job_description)
similarity_score = calculate_similarity(processed_resume, processed_jd)

print(f"Semantic Similarity Score: {similarity_score}")


Semantic Similarity Score: 0.10290748625993729


In [None]:
import joblib
# Hand-set features for the example resume (placeholders, not parsed from any file)
experience_years = 5  # Assume this is extracted from the resume
skills_match = 1      # Assume skills match is binary: 1 (match), 0 (no match)

# Load the mock training data
data = pd.read_csv("/content/large_mock_resume_data.csv")
df = pd.DataFrame(data)

# Feature matrix / target, followed by an 80/20 hold-out split
feature_columns = ['similarity', 'experience_years', 'skills_match']
X = df[feature_columns]
y = df['hired']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)

# Fit the random forest on the training portion
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Score it on the held-out portion
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# One-row frame for the example resume, using the same column names as training.
# `similarity_score` comes from the semantic-similarity cell.
example_values = (similarity_score, experience_years, skills_match)
new_resume_features = pd.DataFrame(
    {column: [value] for column, value in zip(feature_columns, example_values)}
)
print(new_resume_features)
predicted_hire = clf.predict(new_resume_features)
hire_label = 'Hired' if predicted_hire[0] == 1 else 'Not Hired'
print(f"Resume Hire Prediction: {hire_label}")

# Persist the fitted classifier; app.py loads this file
joblib.dump(clf, 'resume_ranking_model.pkl')

     similarity  experience_years  skills_match
29     0.523225                18             1
535    0.972383                17             1
695    0.805310                12             1
557    0.514391                 9             0
836    0.957423                18             0
..          ...               ...           ...
106    0.705191                13             1
270    0.904681                19             1
860    0.887764                14             1
435    0.850485                16             1
102    0.657178                 7             1

[800 rows x 3 columns]
Model Accuracy: 0.51
   similarity  experience_years  skills_match
0    0.102907                 5             1
Resume Hire Prediction: Not Hired


['resume_ranking_model.pkl']

In [None]:
import mlflow
import mlflow.sklearn

# Re-fit the classifier inside an MLflow run so the hold-out accuracy and the
# fitted model are recorded under the same run.
with mlflow.start_run():
    # `fit` returns the estimator itself, so construction and training can be chained
    clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Metric first, then the model artifact
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(clf, "resume_ranking_model")

    print(f"Logged Model Accuracy: {accuracy}")




Logged Model Accuracy: 0.51


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Fetch the pre-trained T5 weights and the matching tokenizer from Hugging Face
T5_CHECKPOINT = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)

# Ask the T5 model for a short piece of text about a resume / job-description pair
def generate_feedback(resume_text, jd_text):
    """Return the text T5 generates for a "compare: ... to job: ..." prompt.

    The prompt is built from both arguments, tokenized with the module-level
    `tokenizer`, passed to `model.generate` with `max_length=50`, and the first
    generated sequence is decoded with special tokens stripped.
    """
    prompt = f"compare: {resume_text} to job: {jd_text}"
    encoded = tokenizer(prompt, return_tensors='pt')
    generated = model.generate(encoded.input_ids, max_length=50)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Generate and print feedback for one resume / job-description pair.
# NOTE(review): this relies on `resume` and `job_description` already being in the
# kernel. In the ranking example `resume` is bound by a `for` loop to a resume
# *file name*, and the recorded output of this cell ("resume2.txt.pdf ...") is
# consistent with a file name being passed here -- confirm that resume text is
# what reaches generate_feedback.
feedback = generate_feedback(resume, job_description)
print(f"Feedback: {feedback}")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Feedback: : resume2.txt.pdf to: resume2.txt.pdf to job: Looking for a data scientist proficient in Python and deep learning.


In [None]:
import os

def rank_resumes(jd_text, resume_files):
    """Score every resume file against a job description and rank them.

    Args:
        jd_text: Raw job-description text.
        resume_files: Iterable of paths to resume files that can be read as text.

    Returns:
        A list of ``(resume_file, score)`` tuples sorted by descending semantic
        similarity between the preprocessed resume and the preprocessed job
        description.
    """
    # Normalise the job description once, with the same preprocessing applied to
    # the resumes, so both sides of the comparison are in the same form (the
    # single-resume example in this notebook also preprocesses both texts).
    processed_jd = preprocess_text(jd_text)

    rankings = []
    for resume_file in resume_files:
        # NOTE(review): the example inputs end in ".pdf" but are read as latin-1
        # text; if they are real PDFs this yields raw file bytes rather than
        # extracted text -- confirm the file format.
        with open(resume_file, 'r', encoding='latin-1') as file:
            resume_text = file.read()

        # The file is already closed here; scoring does not need the handle.
        processed_resume = preprocess_text(resume_text)
        score = calculate_similarity(processed_resume, processed_jd)
        rankings.append((resume_file, score))

    # Best match first
    rankings.sort(key=lambda item: item[1], reverse=True)
    return rankings

# Example Input/Output
# Rank three resume files against a one-sentence job description and print each
# file with its score in the order returned by rank_resumes (highest score first).
job_description = "Looking for a data scientist proficient in Python and deep learning."
resume_files = ['resume1.txt.pdf', 'resume2.txt.pdf', 'resume3.txt.pdf']
ranked_resumes = rank_resumes(job_description, resume_files)

# NOTE(review): `resume` and `score` stay bound after this loop. Other cells in this
# notebook read `resume` and `job_description`, so once this cell has run they see
# the last file *name* printed here, not resume text.
for resume, score in ranked_resumes:
    print(f"Resume: {resume}, Score: {score}")


Resume: resume3.txt.pdf, Score: 0.1084691733121872
Resume: resume1.txt.pdf, Score: 0.05133873224258423
Resume: resume2.txt.pdf, Score: 0.030292704701423645


In [None]:
import torch

# `model` is the T5ForConditionalGeneration object; write only its parameters
# (the state dictionary, not the whole module) to a .pth file.
t5_state = model.state_dict()
torch.save(t5_state, 'resume_ranking_model.pth')


In [None]:
!pip install ngrok
!pip install PyMuPDF  # For PDF processing



In [None]:
!ngrok config add-authtoken 2nQ3pFBFq91lamPkHw4T36dfCKO_6GK37BkB4QxSqLBqUUPvq

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
# Shut down anything left over from a previous run before opening a new tunnel
ngrok.kill()

# Expose Streamlit's default port (8501).
# pyngrok expects the target as `addr`, not `port`.
STREAMLIT_ADDR = '8501'
public_url = ngrok.connect(addr=STREAMLIT_ADDR)
print(f"Streamlit public URL: {public_url}")

Streamlit public URL: NgrokTunnel: "https://50f3-34-142-255-107.ngrok-free.app" -> "http://localhost:8501"


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  # PyMuPDF for PDF processing

# Streamlit re-executes this script on every user interaction. The loaders are
# wrapped in st.cache_resource so the T5 weights and the pickled classifier are
# loaded once per server process instead of on every rerun.
@st.cache_resource
def load_feedback_model():
    """Load and return the pre-trained T5 model and its tokenizer."""
    t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
    return t5_model, t5_tokenizer


@st.cache_resource
def load_ranking_model():
    """Load and return the Random Forest classifier saved by the notebook."""
    # NOTE: joblib.load unpickles the file -- only load model files you produced yourself.
    return joblib.load('resume_ranking_model.pkl')  # Adjust the path as necessary


model, tokenizer = load_feedback_model()
clf = load_ranking_model()

def predict_score(resume_features):
    """Predict the hiring score based on resume features.

    Args:
        resume_features: Sequence ``[similarity, experience_years, skills_match]``.

    Returns:
        The classifier's predicted probability for the "hired" class (label 1).
    """
    # The classifier was fitted on a DataFrame with these column names; passing a
    # frame with the same names keeps the features aligned by name and avoids
    # scikit-learn's "X does not have valid feature names" warning.
    feature_frame = pd.DataFrame(
        [resume_features],
        columns=['similarity', 'experience_years', 'skills_match'],
    )

    # predict_proba columns follow clf.classes_, so look up where label 1 sits
    # instead of assuming it is the second column.
    hired_index = list(clf.classes_).index(1)
    return clf.predict_proba(feature_frame)[0][hired_index]

def explain_ranking(resume_features):
    """Describe the three ranking features as one human-readable line.

    `resume_features` is indexed positionally: element 0 is reported as the
    similarity, element 1 as years of experience and element 2 as the skills match.
    """
    similarity = resume_features[0]
    experience = resume_features[1]
    skills = resume_features[2]
    parts = (
        f"Similarity: {similarity}",
        f"Experience: {experience} years",
        f"Skills Match: {skills}",
    )
    return ", ".join(parts)

def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF file.

    Args:
        pdf_file: Either a path to a PDF on disk or a file-like object with a
            ``read()`` method, such as the objects returned by
            ``st.file_uploader``.

    Returns:
        The text of all pages concatenated in page order.
    """
    if hasattr(pdf_file, "read"):
        # Uploads are in-memory file-like objects, not paths on disk, so PyMuPDF
        # has to be given the raw bytes and told the document type explicitly.
        document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        document = fitz.open(pdf_file)

    with document as doc:
        return "".join(page.get_text() for page in doc)

# Streamlit App
st.title('AI-Powered Resume Ranking System')

# Upload the job description and the CVs to rank against it
uploaded_jd = st.file_uploader("Upload Job Description (txt file)", type=["txt"])
uploaded_cvs = st.file_uploader("Upload CVs (pdf file)", type=["pdf"], accept_multiple_files=True)

if uploaded_jd and uploaded_cvs:
    # Read the job description
    jd_text = uploaded_jd.read().decode("utf-8")

    results = []

    # Process each PDF CV
    for uploaded_cv in uploaded_cvs:
        cv_text = extract_text_from_pdf(uploaded_cv)

        # TODO: derive these from cv_text / jd_text. They are still mock values,
        # so until then the scores (and the resulting order) are not meaningful.
        similarity = np.random.uniform(0.5, 1.0)  # Replace with actual similarity calculation
        experience_years = 5  # Replace with actual extracted value
        skills_match = 1  # Replace with actual extracted value

        # Create feature set
        resume_features = [similarity, experience_years, skills_match]

        # Predict the score
        score = predict_score(resume_features)

        # Generate feedback using T5
        input_text = f"Generate feedback for the resume with similarity {similarity}, experience {experience_years}, skills match {skills_match}."
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        outputs = model.generate(input_ids)
        feedback = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep the feedback with the score; it used to be computed and then dropped
        results.append((uploaded_cv.name, score, feedback))

    # Display results, highest score first so the table really is a ranking
    results_df = (
        pd.DataFrame(results, columns=["CV File Name", "Score", "Feedback"])
        .sort_values("Score", ascending=False)
        .reset_index(drop=True)
    )
    st.write("Ranking Results:")
    st.dataframe(results_df)

    # Allow download of results
    st.write("Download Ranking Results:")
    st.download_button("Download CSV", results_df.to_csv(index=False), "ranked_resumes.csv")


Overwriting app.py


In [None]:
# Start the Streamlit app in the background (`&`) and expose port 8501 through localtunnel.
# NOTE(review): an ngrok tunnel to the same port is opened in an earlier cell, and the
# recorded output of this cell ends with a localtunnel "connection refused" error --
# confirm which of the two tunnels is meant to be used.
!streamlit run app.py & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.142.255.107:8501[0m
[0m
your url is: https://real-parks-rule.loca.lt
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
/root/.npm/_npx/75ac80b86e83d4a2/node_modules/localtunnel/bin/lt.js:81
    throw err;
    ^

Error: connection refused: localtunnel.me:32209 (check your 