In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


##Preprocess Function

In [None]:
import re
import spacy

# spacy NLP model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    text = re.sub(r'\n|\r', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text


In [None]:
def extract_skills(text, skills_list):
    tokens = text.split()
    found_skills = set()
    for skill in skills_list:
        if skill in text:
            found_skills.add(skill)
    return list(found_skills)



##Data

In [None]:
job_df=pd.read_csv('/content/job_title_des.csv')

In [None]:
job_df

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...
...,...,...,...
2272,2399,Backend Developer,Job Summary\nPublished on : 26 days ago\nVacan...
2273,2400,Full Stack Developer,business entity cisco umbrella focus cloud-bas...
2274,2401,Network Administrator,Urgently reqd in a college in Mohali\nNetwork ...
2275,2402,Machine Learning,Key Responsibilities: Team leads for small or ...


In [None]:
job_df['Job Title'].value_counts()

Unnamed: 0_level_0,count
Job Title,Unnamed: 1_level_1
JavaScript Developer,166
Java Developer,161
Software Engineer,160
Node js developer,160
iOS Developer,159
PHP Developer,156
Flutter Developer,155
DevOps Engineer,155
Django Developer,152
Machine Learning,152


In [None]:
res_df=pd.read_csv('/content/final_merged_dataset2.csv')
res_df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Python_Developer,1278
Java_Developer,1242
Web_Developer,984
Database_Administrator,920
Security_Analyst,876
Systems_Administrator,776
Project_manager,722
Front_End_Developer,530
Network_Administrator,469
Software_Developer,437


In [None]:
res_df

Unnamed: 0,Resume,Category
0,Python Developer Python Developer Philadelphia...,Python_Developer
1,Python Developer Python Developer Python Devel...,Python_Developer
2,R&D Engineer R&D Engineer R&D Engineer - Nokia...,Python_Developer
3,Sr. Full Stack Developer Sr. Full Stack Develo...,Python_Developer
4,Sr. Full Stack Python Developer Sr. Full Stack...,Python_Developer
...,...,...
8229,Oracle database administrator Oracle database ...,Database_Administrator
8230,"Job Seeker Columbus, OH I am responsible and d...",Database_Administrator
8231,Database Administrator Database Administrator ...,Database_Administrator
8232,Business Intelligence Engineer Business Intell...,Database_Administrator


In [None]:

job_df['cleaned_description'] = job_df['Job Description'].apply(preprocess_text)

In [None]:
res_df['cleaned_resume']=res_df['Resume'].apply(preprocess_text)

##Skills extraction

In [None]:
def extract_skills_spacy(text):
    text = preprocess_text(text)
    doc = nlp(text)

    skill_keywords = ['experience', 'proficient', 'knowledge', 'skilled', 'expertise', 'familiar', 'working with']

    skill_phrases = set()

    for sent in doc.sents:
        if any(keyword in sent.text.lower() for keyword in skill_keywords):
            for chunk in sent.noun_chunks:
                chunk_text = chunk.text.strip().lower()
                if len(chunk_text.split()) <= 4 and not chunk_text.isdigit():
                    skill_phrases.add(chunk_text)

    return sorted(skill_phrases)


In [None]:
skills_list = [
    # Technical Skills
    'sql', 'r', 'python programming','statistical procedures', 'statistical analysis','tensorflow', 'pytorch', 'scikit-learn', 'machine learning algorithms','tableau', 'power bi', 'matplotlib', 'data visualization','data wrangling', 'data transformation', 'data cleaning','big data', 'spark', 'hadoop','mongodb', 'postgresql', 'mysql', 'database management systems','cloud computing', 'google cloud', 'azure', 'amazon web services','spss', 'sas', 'software statistics','calculus', 'probability', 'linear algebra','information extraction', 'data mining','nlp', 'text analysis', 'analysis of textual data','neural networks', 'deep learning','time management','git', 'version control','analytical thinking', 'problem solving','programming languages', 'data structures and algorithms','database management', 'software development lifecycle','operating systems', 'networking', 'cybersecurity','ai', 'data analysis', 'web development','mobile application development', 'technical writing','communication', 'presenting research results','project management', 'ethics', 'data privacy','teamwork', 'collaboration', 'cooperation','continuous learning', 'lifelong learning','adaptability', 'creativity', 'attention to detail','leadership'
]


In [None]:

job_df['extracted_skills_spacy'] = job_df['cleaned_description'].apply(extract_skills_spacy)


In [None]:
job_df['extracted_skills'] = job_df['cleaned_description'].apply(lambda x: extract_skills(x, skills_list))

In [None]:
res_df['res_extracted_skills']=res_df['cleaned_resume'].apply(lambda x: extract_skills(x, skills_list))

##Testing

In [None]:
job_df['extracted_skills'][145]

['r', 'sas', 'ai', 'communication']

In [None]:
res_df['res_extracted_skills'][2495]

['operating systems', 'r', 'ai', 'sql', 'spss', 'communication']

In [None]:
job_df['Job Title'].unique()

array(['Flutter Developer', 'Django Developer', 'Machine Learning',
       'iOS Developer', 'Full Stack Developer', 'Java Developer',
       'JavaScript Developer', 'DevOps Engineer', 'Software Engineer',
       'Database Administrator', 'Wordpress Developer', 'PHP Developer',
       'Backend Developer', 'Network Administrator', 'Node js developer'],
      dtype=object)

In [None]:
res_df['Category'].unique()

array(['Python_Developer', 'Java_Developer', 'Front_End_Developer',
       'Network_Administrator', 'Project_manager', 'Security_Analyst',
       'Software_Developer', 'Systems_Administrator', 'Web_Developer',
       'Database_Administrator'], dtype=object)

In [None]:
res_df['Category'] = res_df['Category'].str.lower().str.strip()
job_df['Job Title'] = job_df['Job Title'].str.lower().str.strip()

In [None]:
role_mapping = {
    'python_developer': 'django developer',
    'java_developer': 'java developer',
    'front_end_developer': 'javascript developer',
    'network_administrator': 'network administrator',
    'project_manager': 'devops engineer',  # Best approximate match
    'security_analyst': 'devops engineer',  # Approximate match
    'software_developer': 'software engineer',
    'systems_administrator': 'devops engineer',  # Approximate match
    'web_developer': 'full stack developer',
    'database_administrator': 'database administrator'
}

In [None]:
res_df['role'] = res_df['Category'].map(role_mapping)

In [None]:
res_df['role'].value_counts()

Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
devops engineer,2374
django developer,1278
java developer,1242
full stack developer,984
database administrator,920
javascript developer,530
network administrator,469
software engineer,437


In [None]:
job_df.drop(columns=['extracted_skills_spacy'],inplace=True)
job_df

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,cleaned_description,extracted_skills
0,0,flutter developer,We are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...,[r]
1,1,django developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,pythondjango developerlead job codepdj 04 st...,"[r, ai, sql, communication, attention to detail]"
2,2,machine learning,"Data Scientist (Contractor)\n\nBangalore, IN\n...",data scientist contractor bangalore in respo...,"[pytorch, big data, r, tensorflow, ai, deep le..."
3,3,ios developer,JOB DESCRIPTION:\n\nStrong framework outside o...,job description strong framework outside of i...,"[r, networking, ai]"
4,4,full stack developer,job responsibility full stack engineer – react...,job responsibility full stack engineer react ...,"[networking, r, git, ai, sas, communication, w..."
...,...,...,...,...,...
2272,2399,backend developer,Job Summary\nPublished on : 26 days ago\nVacan...,job summary published on 26 days ago vacancy ...,"[mysql, r, ai, sql, communication, postgresql,..."
2273,2400,full stack developer,business entity cisco umbrella focus cloud-bas...,business entity cisco umbrella focus cloudbase...,"[r, communication, ai]"
2274,2401,network administrator,Urgently reqd in a college in Mohali\nNetwork ...,urgently reqd in a college in mohali network a...,"[r, networking]"
2275,2402,machine learning,Key Responsibilities: Team leads for small or ...,key responsibilities team leads for small or m...,"[r, git, ai, problem solving]"


##Merging the data

In [None]:
unique_roles = job_df['Job Title'].unique()


In [None]:
job_df_dict = {}
resume_df_dict = {}

for role in unique_roles:
    role_key = role.lower().strip().replace(" ", "_")  # normalize the key
    job_df_dict[role_key] = job_df[job_df['Job Title'].str.lower() == role.lower()]
    resume_df_dict[role_key] = res_df[res_df['role'].str.lower() == role.lower()]


In [None]:
res_df['role'].value_counts()


Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
devops engineer,2374
django developer,1278
java developer,1242
full stack developer,984
database administrator,920
javascript developer,530
network administrator,469
software engineer,437


In [None]:
# job_df_dict['java_developer']
resume_df_dict['full_stack_developer']
# res_df

Unnamed: 0,Resume,Category,cleaned_resume,res_extracted_skills,role
6330,Email Campaign Developer Email Campaign Develo...,web_developer,email campaign developer email campaign develo...,"[r, mysql, ai, sql]",full stack developer
6331,Front End Web Developer Front End Web Develope...,web_developer,front end web developer front end web develope...,"[mongodb, r, git, sql, ai, mysql, web developm...",full stack developer
6332,Full-Stack Web Developer Full-Stack Web Develo...,web_developer,fullstack web developer fullstack web develope...,"[mysql, mongodb, r, git, sql, ai, sas, postgre...",full stack developer
6333,Senior Full Stack Web Developer Senior Full St...,web_developer,senior full stack web developer senior full st...,"[r, git, ai, sql, communication, mysql, data a...",full stack developer
6334,Contractor Contractor Contractor - FUJIFILM Ho...,web_developer,contractor contractor contractor fujifilm hol...,"[r, mysql, ai, sql]",full stack developer
...,...,...,...,...,...
7309,Web Developer Web Developer Web- Developer Ced...,web_developer,web developer web developer web developer ceda...,"[mongodb, r, git, sql, ai, sas]",full stack developer
7310,Web Developer Web Developer Web Developer and ...,web_developer,web developer web developer web developer and ...,"[networking, r, git, sql, communication, mysql...",full stack developer
7311,WEB DEVELOPER WEB DEVELOPER WEB DEVELOPER Rale...,web_developer,web developer web developer web developer rale...,"[leadership, r, git, ai, programming languages...",full stack developer
7312,Freelance Web Developer / Teacher Freelance We...,web_developer,freelance web developer teacher freelance web...,"[mongodb, r, git, sql, sas, data structures an...",full stack developer


In [None]:
final_role_df_dict = {}

# Get unique roles from resume_df
unique_roles = res_df['role'].unique()

for role in unique_roles:
    role_key = role.lower().strip().replace(" ", "_")

    # Get resumes and skills for the role
    resume_subset = res_df[res_df['role'].str.lower() == role.lower()][['role', 'cleaned_resume', 'res_extracted_skills']].copy()
    resumes = resume_subset['cleaned_resume'].tolist()
    skills = resume_subset['res_extracted_skills'].tolist()

    # Get job descriptions for the role
    jd_subset = job_df[job_df['Job Title'].str.lower() == role.lower()][['cleaned_description']].copy()
    job_descriptions = jd_subset['cleaned_description'].tolist()

    if not job_descriptions:
        continue

    # Repeat JDs to match resume count
    repeated_jds = (job_descriptions * ((len(resumes) // len(job_descriptions)) + 1))[:len(resumes)]

    # Create final DataFrame
    combined_df = pd.DataFrame({
        'role': [role] * len(resumes),
        'job_description': repeated_jds,
        'resume': resumes,
        'skills': skills
    })

    final_role_df_dict[role_key] = combined_df


In [None]:
final_role_df_dict['java_developer']


Unnamed: 0,role,job_description,resume,skills
0,java developer,software developer integration immediate open...,front end software trainee front end software ...,"[r, problem solving, ai, sql]"
1,java developer,overview western national is seeking a highly...,front end software trainee front end software ...,"[r, problem solving, ai, sql]"
2,java developer,software developer who we are the american ph...,java developer java developer java developer ...,"[operating systems, r, git, sql, ai, programmi..."
3,java developer,we are seeking talented and creative individua...,java developer java developer java developer m...,"[problem solving, networking, r, git, sql, ai,..."
4,java developer,java developer summary description _temporari...,c java android trainer cum developer c java an...,"[r, networking, ai]"
...,...,...,...,...
1237,java developer,providence st joseph health is calling a softw...,java microservices developer java microservice...,"[mysql, amazon web services, project managemen..."
1238,java developer,about us as the global leader in agile market...,job seeker over 5 years of professional exper...,"[mongodb, time management, operating systems, ..."
1239,java developer,jpmorgan chase co one of the oldest financial...,full stack java developer full stack java deve...,"[hadoop, r, ai, sql]"
1240,java developer,techvolt software offers java development inte...,java developer angular developer java develop...,"[r, mysql, ai, sql]"


##Preparing the training data

In [None]:
import random
import pandas as pd

def generate_questions_from_skills(skills, role):
    if not isinstance(skills, list):
        return "Tell me about your experience relevant to this role."

    questions = []

    for skill in skills:
        skill = skill.strip()
        questions.append(f"How have you used {skill} in your projects?")
        questions.append(f"What challenges did you face while working with {skill}?")

    questions.append(f"What makes you a good fit for the {role} role?")

    selected = random.sample(questions, min(3, len(questions)))
    return "\n".join(selected)


In [None]:

all_data = []
t=1
for role_key, df in final_role_df_dict.items():
    # Prepare input_text for each role-based dataframe
    df['input_text'] = df.apply(
        lambda row: f"Role: {row['role']}\nSkills: {', '.join(row['skills']) if isinstance(row['skills'], list) else row['skills']}\nResume: {row['resume']}\nJD: {row['job_description']}",
        axis=1
    )
    t=t+1
    print(t)

    # Generate dynamic interview questions based on skills and role
    df['target_text'] = df.apply(
        lambda row: generate_questions_from_resume(row['resume'],row['skills'],1),
        # lambda row: generate_questions_from_skills(row['skills'], row['role']),
        axis=1
    )

    # Append this dataframe to the list

    all_data.append(df[['input_text', 'target_text']])

# Combine all role-based dataframes into one training DataFrame
train_df = pd.concat(all_data, ignore_index=True)

train_df.head()


2


KeyboardInterrupt: 

In [None]:
# Save to a specific path in your Drive
df.to_csv('/content/drive/MyDrive/interview_question_data.csv', index=False)


In [None]:
train_df.to_csv('train_data.csv', index=False)  # Replace df with your DataFrame


##Build a Model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizers
input_tokenizer = Tokenizer(oov_token="<OOV>")
output_tokenizer = Tokenizer(oov_token="<OOV>")

input_tokenizer.fit_on_texts(train_df['input_text'])
output_tokenizer.fit_on_texts(train_df['target_text'])

# Sequences
input_seq = input_tokenizer.texts_to_sequences(train_df['input_text'])
output_seq = output_tokenizer.texts_to_sequences(train_df['target_text'])

# Padding
max_input_len = max(len(seq) for seq in input_seq)
max_output_len = max(len(seq) for seq in output_seq)

input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')
output_seq = pad_sequences(output_seq, maxlen=max_output_len, padding='post')


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Input tokenizer
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_df['input_text'])
input_sequences = input_tokenizer.texts_to_sequences(train_df['input_text'])
input_padded = pad_sequences(input_sequences, padding='post')
max_input_len = input_padded.shape[1]

# Output tokenizer
output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(train_df['target_text'])
output_sequences = output_tokenizer.texts_to_sequences(train_df['target_text'])
output_padded = pad_sequences(output_sequences, padding='post')
max_output_len = output_padded.shape[1]

vocab_size_in = len(input_tokenizer.word_index) + 1
vocab_size_out = len(output_tokenizer.word_index) + 1


In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

decoder_input_data = output_padded[:, :-1]
decoder_target_data = output_padded[:, 1:]
decoder_target_one_hot = to_categorical(decoder_target_data, num_classes=vocab_size_out)


In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention
# from tensorflow.keras.models import Model

# from keras.layers import Concatenate, Lambda



# # Define parameters
# vocab_size_in = len(input_tokenizer.word_index) + 1
# vocab_size_out = len(output_tokenizer.word_index) + 1
# embedding_dim = 256
# lstm_units = 512

# # Encoder
# encoder_inputs = Input(shape=(max_input_len,))
# enc_emb = Embedding(vocab_size_in, embedding_dim)(encoder_inputs)
# encoder_lstm = LSTM(lstm_units, return_state=True)
# encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# # Decoder
# decoder_inputs = Input(shape=(max_output_len,))
# dec_emb = Embedding(vocab_size_out, embedding_dim)(decoder_inputs)
# decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# encoder_outputs_expanded = Lambda(lambda x: tf.expand_dims(x, 1))(encoder_outputs)

# # Apply attention mechanism
# attention_layer = Attention()
# context_vector = attention_layer([decoder_outputs, encoder_outputs_expanded])

# # Concatenate decoder output with attention context
# concat = Concatenate(axis=-1)([decoder_outputs, context_vector])

# # Attention
# # attention = Attention()
# # context_vector = attention([decoder_outputs, tf.expand_dims(encoder_outputs, 1)])
# # concat = tf.concat([decoder_outputs, context_vector], axis=-1)

# # Output layer
# output = Dense(vocab_size_out, activation='softmax')(concat)


In [None]:
# model = Model([encoder_inputs, decoder_inputs], output)
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# model.summary()

In [None]:
# import numpy as np

# # === Prepare decoder input and target sequences ===
# seq_len = output_seq.shape[1]

# decoder_input_data = output_seq[:, :-1]
# decoder_target_data = output_seq[:, 1:]

# # === Train the model ===
# model.fit(
#     [input_seq, decoder_input_data],
#     tf.keras.utils.to_categorical(decoder_target_data, num_classes=vocab_size_out),  # one-hot encode
#     batch_size=32,
#     epochs=20,
#     validation_split=0.2
# )


In [None]:
# print("input_seq shape:", input_seq.shape)
# print("decoder_input_data shape:", decoder_input_data.shape)
# print("decoder_target_data shape:", decoder_target_data.shape)


In [None]:
# # decoder_inputs = Input(shape=(32,))

# # decoder_input_data = output_seq[:, :-1]
# # decoder_target_data = output_seq[:, 1:]


# # Original output_seq.shape = (batch_size, seq_len)
# seq_len = output_seq.shape[1]

# # decoder_input_data = output_seq[:, :-1]   # (batch_size, seq_len-1)
# decoder_input_data = np.pad(decoder_input_data, ((0, 0), (0, 1)), mode='constant')
# decoder_target_data = output_seq[:, 1:]   # (batch_size, seq_len-1)

# # Then define input layers like this:
# decoder_inputs = Input(shape=(seq_len - 1,))

# model.fit([input_seq, decoder_input_data],
#           np.expand_dims(decoder_target_data, -1),
#           batch_size=32,
#           epochs=20,
#           validation_split=0.2)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention, Concatenate, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical


In [None]:

# Sequence Parameters
vocab_size_in = len(input_tokenizer.word_index) + 1
vocab_size_out = len(output_tokenizer.word_index) + 1
embedding_dim = 256
lstm_units = 512

max_input_len = input_seq.shape[1]
full_seq_len = output_seq.shape[1]

decoder_input_data = output_seq[:, :-1]
decoder_target_data = output_seq[:, 1:]
max_output_len = decoder_input_data.shape[1]

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
enc_emb = Embedding(input_dim=vocab_size_in, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder
decoder_inputs = Input(shape=(max_output_len,))
dec_emb = Embedding(input_dim=vocab_size_out, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention
# encoder_outputs_expanded = Lambda(lambda x: tf.expand_dims(x, 1))(encoder_outputs)

def expand_dims_fn(x):
    return tf.expand_dims(x, axis=1)

encoder_outputs_expanded = Lambda(
    expand_dims_fn,
    output_shape=(1, lstm_units),
    name="expand_encoder"
)(encoder_outputs)
attention_layer = Attention()
context_vector = attention_layer([decoder_outputs, encoder_outputs_expanded])

# Concatenate attention context with decoder output
concat = Concatenate(axis=-1)([decoder_outputs, context_vector])

# Output Layer
output = Dense(vocab_size_out, activation='softmax')(concat)


In [None]:

# Compile Model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:

# One-hot encode target data
decoder_target_one_hot = to_categorical(decoder_target_data, num_classes=vocab_size_out)

# Training the model
model.fit(
    [input_seq, decoder_input_data],
    decoder_target_one_hot,
    batch_size=32,
    epochs=4,
    validation_split=0.2
)


Epoch 1/4




[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 983ms/step - accuracy: 0.6334 - loss: 1.6196 - val_accuracy: 0.8771 - val_loss: 0.4221
Epoch 2/4
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 1s/step - accuracy: 0.8837 - loss: 0.3625 - val_accuracy: 0.8804 - val_loss: 0.4099
Epoch 3/4
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 1s/step - accuracy: 0.8867 - loss: 0.3482 - val_accuracy: 0.8799 - val_loss: 0.4071
Epoch 4/4
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 1s/step - accuracy: 0.8874 - loss: 0.3427 - val_accuracy: 0.8785 - val_loss: 0.4139


<keras.src.callbacks.history.History at 0x796a73358b10>

In [None]:
save_path = '/content/drive/MyDrive/interview_model.keras'


In [None]:
model.save(save_path)


##Load the model

In [None]:
from keras.models import load_model

# Re-define this function since it's used inside the model (important for deserialization)
def expand_dims_fn(x):
    return tf.expand_dims(x, axis=1)

# Re-mount Drive and load
drive.mount('/content/drive')

# Load the model using the same function
loaded_model = load_model('/content/drive/shared-with-me/interview_model.keras',
                          custom_objects={'expand_dims_fn': expand_dims_fn})


In [None]:
loaded_model.summary()

##Model testing

In [None]:
def preprocess_input(text, tokenizer, max_len):
    sequence = tokenizer.texts_to_sequences([text])
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_len, padding='post')
    return padded


In [None]:
def generate_question(model, resume_text, input_tokenizer, output_tokenizer, max_input_len,skills_list):
    reverse_output_tokenizer = {v: k for k, v in output_tokenizer.word_index.items()}
    reverse_output_tokenizer[0] = ''

    # 1. Extract skills
    skills = extract_skills(resume_text,skills_list)
    if not skills:
        skills = ["general technical skills"]  # fallback if nothing is found

    # 2. Create a new prompt focused on skills
    skill_prompt = "generate interview questions for these skills: " + ", ".join(skills)

    # 3. Preprocess input
    input_seq = preprocess_input(skill_prompt, input_tokenizer, max_input_len)

    # Decoder init
    start_token = output_tokenizer.word_index.get('startseq', 1)
    end_token = output_tokenizer.word_index.get('endseq', 2)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    decoded_sentence = []

    for _ in range(50):  # Max question length
        output_tokens = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_output_tokenizer.get(sampled_token_index, '')

        if sampled_word == 'endseq' or sampled_word == '':
            break

        decoded_sentence.append(sampled_word)
        target_seq = np.append(target_seq, [[sampled_token_index]], axis=1)

    return ' '.join(decoded_sentence)


In [None]:
def generate_question(resume_text, max_len=30):
    resume_text = preprocess_text(resume_text)
    seq = input_tokenizer.texts_to_sequences([resume_text])
    seq = pad_sequences(seq, maxlen=max_input_len, padding='post')

    decoder_seq = np.zeros((1, max_output_len - 1))
    decoder_seq[0, 0] = output_tokenizer.word_index['<start>']

    for i in range(1, max_output_len - 1):
        output_tokens = model.predict([seq, decoder_seq])
        sampled_token_index = np.argmax(output_tokens[0, i-1, :])
        if sampled_token_index == output_tokenizer.word_index.get('<end>', -1):
            break
        decoder_seq[0, i] = sampled_token_index

    output_text = output_tokenizer.sequences_to_texts(decoder_seq.astype(int))[0]
    return output_text.replace('<start>', '').replace('<end>', '').strip()

# Example:
print(generate_question("Skilled in Python, TensorFlow, data wrangling, and project management"))


In [None]:
# def generate_question(model, resume_text, input_tokenizer, output_tokenizer, max_input_len):
#     reverse_output_tokenizer = {v: k for k, v in output_tokenizer.word_index.items()}
#     reverse_output_tokenizer[0] = ''

#     # Preprocess input
#     input_seq = preprocess_input(resume_text, input_tokenizer, max_input_len)

#     # Start token for decoder
#     start_token = output_tokenizer.word_index.get('startseq', 1)
#     end_token = output_tokenizer.word_index.get('endseq', 2)

#     target_seq = np.zeros((1, 1))
#     target_seq[0, 0] = start_token

#     decoded_sentence = []

#     for _ in range(50):  # Max question length
#         output_tokens = model.predict([input_seq, target_seq], verbose=0)
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_word = reverse_output_tokenizer.get(sampled_token_index, '')

#         if sampled_word == 'endseq' or sampled_word == '':
#             break

#         decoded_sentence.append(sampled_word)

#         # Update target_seq (append this word)
#         target_seq = np.append(target_seq, [[sampled_token_index]], axis=1)

#     return ' '.join(decoded_sentence)


In [None]:
resume_text = """
Skilled in Python, TensorFlow, and machine learning. Experienced with data analysis,
model training, and building deep learning pipelines. Proficient in SQL and data wrangling.
"""

question = generate_question(
    loaded_model,
    resume_text,
    input_tokenizer,
    output_tokenizer,
    max_input_len
)

print("Generated Interview Question:")
print(question)


In [None]:
# Save the model in the modern Keras format
model.save('/content/drive/MyDrive/interview_model.keras')


In [None]:
train_df

Unnamed: 0,input_text,target_text
0,Role: django developer\nSkills: r\nResume: pyt...,How have you used r in your projects?\nWhat ch...
1,"Role: django developer\nSkills: tableau, ai, g...",How have you used ai in your projects?\nHow ha...
2,"Role: django developer\nSkills: ai, sql, r, op...",What challenges did you face while working wit...
3,"Role: django developer\nSkills: mongodb, sas, ...",What challenges did you face while working wit...
4,Role: django developer\nSkills: cloud computin...,How have you used python programming in your p...
...,...,...
8229,"Role: database administrator\nSkills: sql, sas...",How have you used sas in your projects?\nWhat ...
8230,Role: database administrator\nSkills: communic...,How have you used communication in your projec...
8231,"Role: database administrator\nSkills: sas, ai,...",What challenges did you face while working wit...
8232,"Role: database administrator\nSkills: sas, tab...",How have you used ai in your projects?\nWhat c...


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Extract input and output texts
train_input_texts = train_df['input_text'].astype(str).tolist()
train_output_texts = train_df['target_text'].astype(str).tolist()

# 2. Fit input tokenizer
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_input_texts)
input_sequences = input_tokenizer.texts_to_sequences(train_input_texts)

# 3. Fit output tokenizer
output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(train_output_texts)
output_sequences = output_tokenizer.texts_to_sequences(train_output_texts)

# 4. Get max input sequence length
max_input_len = max(len(seq) for seq in input_sequences)

# Optional: Pad input and output sequences if you want to use them for training
encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')

# Save input_tokenizer, output_tokenizer, and max_input_len for future use
import pickle

with open('/content/drive/MyDrive/input_tokenizer.pkl', 'wb') as f:
    pickle.dump(input_tokenizer, f)

with open('/content/drive/MyDrive/output_tokenizer.pkl', 'wb') as f:
    pickle.dump(output_tokenizer, f)

with open('/content/drive/MyDrive/max_input_len.txt', 'w') as f:
    f.write(str(max_input_len))


In [None]:
# Load saved assets
import pickle

with open('/content/drive/MyDrive/input_tokenizer.pkl', 'rb') as f:
    input_tokenizer = pickle.load(f)

with open('/content/drive/MyDrive/output_tokenizer.pkl', 'rb') as f:
    output_tokenizer = pickle.load(f)

with open('/content/drive/MyDrive/max_input_len.txt', 'r') as f:
    max_input_len = int(f.read().strip())


In [None]:
!pip install python-docx

from docx import Document

# Load the .docx file
doc = Document("/content/KondetiSandeep_resume.docx")

# Join all paragraphs into one string
resume = "\n".join([para.text for para in doc.paragraphs])

# Now you can pass `full_text` to your function
print(resume)


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
    

Educational Qualification

Academic Achievements
Smart India Hackathon college level appreciation certificate.
Best Team in Deep Learning Workshop conducted at our college by ML Academy in March 2023.  
Internship Experience
Machine Learning Intern at CloudKarya, Inc.: 			             Feb 2024 - Apr 2024
Underwent a Machine Learning Internship program at CloudKarya, Inc. in Visakhapa

In [None]:
ques=generate_question(loaded_model, resume, input_tokenizer, output_tokenizer, max_input_len,skills_list)

In [None]:
ques

'What challenges do you face while working for your project?\n How have you used communication in your projects?\n How have you used ai in your projects?'

##Pretrained Model

In [None]:
def generate_questions_from_resume(resume_text, num_questions_per_skill=2):
    skills = extract_skills(resume_text)

    all_questions = []

    for skill in skills:
        prompt = f"generate interview questions on {skill}"
        inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

        outputs = model.generate(
            inputs,
            max_length=100,
            num_return_sequences=num_questions_per_skill,
            do_sample=True,
            temperature=0.9,
            top_p=0.95
        )

        skill_questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        all_questions.extend(skill_questions)

    return all_questions


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model
model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

def generate_questions_t5(input_text, num_questions=5):
    prompt = "generate interview questions: " + input_text.strip()
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Generate multiple sequences
    outputs = model.generate(
        inputs,
        max_length=100,
        num_return_sequences=num_questions,
        do_sample=True,                # Enable sampling
        temperature=0.9,               # Controls creativity
        top_p=0.95,                    # Nucleus sampling
    )

    # Decode all questions
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def format_resume_input(row,skills_list):
    skills = extract_skills(row,skills_list)
    return (
        f"You are an interviewer preparing for the role of Data Analytics role.\n"
        f"Based on the candidate's resume and on the role , ask a diverse set of interview questions.\n\n"
        # f"Resume Summary:\n{row}\n"
        f"Ask questions on the {skills}, soft skills, domain knowledge, and project experience ask the questions with proper meaning ."
        )
# sample_row = it_resumes.iloc[3]  # or any other index
Input = format_resume_input(resume,skills_list)


In [None]:
# Use the input column from your paired_df
# sample_input = resume[['Resume_str','Resume_html','Category']].iloc[7]

questions = generate_questions_t5(Input, num_questions=5)

for idx, q in enumerate(questions, 1):
    print(f"{idx}. {q}")


1. question: Is the job search for Data Analytics related?
2. question: What questions should you ask in preparing to interview a candidate?
3. question: What questions should you use in the interview process?
4. question: What type of questions must you ask?
5. question: What are you an interviewer preparing for a Data Analytics role?


In [None]:
skills = extract_skills(resume,skills_list)

In [None]:
skills

['data analysis',
 'r',
 'version control',
 'data visualization',
 'ai',
 'communication',
 'statistical analysis']

In [None]:
print(resume)

    

Educational Qualification

Academic Achievements
Smart India Hackathon college level appreciation certificate.
Best Team in Deep Learning Workshop conducted at our college by ML Academy in March 2023.  
Internship Experience
Machine Learning Intern at CloudKarya, Inc.: 			             Feb 2024 - Apr 2024
Underwent a Machine Learning Internship program at CloudKarya, Inc. in Visakhapatnam. The training involved gaining the practical experience on machine learning and valuable insights on data analysis. Worked on the design and development of a web application that will be utilized for desktop and mobile platforms, deployed on GCP. 

Other Projects
Automail using ChatGPT:                                                         			      Mar 2024
Developed a simple application 'Automail' an automated email generation system that processes company and recipient data inputs in the form of prompts to produce personalized emails in various tones. The system utilizes natural language prom

##Another pre trained model

In [None]:
!pip install transformers




In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")

def generate_questions_from_resume(resume_text, skills, num_questions_per_skill):
    questions = []

    for skill in skills:
        if skill.lower() in resume_text.lower():
            # Highlight the skill in the resume
            highlighted_text = resume_text.replace(skill, f"<hl> {skill} <hl>")
            prompt = f"generate question: {highlighted_text}"

            inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
            outputs = model.generate(inputs, max_length=64, num_return_sequences=num_questions_per_skill, do_sample=True)

            skill_questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            questions.extend(skill_questions)

    return questions


In [None]:

questions = generate_questions_from_resume(resume, skills, num_questions_per_skill=2)

for idx, q in enumerate(questions, 1):
    print(f"{idx}. {q}")


1. What type of skills could one gain from a Machine Learning Intern?
2. What type of learning did the trainees gain valuable insights on during their training?
3. What does “Automail” stand for?
4. What else did it involve?
5. What was the job title of the Gemini ML holder?
6. What do you manage in Git?
7. What was the primary function of the T20 Sense platform?
8. Matplotlib, Plotly, and other technologies integrate into the platform for what purpose?
9. What is the goal of the software that did this internship include training others to understand and work with machine learning?
10. What is its basic description?
11. What kind of business would Automail generate?
12. What does Automail generate customized business forms?
13. What does Python work on?
14. Python enables users to explore player and team levels beyond traditional scorecards. What other ability does T20 Sense have?


##Metrics

In [None]:
def skill_match_score(skills, questions):
    match_counts = [any(skill.lower() in q.lower() for skill in skills) for q in questions]
    return sum(match_counts) / len(questions) if questions else 0

print("Skill Match Accuracy:", skill_match_score(skills, questions))


Skill Match Accuracy: 0.8571428571428571


In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained("t5-base")

def calculate_perplexity(text):
    input_ids = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
    return torch.exp(loss).item()

# text = "generate interview questions: Explain your experience with Python and Django."
print("Perplexity:", calculate_perplexity(questions))


Perplexity: 2.2151262760162354
