In [None]:
!pip install transformers datasets torch pandas datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from google.colab import drive
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
from transformers.trainer_utils import get_last_checkpoint
from datasets import Dataset
import torch
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the occupations table from Drive (adjust the path if the file lives elsewhere)
df = pd.read_csv(
    '/content/drive/MyDrive/Occupations.csv',
    low_memory=False,
    encoding='latin-1',
)

# Keep only columns whose header does not start with "Unnamed" (any letter case)
unnamed_mask = df.columns.str.contains('^Unnamed', case=False)
df = df.loc[:, ~unnamed_mask]

# Normalise headers: trim surrounding whitespace and lower-case them
df.columns = df.columns.str.strip().str.lower()

# Discard rows that are missing any of the fields used downstream
required_columns = ['job', 'detailed work activity', 'interest description', 'task', 'skills']
df = df.dropna(subset=required_columns)

# Show the resulting headers as a quick sanity check
print("DataFrame Columns:", df.columns)

DataFrame Columns: Index(['job', 'detailed work activity', 'interest description', 'task',
       'skills'],
      dtype='object')


In [None]:
# Function to create a randomized corpus for training
def create_corpus_with_randomization(df, seed=None):
    """Build a de-duplicated list of two-line "Q: ..." / "A: ..." training strings.

    For every row, one ';'-separated fragment is drawn at random from each of
    the 'task', 'detailed work activity', 'interest description' and 'skills'
    cells and turned into a question/answer pair about the row's 'job'.

    Args:
        df: DataFrame with the columns 'job', 'task', 'detailed work activity',
            'interest description' and 'skills'.
        seed: Optional seed. When given, fragments are drawn from a private
            ``random.Random(seed)`` so the corpus is reproducible; when None
            the module-level ``random`` state is used.

    Returns:
        Sorted list of unique Q/A strings. Sorting makes the order independent
        of set iteration order.
    """
    rng = random if seed is None else random.Random(seed)

    def pick(cell):
        """Return one random non-empty fragment of a ';'-separated cell, or None."""
        if pd.isna(cell):
            return None
        # Drop blank fragments (e.g. from a trailing ';') BEFORE drawing, so a
        # blank can never be chosen and silently suppress the row's Q/A pair.
        fragments = [part.strip() for part in str(cell).split(';')]
        fragments = [part for part in fragments if part]
        return rng.choice(fragments) if fragments else None

    corpus = set()  # Use a set to avoid duplicates

    for _, row in df.iterrows():
        job = row['job']

        # Randomly select one item from each category
        task = pick(row['task'])
        detailed_work_activity = pick(row['detailed work activity'])
        interest_description = pick(row['interest description'])
        skill = pick(row['skills'])

        # Filter overly generic responses: skip answers that merely echo the
        # category word they are supposed to describe.
        if task and "task" not in task.lower():
            corpus.add(f"Q: What is the task for {job}?\nA: {task}")
        if detailed_work_activity and "activity" not in detailed_work_activity.lower():
            corpus.add(f"Q: What is the detailed work activity for {job}?\nA: {detailed_work_activity}")
        if interest_description and "interest" not in interest_description.lower():
            corpus.add(f"Q: What is the interest description for {job}?\nA: {interest_description}")
        if skill and "skill" not in skill.lower():
            corpus.add(f"Q: What skills are needed for {job}?\nA: {skill}")

        # Add a generic question-answer pair, ensuring it's meaningful
        if skill and len(skill.split()) > 3:  # Exclude overly short or generic skills
            corpus.add(f"Q: Can you tell me more about being a {job}?\nA: {skill}")

    return sorted(corpus)

# Build the Q/A corpus from the cleaned occupations table
corpus = create_corpus_with_randomization(df)

# Show at most the first five entries as a quick sanity check
sample_size = 5
for position in range(min(sample_size, len(corpus))):
    print(corpus[position])

Q: What is the task for Statistician?
A: Analyze and interpret statistical data to identify significant differences in relationships among sources of information.
Q: What skills are needed for Geographic Information Systems Technologists and Technicians?
A: Proficiency in programming languages such as Python and SQL is important for automating tasks, performing spatial analysis, and managing databases.
Q: What is the detailed work activity for Network and Computer Systems Administrator?
A: Resolve computer software problems.
Q: What is the detailed work activity for Database Administrator?
A: Implement security measures for computer or information systems.
Q: What is the task for Computer Network Architects?
A: Coordinate installation of new equipment.


In [None]:
# Persist the cleaned table back to Drive. The file is read with
# encoding='latin-1', so it is written with the same encoding: to_csv defaults
# to UTF-8, and re-reading UTF-8 bytes as latin-1 would corrupt every
# non-ASCII character (e.g. '\xa0') a little more on each round trip.
df.to_csv('/content/drive/MyDrive/Occupations.csv', index=False, encoding='latin-1')


In [None]:
# Turn every row into an input/target pair: job details in, skills out
train_data = []
for _, row in df.iterrows():
    job_details = (
        f"Job: {row['job']}, "
        f"Activity: {row['detailed work activity']}, "
        f"Interests: {row['interest description']}, "
        f"Task: {row['task']}"
    )
    train_data.append({
        "input_text": job_details,
        "target_text": f"Skills: {row['skills']}",
    })

# Print the first few examples to eyeball the format
print("Sample Training Data:", train_data[:5])


Sample Training Data: [{'input_text': 'Job: Blockchain Engineer, Activity: Design integrated computer systems., Interests: Work involves following procedures and regulations to organize information or data, typically in a business setting. Conventional occupations are often associated with office work, accounting, mathematics/statistics, information technology, finance, or human resources., Task: Assess blockchain threats, such as untested code and unprotected keys.', 'target_text': 'Skills: Understanding the fundamental structure of blockchain systems, including distributed ledger technology, consensus mechanisms, and network protocols is crucial.\xa0'}, {'input_text': 'Job: Blockchain Engineer, Activity: Discuss design or technical features of products or services with technical personnel., Interests: Work involves studying and researching non-living objects, living organisms, disease or other forms of impairment, or human behavior. Investigative occupations are often associated with

In [None]:
# Convert train_data (a list of dictionaries) to a Hugging Face Dataset
dataset = Dataset.from_list(train_data)

# Split dataset into train and validation sets.
# A fixed seed makes the 90/10 split identical on every run; without it the
# shuffle differs each time and runs cannot be compared.
# NOTE(review): train_dataset / val_dataset are rebound further down to
# ChatbotDataset instances built from `corpus`, so this split does not appear
# to feed the Trainer -- confirm whether it is still needed.
train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

# Verify the splits
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")

Train Dataset Size: 140
Validation Dataset Size: 16


In [None]:
# Load GPT-2 model and tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer # Import necessary classes
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set pad_token to eos_token (end-of-sequence token) so that padding=True
# below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Hold out 10% of the corpus for validation. random_state pins the shuffle so
# the same corpus always produces the same train/validation split.
train_texts, val_texts = train_test_split(corpus, test_size=0.1, random_state=42)

# Tokenize the corpus: pad each split to its longest sequence, cap at 512 tokens
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Create a custom dataset class for tokenized inputs
class ChatbotDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output for causal-LM fine-tuning.

    Each item is a dict of tensors (one per key in the encodings) plus a
    ``labels`` tensor copied from ``input_ids``. Positions where
    ``attention_mask`` is 0 (padding) are set to ``IGNORE_INDEX`` in
    ``labels`` so they do not contribute to the loss; the pad token here is
    the EOS token, so the token id alone cannot identify padding.

    Args:
        encodings: Mapping with an ``"input_ids"`` entry (and usually
            ``"attention_mask"``), each a list of per-example sequences.
    """

    # Label value skipped by PyTorch cross-entropy (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Key access works for both tokenizer output and plain dicts.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = item["input_ids"].clone()  # Labels for loss calculation
        attention_mask = item.get("attention_mask")
        if attention_mask is not None:
            # Exclude padded positions from the loss.
            labels[attention_mask == 0] = self.IGNORE_INDEX
        item["labels"] = labels
        return item

# Wrap both tokenized splits (training first, validation second) as datasets
train_dataset, val_dataset = (
    ChatbotDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

In [None]:
# Repeats the pad-token assignment already made right after the tokenizer was
# loaded; kept so this cell can be re-run on its own.
tokenizer.pad_token = tokenizer.eos_token  # Ensures proper handling of padding

In [None]:
# Define training arguments, grouped by concern and merged into one call

# Where and how often checkpoints are written (only the last 2 are kept)
checkpoint_options = dict(
    output_dir="/content/drive/MyDrive/chatbot_model",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
)

# Evaluate at the same cadence as saving; the best model is the one with the lowest loss
evaluation_options = dict(
    eval_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=4,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Log every 100 steps to Drive; no external reporting integrations
logging_options = dict(
    logging_dir="/content/drive/MyDrive/logs",
    logging_steps=100,
    report_to="none",
)

# Optimisation hyper-parameters
optimisation_options = dict(
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=50,
    weight_decay=0.01,
)

training_args = TrainingArguments(
    **checkpoint_options,
    **evaluation_options,
    **logging_options,
    **optimisation_options,
)

In [None]:
# Define a custom callback for checkpoint saving
class SaveCheckpointWithLMHeadCallback(TrainerCallback):
    """Re-save the model (after tying weights) and the tokenizer into each checkpoint folder.

    Args:
        tokenizer: Tokenizer to write next to the model files.
    """

    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def on_save(self, args, state, control, model=None, **kwargs):
        """Write model and tokenizer to ``<output_dir>/checkpoint-<global_step>``.

        Uses the ``model`` keyword supplied with the event rather than the
        notebook-level ``model`` variable: that variable can be rebound by a
        later cell, in which case it no longer refers to the model being trained.
        """
        if model is None:
            # Nothing to re-save without a model reference.
            return

        # Tie weights before saving
        model.tie_weights()

        # Save the model and tokenizer manually
        checkpoint_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        print(f"Saving checkpoint at {checkpoint_dir}")
        model.save_pretrained(checkpoint_dir)
        self.tokenizer.save_pretrained(checkpoint_dir)

# Debug callback to log device placement
class DebugCallback(TrainerCallback):
    """Print the device of the model's first parameter at train start and every 100 steps.

    Reads the ``model`` keyword supplied with each event instead of the
    notebook-level ``model`` variable, so the report always describes the
    model that is actually being trained.
    """

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        if model is not None:
            print("Model device:", next(model.parameters()).device)  # Log once at the start

    def on_step_begin(self, args, state, control, model=None, **kwargs):
        if model is not None and state.global_step % 100 == 0:  # Log every 100 steps
            print(f"Step {state.global_step}: Model device: {next(model.parameters()).device}")


# Callbacks: extra checkpoint writer plus the device-placement logger
training_callbacks = [
    SaveCheckpointWithLMHeadCallback(tokenizer),
    DebugCallback(),
]

# Assemble the Trainer from the model, arguments, datasets and callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=training_callbacks,
)

  trainer = Trainer(


In [None]:
# SECURITY: access tokens must never be written into the notebook. Any token
# that was previously stored in this cell should be treated as leaked and
# revoked; supply credentials at runtime instead (for example via Colab
# secrets or environment variables).

# Resume training logic from the last checkpoint.
# get_last_checkpoint lists the directory, so only call it once it exists.
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
else:
    last_checkpoint = None

if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")
    # Let the Trainer restore the checkpoint itself. Loading a separate copy
    # into the notebook-level `model` would leave `trainer.model` (the one
    # that is trained) and `model` (the one saved below) as different objects.
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    trainer.train()

# Save the final model and tokenizer explicitly.
# Take the model from the Trainer so the weights written are the trained ones.
model = trainer.model
model_save_path = "/content/drive/MyDrive/chatbot_model"
print("Saving the final model...")
model.tie_weights()  # Ensure weights are tied
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print("Final model and tokenizer saved.")

Model device: cuda:0
Step 0: Model device: cuda:0


Step,Training Loss,Validation Loss
500,0.3853,0.486361
1000,0.2105,0.435493
1500,0.1458,0.460901
2000,0.1203,0.476775
2500,0.1086,0.479406
3000,0.1022,0.498116
3500,0.0984,0.503284
4000,0.0944,0.500005
4500,0.092,0.513683
5000,0.0902,0.526343


Step 100: Model device: cuda:0
Step 200: Model device: cuda:0
Step 300: Model device: cuda:0
Step 400: Model device: cuda:0
Saving checkpoint at /content/drive/MyDrive/chatbot_model/checkpoint-500
Step 500: Model device: cuda:0
Step 600: Model device: cuda:0
Step 700: Model device: cuda:0
Step 800: Model device: cuda:0
Step 900: Model device: cuda:0
Saving checkpoint at /content/drive/MyDrive/chatbot_model/checkpoint-1000
Step 1000: Model device: cuda:0
Step 1100: Model device: cuda:0
Step 1200: Model device: cuda:0
Step 1300: Model device: cuda:0
Step 1400: Model device: cuda:0
Saving checkpoint at /content/drive/MyDrive/chatbot_model/checkpoint-1500
Step 1500: Model device: cuda:0
Step 1600: Model device: cuda:0
Step 1700: Model device: cuda:0
Step 1800: Model device: cuda:0
Step 1900: Model device: cuda:0
Saving checkpoint at /content/drive/MyDrive/chatbot_model/checkpoint-2000
Step 2000: Model device: cuda:0
Step 2100: Model device: cuda:0
Step 2200: Model device: cuda:0
Step 2300:

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving the final model...
Final model and tokenizer saved.


In [None]:
# Write the model and the tokenizer files to the shared Drive folder
final_model_dir = '/content/drive/MyDrive/chatbot_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


('/content/drive/MyDrive/chatbot_model/tokenizer_config.json',
 '/content/drive/MyDrive/chatbot_model/special_tokens_map.json',
 '/content/drive/MyDrive/chatbot_model/vocab.json',
 '/content/drive/MyDrive/chatbot_model/merges.txt',
 '/content/drive/MyDrive/chatbot_model/added_tokens.json',
 '/content/drive/MyDrive/chatbot_model/tokenizer.json')

In [None]:
from google.colab import drive
from IPython.display import display, HTML, Javascript
import random
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Mount Google Drive at /content/drive so the saved model and the CSV under
# /content/drive/MyDrive can be loaded by the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers torcha



In [None]:
# Checkpoint folder to load the fine-tuned model and its tokenizer from
model_path = '/content/drive/MyDrive/chatbot_model/checkpoint-1000'

# Tokenizer first; reuse EOS as the padding token for compatibility
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Then the causal language model from the same folder
model = AutoModelForCausalLM.from_pretrained(model_path)

In [None]:
import pandas as pd
df = pd.read_csv(
    '/content/drive/MyDrive/Occupations.csv',
    low_memory=False,
    encoding='latin-1',
)

# Same clean-up as in the training code: drop "Unnamed" columns, normalise
# the headers, and remove rows that are missing any required field.
is_unnamed = df.columns.str.contains('^Unnamed', case=False)
df = df.loc[:, ~is_unnamed]
df.columns = df.columns.str.strip().str.lower()
df = df.dropna(
    subset=['job', 'detailed work activity', 'interest description', 'task', 'skills']
)


In [None]:
import random
import pandas as pd

# Initialize conversation state
conversation_state = {}


def _find_job_in_prompt(normalized_prompt):
    """Return the longest job title from df['job'] contained in the lower-cased prompt, or None."""
    matches = [job for job in df['job'].unique() if job.lower() in normalized_prompt]
    # Prefer the longest title so a specific title wins over a shorter one
    # that happens to be a substring of it.
    return max(matches, key=len) if matches else None


def generate_career_guidance(prompt):
    """Produce the chatbot's reply to one user message.

    Keyword-driven dialogue over the occupations table:
      1. 'quit' clears the conversation state and says goodbye.
      2. 'recommend' / 'suggest' returns random career suggestions.
      3. A job title found in the prompt becomes the current job.
      4. 'skills' / 'task' / 'interest' answers for the current job. The job
         named in the prompt takes precedence over one remembered from an
         earlier turn, so "What tasks does a <job> perform?" is answered for
         <job> even when another job was discussed before.
      5. Otherwise the user is introduced to the job, or asked to clarify.

    Args:
        prompt: Raw user message.

    Returns:
        The reply text.
    """
    # Normalize user input for better matching
    normalized_prompt = prompt.lower()

    # Handle the quit condition
    if 'quit' in normalized_prompt:
        conversation_state.clear()
        return "Goodbye! All the best for your future!"

    # Handle recommendations for careers
    if 'recommend' in normalized_prompt or 'suggest' in normalized_prompt:
        return recommend_careers()

    # Detect a job in the prompt BEFORE answering detail questions, so the
    # answer is about the job the user just named rather than a stale one.
    matched_job = _find_job_in_prompt(normalized_prompt)
    if matched_job:
        conversation_state['job'] = matched_job
    job = conversation_state.get('job')

    # Check if the user is asking for skills, tasks, or interests directly
    wants_detail = any(
        keyword in normalized_prompt for keyword in ('skills', 'task', 'interest')
    )
    if wants_detail:
        if job is None:
            return "Please tell me which job you're interested in first."
        if 'skills' in normalized_prompt:
            return f"The skills for {job} are: {get_skills_for_job(job)}"
        if 'task' in normalized_prompt:
            return f"The tasks for {job} are: {get_task_for_job(job)}"
        return f"The interests for {job} are: {get_interest_for_job(job)}"

    # If a job is detected and no specific query, respond with a general introduction
    if matched_job:
        # NOTE(review): get_general_description_for_job is not defined in the
        # cells visible here; it must already exist in the kernel or this
        # branch raises NameError on a fresh run.
        return f"Being a {matched_job} is all about {get_general_description_for_job(matched_job)}. Would you like to know about the tasks, skills, or interests for this role?"

    # If no job is detected but the context exists, ask what to show for it
    if job is not None:
        return f"Could you specify whether you'd like to know about skills, tasks, or interests for the role of {job}?"

    # If no job or specific query is found, prompt the user for further clarification
    return "Could you specify which job you're interested in first?"

def recommend_careers(k=3):
    """Suggest up to ``k`` distinct job titles drawn at random from df['job'].

    Args:
        k: Number of suggestions wanted (default 3). It is clamped to the
            number of distinct jobs available, and to at least 1, so a small
            table no longer makes ``random.sample`` raise ValueError.

    Returns:
        A sentence listing the suggestions, or a short notice when the table
        contains no jobs.
    """
    jobs = list(df['job'].unique())
    if not jobs:
        return "I don't have any careers to suggest right now."

    how_many = max(1, min(k, len(jobs)))
    recommended_jobs = random.sample(jobs, k=how_many)
    return f"Here are some career suggestions for you: {', '.join(recommended_jobs)}. Which one interests you?"

def _first_value_for_job(job, column, fallback):
    """Return ``column`` from the first row whose job equals ``job`` (case-insensitive).

    ``fallback`` is returned when no row matches or when the stored value is
    missing, instead of raising IndexError on an unknown job.
    """
    matches = df.loc[df['job'].str.lower() == job.lower(), column]
    if matches.empty:
        return fallback
    value = matches.iloc[0]
    return value if pd.notna(value) else fallback


def get_task_for_job(job):
    """Return the task text of the first row for ``job``, or a not-available notice."""
    return _first_value_for_job(job, 'task', "Task information not available.")


def get_interest_for_job(job):
    """Return the interest description of the first row for ``job``, or a not-available notice."""
    return _first_value_for_job(job, 'interest description', "Interest information not available.")


def get_skills_for_job(job):
    """Return the skills text of the first row for ``job``, or a not-available notice."""
    return _first_value_for_job(job, 'skills', "Skills information not available.")

# Greet the user and open with a few random suggestions
print("Chatbot: Hi! I'm here to help you choose a career in computer science. Type 'quit' to exit.")
response = recommend_careers()
print(f"Chatbot: {response}")

# Keep answering until the bot's reply contains its farewell (sent after "quit")
conversation_over = False
while not conversation_over:
    user_input = input("You: ")
    response = generate_career_guidance(user_input)
    print(f"Chatbot: {response}")
    conversation_over = 'Goodbye' in response


Chatbot: Hi! I'm here to help you choose a career in computer science. Type 'quit' to exit.
Chatbot: Here are some career suggestions for you: Health Informatics Specialist, Business Intelligence Analyst, Web and Digital Interface Designer. Which one interests you?
You: I wanna be a web developer.
Chatbot: Being a Web Developer is all about creating and maintaining websites, ensuring they are user-friendly, functional, and visually appealing.. Would you like to know about the tasks, skills, or interests for this role?
You: What are the tasks then?
Chatbot: The tasks for Web Developer are: Write supporting code for Web applications or Web sites.
You: What about the skills needed?
Chatbot: The skills for Web Developer are: Mastery of HTML (Hypertext Markup Language) and CSS (Cascading Style Sheets) is essential for creating and styling web pages, forming the foundation of web development projects.
You: Can you recommend other cs related jobs?
Chatbot: Here are some career suggestions for

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def evaluate_chatbot_responses(test_queries, gold_standard, model, tokenizer, threshold=0.8):
    """Score chatbot replies against reference answers using TF-IDF cosine similarity.

    Each query is sent to ``generate_career_guidance``; a reply counts as
    correct when its cosine similarity to the matching reference is strictly
    greater than ``threshold``.

    Args:
        test_queries: List of user queries.
        gold_standard: List of reference answers, aligned with ``test_queries``.
        model: Unused; kept so existing calls keep working.
        tokenizer: Unused; kept so existing calls keep working.
        threshold: Minimum similarity (exclusive) for a reply to count as correct.

    Returns:
        Dict with 'accuracy', 'precision' and 'recall'. Every ground-truth
        label is 1, so precision is always 1.0 and accuracy equals recall:
        both are the share of replies above the threshold.

    Raises:
        ValueError: If the two lists are empty or differ in length.
    """
    if len(test_queries) != len(gold_standard):
        raise ValueError(
            f"test_queries ({len(test_queries)}) and gold_standard "
            f"({len(gold_standard)}) must have the same length"
        )
    if not test_queries:
        raise ValueError("test_queries must not be empty")

    print("Starting Evaluation...\n")

    # NOTE(review): generate_career_guidance keeps conversation state between
    # calls, so the order of the queries can influence the replies.
    responses = [generate_career_guidance(query) for query in test_queries]

    # Fit the vocabulary on the replies as well. Words seen only in a reply
    # would otherwise be dropped by transform(), which inflates the similarity.
    vectorizer = TfidfVectorizer().fit(
        list(test_queries) + list(gold_standard) + responses
    )

    y_true = []  # Ground truth: every query is expected to be answerable
    y_pred = []  # 1 if the reply is close enough to the reference, else 0

    for query, expected, chatbot_response in zip(test_queries, gold_standard, responses):
        print(f"User Query: {query}")
        print(f"Expected: {expected}")
        print(f"Chatbot Response: {chatbot_response}\n")

        # Compute cosine similarity between the chatbot's response and the gold standard
        response_vec = vectorizer.transform([chatbot_response])
        gold_vec = vectorizer.transform([expected])
        similarity = cosine_similarity(response_vec, gold_vec)[0][0]

        y_true.append(1)
        y_pred.append(1 if similarity > threshold else 0)

    # Calculate performance metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)

    print("\nPerformance Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")

    return {"accuracy": accuracy, "precision": precision, "recall": recall}

# Each case pairs a user query with the reference answer it is scored against
evaluation_cases = [
    (
        "What are the skills of a Software Developer?",
        "Skills: Proficiency in multiple programming languages such as Java, Python, C++, JavaScript, and C# is essential for developing software applications and systems.",
    ),
    (
        "Tell me about the role of a Penetration Tester.",
        "Penetration Tester: Tests systems for vulnerabilities, conducts ethical hacking, and helps secure networks.",
    ),
    (
        "What tasks does a Database Administrator perform?",
        "Tasks: Maintain databases, ensure data integrity, perform backups, and manage database security.",
    ),
    (
        "What is the role of a Software Developer?",
        "Skills: Software design, programming, debugging, testing, and proficiency in various programming languages.",
    ),
    (
        "Can you recommend some jobs related to network security?",
        "Here are some career suggestions related to network security: Penetration Tester, Network Engineer, Security Analyst.",
    ),
]

# Split the pairs into the two aligned lists the evaluator takes
test_queries = [query for query, _ in evaluation_cases]
gold_standard = [reference for _, reference in evaluation_cases]


# Run the evaluation over the query/reference lists and keep the returned
# dict of accuracy, precision and recall in `metrics`.
metrics = evaluate_chatbot_responses(test_queries, gold_standard, model, tokenizer)


Starting Evaluation...

User Query: What are the skills of a Software Developer?
Expected: Skills: Proficiency in multiple programming languages such as Java, Python, C++, JavaScript, and C# is essential for developing software applications and systems.
Chatbot Response: The skills for Software Developer are: Proficiency in multiple programming languages such as Java, Python, C++, JavaScript, and C# is essential for developing software applications and systems.

User Query: Tell me about the role of a Penetration Tester.
Expected: Penetration Tester: Tests systems for vulnerabilities, conducts ethical hacking, and helps secure networks.
Chatbot Response: Being a Penetration Tester is all about testing systems for vulnerabilities to prevent potential breaches.. Would you like to know about the tasks, skills, or interests for this role?

User Query: What tasks does a Database Administrator perform?
Expected: Tasks: Maintain databases, ensure data integrity, perform backups, and manage da