<a href="https://colab.research.google.com/github/mausombi/projectseekhan/blob/main/mixednotebook_test2_seekhanpretrainedrunningattempt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages
# Each line is a separate pip invocation, so each resolves dependencies on its own.
# GPUtil is the only install that keeps pip's normal (non-quiet) output.
!pip install GPUtil
# peft is installed from the GitHub repository rather than a pinned release.
!pip install -q git+https://github.com/huggingface/peft.git
# bitsandbytes is fetched explicitly from the pypi.org index;
# presumably it backs the load_in_8bit=True model loads used later (confirm).
!pip install -q -i https://pypi.org/simple/ bitsandbytes
# transformers is the only package pinned to a specific version here.
!pip install -q transformers==4.30
!pip install -q datasets

# Import necessary libraries
import torch
import GPUtil
import os
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig, LlamaTokenizer
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datetime import datetime

# Check if running in Google Colab and enable custom widget manager.
# The presence of the COLAB_GPU environment variable is used as the Colab marker.
if 'COLAB_GPU' in os.environ:
    from google.colab import output
    output.enable_custom_widget_manager()

# Set the environment to UTF-8 explicitly
import locale
os.environ['LANG'] = 'en_US.UTF-8'
os.environ['LC_ALL'] = 'en_US.UTF-8'
try:
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
except locale.Error as exc:
    # setlocale raises locale.Error when the requested locale is not installed
    # on the host. Report it and carry on instead of aborting the setup cell;
    # the environment variables above are still in place for child processes.
    print(f"Could not switch to locale 'en_US.UTF-8': {exc}")

# Verify if the locale is set correctly
print("Locale is set to:", locale.getpreferredencoding())

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=afaaf4aa0732291019a588b8c62406b1e2b4e22c19c9d20ea7538ab9c3e6019c
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.

In [None]:
# Before running the cell below, add a secret named "hftoken_seekhan"
# in the Colab Secrets section.

In [None]:
import os
from huggingface_hub import login
from google.colab import userdata
# Retrieve the token from Colab secrets, falling back to an environment variable.
try:
    HF_TOKEN = userdata.get('hftoken_seekhan')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    # userdata.get raises (it does not return None) when the secret is missing
    # or this notebook has not been granted access to it. Treat both as
    # "no token from secrets" so the environment fallback below can run.
    HF_TOKEN = None

if not HF_TOKEN:
    # HF_TOKEN is the environment variable name huggingface_hub itself honours.
    HF_TOKEN = os.environ.get('HF_TOKEN')

# Use the token to login; fail loudly when neither source provided one.
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    raise ValueError("Hugging Face token not found. Please check your secrets.")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Successfully logged in to Hugging Face!


In [None]:

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Adapter to load; its PEFT config records which base model it was trained on.
peft_model_id = "mausombi/seekhan_ft"
config = PeftConfig.from_pretrained(peft_model_id)

# The tokenizer comes from the base checkpoint named in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the base model in 8-bit with automatic device placement, then wrap it
# with the LoRA adapter weights in a single expression.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        config.base_model_name_or_path,
        device_map='auto',
        load_in_8bit=True,
        return_dict=True,
    ),
    peft_model_id,
)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Adapter to apply, plus the adapter config that names its base checkpoint.
peft_model_id = "divyabindu77/seekhan_model"
config = PeftConfig.from_pretrained(peft_model_id)

# Base weights: 8-bit precision, placement decided by device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, device_map='auto', load_in_8bit=True, return_dict=True
)

# Tokenizer is taken from the base checkpoint, not from the adapter repository.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Rebind `model` to the base model wrapped with the LoRA (PEFT) adapter.
model = PeftModel.from_pretrained(model, peft_model_id)




In [None]:
# Domain-gated question generation: classify the topic against computer-science
# labels first, and only prompt the fine-tuned model when the topic passes.

# Define the question or topic
question = "Allu Arjun"

# Adjusted labels based on the content of the uploaded file
labels = [
    "data structures and algorithms",
    "programming concepts",
    "software engineering",
    "databases and data management",
    "machine learning and data science"
]

# Initialize the zero-shot classifier
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt"  # Force PyTorch
)

# Classify the topic. multi_label=True scores each label independently.
# In the default mode the scores are normalised to sum to 1 across the
# candidates, so an unrelated topic is still forced to spread its whole
# probability mass over the computer-science labels and the threshold below
# would not measure domain membership at all.
classification = classifier(question, labels, multi_label=True)

# The pipeline returns labels/scores sorted best-first, so index 0 is the
# strongest match. (The top label is always one of `labels`, so no membership
# test is needed.)
if classification["scores"][0] > 0.7:
    # If it belongs to computer science, proceed with question generation
    eval_prompt = f"The topic is this- {question}. Generate 30 Multiple choice questions on it.\n\n"

    # Tokenize the prompt without returning token_type_ids, and place the
    # tensors on whichever device the model was loaded onto.
    promptTokenized = tokenizer(
        eval_prompt,
        return_tensors="pt",
        return_token_type_ids=False  # Disable token_type_ids
    ).to(model.device)

    # Set the model to evaluation mode
    model.eval()

    # Generate the response without tracking gradients
    with torch.no_grad():
        output_tokens = model.generate(
            **promptTokenized,
            max_new_tokens=2048,
            do_sample=True,  # temperature is only applied when sampling is enabled
            temperature=0.8
        )

    generated_text = tokenizer.decode(
        output_tokens[0],
        skip_special_tokens=True
    )
    print(generated_text)

    # Clear CUDA cache
    torch.cuda.empty_cache()

else:
    # If not related to the topics, print an appropriate message
    print(f"Sorry, the topic '{question}' is out of the domain of computer science. Try again with a relevant topic.")


In [None]:
# Domain-gated question generation for a single topic; reuses the `classifier`,
# `tokenizer` and `model` objects created by earlier cells.

# Define the question or topic
question = "Polymorphism"

# Adjusted labels based on the content of the uploaded file
labels = [
    "data structures and algorithms",
    "programming concepts",
    "software engineering",
    "databases and data management",
    "machine learning and data science"
]

# Classify the topic. multi_label=True makes every label's score an independent
# probability; the default mode normalises scores to sum to 1 across the
# candidates, which says nothing about whether the topic is in-domain at all.
classification = classifier(question, labels, multi_label=True)

# Results are sorted best-first, so index 0 holds the strongest match. The top
# label always comes from `labels`, so only the score needs checking.
if classification["scores"][0] > 0.3:
    # If it belongs to computer science, proceed with question generation
    eval_prompt = f"The topic is this- {question}. Generate 10 Multiple choice questions on it with options and answers.\n\n"

    # Tokenize the prompt without returning token_type_ids, and move the
    # tensors to the device the model actually lives on.
    promptTokenized = tokenizer(
        eval_prompt,
        return_tensors="pt",
        return_token_type_ids=False  # Disable token_type_ids
    ).to(model.device)

    # Set the model to evaluation mode
    model.eval()

    # Generate the response without tracking gradients
    with torch.no_grad():
        output_tokens = model.generate(
            **promptTokenized,
            max_new_tokens=2048,
            do_sample=True,  # without sampling, temperature has no effect
            temperature=0.8
        )

    generated_text = tokenizer.decode(
        output_tokens[0],
        skip_special_tokens=True
    )
    print(generated_text)

    # Clear CUDA cache
    torch.cuda.empty_cache()

else:
    # If not related to the topics, print an appropriate message
    print(f"Sorry, the topic '{question}' is out of the domain of computer science. Try again with a relevant topic.")


In [None]:
# Prompt-only variant: no classifier gate. The prompt itself asks the model to
# refuse topics outside computer science before generating questions.

# Define your question
question = "Allu Arjun"

# Format the question into the prompt
eval_prompt = f"The topic is this- {question}, First, check if the {question} is out of the domain of computer science. If it is outside the domain of Computer Science, Just say that it is out of bound and do not generate any . If it is in the domain of computer science, You are supposed to generate 30 Multiple choice questions on it. \n\n"

# Tokenize the prompt without returning token_type_ids, and move the tensors
# to the device the model was loaded onto instead of assuming "cuda".
promptTokenized = tokenizer(
    eval_prompt,
    return_tensors="pt",
    return_token_type_ids=False  # Disable token_type_ids
).to(model.device)

# Set the model to evaluation mode
model.eval()

# Generate the response without tracking gradients
with torch.no_grad():
    output_tokens = model.generate(
        **promptTokenized,
        max_new_tokens=2048,
        # Optional knobs left disabled: repetition_penalty=1.2, num_beams=5
        do_sample=True,  # temperature is ignored unless sampling is enabled
        temperature=0.8
    )

generated_text = tokenizer.decode(
    output_tokens[0],
    skip_special_tokens=True
)
print(generated_text)

# Clear CUDA cache
torch.cuda.empty_cache()


SyntaxError: unterminated string literal (detected at line 28) (<ipython-input-17-7e5b4b29e14c>, line 28)

In [None]:
# Clear the CUDA cache; kept as its own cell, presumably so it can be run
# manually between generation attempts (confirm intent).
torch.cuda.empty_cache()


In [None]:
# Zero-shot classifier shared by the classification cells that follow.
# framework="pt" matches the other pipeline constructions in this notebook,
# which force the PyTorch backend.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt"  # Force PyTorch
)

In [None]:
# Topic to score against the domain labels
question = "Python"

# Candidate labels, adjusted to the content of the uploaded file
labels = [
    "data structures and algorithms",
    "programming concepts",
    "software engineering",
    "databases and data management",
    "machine learning and data science",
]

# Run zero-shot classification; the result is kept in `classification`
# so that a later cell can inspect it.
classification = classifier(question, candidate_labels=labels)


In [None]:
 classification["scores"][0]  # displays the first entry of the scores list from the last classification

In [None]:


# Domain-gated question generation; relies on `classifier`, `tokenizer` and
# `model` having been created by earlier cells.

# Define the question or topic
question = "Allu Arjun"

# Adjusted labels based on the content of the uploaded file
labels = ["data structures and algorithms",
          "programming concepts",
          "software engineering",
          "databases and data management",
          "machine learning and data science"]

# Classify the topic. With multi_label=True each label is scored on its own.
# The default mode normalises the scores to sum to 1 over the candidates, so a
# topic unrelated to every label still gets its probability divided among them
# and the threshold cannot act as an in-domain test.
classification = classifier(question, labels, multi_label=True)

# Results come back sorted best-first; the top label is necessarily one of
# `labels`, so the gate only needs the top score.
if classification["scores"][0] > 0.7:
    # If it belongs to computer science, proceed with question generation
    eval_prompt = f"The topic is this- {question}. Generate 30 Multiple choice questions on it.\n\n"

    # Tokenize the prompt without returning token_type_ids, and send the
    # tensors to the model's own device rather than a hard-coded "cuda".
    promptTokenized = tokenizer(
        eval_prompt,
        return_tensors="pt",
        return_token_type_ids=False  # Disable token_type_ids
    ).to(model.device)

    # Set the model to evaluation mode
    model.eval()

    # Generate the response without tracking gradients
    with torch.no_grad():
        output_tokens = model.generate(
            **promptTokenized,
            max_new_tokens=2048,
            do_sample=True,  # temperature only takes effect when sampling
            temperature=0.8
        )

    generated_text = tokenizer.decode(
        output_tokens[0],
        skip_special_tokens=True
    )
    print(generated_text)

    # Clear CUDA cache
    torch.cuda.empty_cache()

else:
    # If not related to the topics, print an appropriate message
    print(f"Sorry, the topic '{question}' is out of the domain of computer science. Try again with a relevant topic.")


In [None]:
pip install tensorflow==2.11 keras==2.11


In [None]:
from transformers import pipeline

# Stand-alone domain check: classify one topic against broad computer-science
# labels and report whether the best match clears the threshold.

# Initialize the zero-shot-classification pipeline (PyTorch backend, matching
# the other pipeline constructions in this notebook).
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt"  # Force PyTorch
)

# Define the input topic with added context
question = "Binary Search"

# Broadened and more relevant labels
labels = [
    "programming",
    "software development",
    "data science",
    "computer science concepts",
    "IT and software"
]

# Classify the topic. multi_label=True scores each label independently; in the
# default mode the scores are normalised to sum to 1 across the candidates, so
# the top score only says which label wins, not whether any label fits.
classification = classifier(question, labels, multi_label=True)

# Print the classification results for better understanding
print("Classification Result:", classification)

# Results are sorted best-first: index 0 is the highest-scoring label.
if classification["scores"][0] > 0.4:  # Adjusted score threshold
    print(f"The topic '{question}' belongs to the '{classification['labels'][0]}' domain.")
else:
    print(f"Sorry, the topic '{question}' is out of the domain of computer science. Try again with a relevant topic.")


In [None]:
# Stand-alone domain check for an out-of-domain example; reuses the
# `classifier` pipeline created by an earlier cell.

# Define the input topic with added context
question = "Allu Arjun"

# Broadened and more relevant labels
labels = [
    "programming",
    "software development",
    "data science",
    "computer science concepts",
    "IT and software"
]

# Classify the topic. multi_label=True gives each label an independent score.
# Otherwise the scores are normalised to sum to 1 across these labels, and an
# unrelated topic can clear the threshold just because one label beats the rest.
classification = classifier(question, labels, multi_label=True)

# Print the classification results for better understanding
print("Classification Result:", classification)

# Results are sorted best-first: index 0 is the highest-scoring label.
if classification["scores"][0] > 0.4:  # Adjusted score threshold
    print(f"The topic '{question}' belongs to the '{classification['labels'][0]}' domain.")
else:
    print(f"Sorry, the topic '{question}' is out of the domain of computer science. Try again with a relevant topic.")


In [None]:
# Upgrade TensorFlow and Keras to the newest versions pip resolves (no version pin).
!pip install --upgrade tensorflow keras
# No --upgrade flag here, so pip leaves already-installed transformers/torch versions in place.
!pip install transformers torch


In [None]:
import locale
import os

# Force a UTF-8 locale, both in the process environment and for the C library.
os.environ.update(LANG='en_US.UTF-8', LC_ALL='en_US.UTF-8')
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# Report the encoding Python now prefers.
print("Locale is set to:", locale.getpreferredencoding())

# Re-run your main logic
from transformers import pipeline

# Zero-shot classifier on the PyTorch backend.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt",  # Force PyTorch
)

# Smoke test: one sentence scored against three unrelated labels.
candidate_labels = ["education", "politics", "sports"]
text = "KMIT is hosting an interactive learning event with quizzes."

result = classifier(text, candidate_labels=candidate_labels)
print(result)


In [None]:
# Free-form question answering with the fine-tuned model. Relies on `os`,
# `locale`, `pipeline`, `torch`, `tokenizer` and `model` from earlier cells.

# Step 1: Install necessary libraries
# !pip install transformers torch

# Step 2: Import required modules
#from transformers import pipeline
#import torch
os.environ['LANG'] = 'en_US.UTF-8'
os.environ['LC_ALL'] = 'en_US.UTF-8'
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# Step 3: Initialize the zero-shot classification pipeline
# (not used further down in this cell; it refreshes `classifier` for other cells)
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt"
    # device=0 if torch.cuda.is_available() else -1  # Use GPU if available, otherwise CPU
)

### ENTER YOUR QUESTION BELOW

question = "What is data abstraction?"

# Format the question
eval_prompt = f"{question}\n\n"

# Tokenize without token_type_ids, as every other generation cell in this
# notebook does, and move the tensors to the device the model is on.
promptTokenized = tokenizer(
    eval_prompt,
    return_tensors="pt",
    return_token_type_ids=False  # Disable token_type_ids
).to(model.device)

# Evaluation mode, no gradient tracking during generation.
model.eval()
with torch.no_grad():
    output_tokens = model.generate(**promptTokenized, max_new_tokens=1024)

print(tokenizer.decode(output_tokens[0], skip_special_tokens=True))

# Clear CUDA cache
torch.cuda.empty_cache()