In [1]:
!pip install torch transformers python-dotenv

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
from dotenv import load_dotenv
from kaggle_secrets import UserSecretsClient
import re
import json
from transformers import pipeline

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [2]:
user_secrets = UserSecretsClient()
load_dotenv()
hf_token = user_secrets.get_secret("HUGGINGFACE_API_TOKEN")

# Check if token is loaded
if not hf_token:
    raise ValueError("❌ Hugging Face API token not found! Make sure it's set as a Kaggle secret.")


In [3]:
# Load Gemma model and tokenizer
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", token=hf_token)

print("✅ Model loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

✅ Model loaded successfully!


In [4]:
with open("/kaggle/input/extractedbook/ExtractedBook.txt", "r", encoding="utf-8") as file:
    text = file.read()

print("First 500 characters of input file:\n", text[:500])

First 500 characters of input file:
 
2            CHAPTER ONE 
Introduction
Counseling  students  can  begin  to  acquire  a  counseling  style  tailored  to  their  own  
personality  by  familiarizing  themselves  with  the  major  approaches  to  therapeu-
tic  practice.  This  book  surveys  11  approaches  to  counseling  and  psychotherapy,  
presenting the key concepts of each approach and discussing features such as the 
therapeutic  process  (including  goals),  the  client–therapist  relationship,  and  spe-
cific proced


In [5]:
# Load Gemma model for keyword generation
keyword_generator = pipeline("text-generation", model="google/gemma-2b")

# Function to remove unwanted characters
def clean_text(text):
    cleaned = re.sub(r'[^A-Za-z0-9\s\.,;:\'\"\?\!\(\)\-]', '', text)
    return cleaned

# Function to split text into sections
def split_into_sections(text):
    lines = text.splitlines()
    sections = []
    current_section = []
    for line in lines:
        if re.search(r'^\s*(?:\d+\s+)?CHAPTER\s+\w+', line, re.IGNORECASE):
            if current_section:
                sections.append("\n".join(current_section).strip())
            current_section = [line.strip()]
        else:
            current_section.append(line.strip())
    if current_section:
        sections.append("\n".join(current_section).strip())
    return sections

# Function to generate keywords using Gemma with a sliding window approach
def generate_keywords_with_sliding_window(section, window_size=512, stride=256):
    words = section.split()
    generated_keywords = set()
    
    for start in range(0, len(words), stride):
        window = " ".join(words[start:start+window_size])
        if not window:
            continue
        
        prompt = f"Extract important keywords from the following text:\n{window}\nKeywords:"
        response = keyword_generator(prompt, max_new_tokens=50, truncation=True, max_length=512, do_sample=False)[0]['generated_text']
        
        keywords = re.findall(r'\b[A-Za-z]+\b', response)  # Extract words
        generated_keywords.update(keywords)
    
    return list(generated_keywords)

# Main function to process text
def process_text(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as file:
        text = file.read()
    
    cleaned_text = clean_text(text)
    sections = split_into_sections(cleaned_text)
    
    processed_sections = []
    
    responses = keyword_generator(sections, max_new_tokens=50, truncation=True, max_length=512, do_sample=False)
    
    for sec, response in zip(sections, responses):
        found_keywords = re.findall(r'\b[A-Za-z]+\b', response[0]['generated_text'])
        processed_sections.append({
            'section_text': sec,
            'keywords_found': found_keywords
        })
    
    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(processed_sections, outfile, indent=4)
    
    print(f"Processed {len(processed_sections)} sections. Results written to '{output_file}'.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
# test_sentence = "Cognitive behavioral therapy is a widely used approach in mental health treatment."


# test_text = "Cognitive behavioral therapy is widely used in mental health treatment."
# print("Extracted Keywords:", extract_keywords(test_text))


In [7]:
# Run processing on Kaggle
process_text("/kaggle/input/extractedbook/ExtractedBook.txt", "processed_text.txt")

print("Processing complete. Download 'processed_text.txt' for results.")


Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

Processed 230 sections. Results written to 'processed_text.txt'.
Processing complete. Download 'processed_text.txt' for results.
