In [1]:
%%capture
import os
# Pick an install recipe based on the runtime: the else-branch runs when the
# concatenated environment-variable names contain the substring "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" marker found: plain install, pip resolves dependencies itself.
    !pip install unsloth
else:
    # "COLAB_" marker present: these packages are installed with --no-deps so pip
    # does not touch their dependencies; xformers is pinned to 0.0.29.post3.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    # These packages are installed normally (with dependency resolution).
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

# Loader settings, kept as module-level names so later cells can reuse them.
max_seq_length = 2048
dtype = None  # no explicit dtype requested; presumably the loader picks one - confirm
load_in_4bit = True

# Download the "deeponh/revel_2" checkpoint and build the model/tokenizer pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deeponh/revel_2",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.18: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Unsloth 2025.3.18 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
import pandas as pd
import torch
import re  
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
import nltk
from nltk.tokenize import sent_tokenize
# Fetch the NLTK 'punkt' tokenizer package (network access required on first run).
nltk.download('punkt')


def clean_text(text, max_words=250):
    """Normalize a piece of text to space-separated ASCII alphanumeric words.

    Every character that is not an ASCII letter, digit, or whitespace is
    replaced by a space, the result is split on whitespace, and at most
    ``max_words`` words are kept and re-joined with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()``.
        max_words: Maximum number of words to keep. Defaults to 250; pass
            ``None`` to keep every word.

    Returns:
        The cleaned string, with no leading/trailing whitespace and exactly
        one space between words. Empty string for missing input.
    """
    # NaN is the only float that is not equal to itself, so this check needs
    # no pandas/numpy helper.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Punctuation and non-ASCII characters become spaces so adjacent words
    # stay separated instead of being glued together.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # split() already drops leading/trailing whitespace and collapses runs,
    # so the joined result needs no further stripping or normalization.
    return ' '.join(text.split()[:max_words])

def prepare_compscholar_data(df):
    """Turn each row of ``df`` into a cleaned document/summary record.

    Args:
        df: A DataFrame exposing 'Document' and 'Summary' columns.

    Returns:
        A list with one dict per row, in row order, holding the cleaned
        'Document' text under "document" and the cleaned 'Summary' text
        under "summary".
    """
    return [
        {
            "document": clean_text(record['Document']),
            "summary": clean_text(record['Summary']),
        }
        for _, record in df.iterrows()
    ]

# Read the CompScholar CSV from the working directory and clean every row.
compscholar_df = pd.read_csv("CompScholar.csv")
all_data = prepare_compscholar_data(compscholar_df)

# Dataset.from_dict takes one list per column, so regroup the per-row
# records by field name.
dataset = Dataset.from_dict(
    {
        field: [record[field] for record in all_data]
        for field in ("document", "summary")
    }
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Sanity check: display the dataset's shape.
dataset.shape

(371, 2)

In [5]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 371
})

In [7]:
# Fetch the NLTK 'punkt_tab' tokenizer package as well.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
def _within_word_limit(example):
    """Return True when the example's document has at most 500 whitespace-separated words."""
    return len(example["document"].split()) <= 500


# NOTE(review): clean_text already caps every document at 250 words, so this
# filter cannot drop any row as the pipeline stands.
dataset = dataset.filter(_within_word_limit)

Filter:   0%|          | 0/371 [00:00<?, ? examples/s]

In [9]:
def formatting_prompts_func(examples):
    """Build instruction prompts and reference summaries for a batch.

    Args:
        examples: A batch mapping with parallel "document" and "summary"
            sequences.

    Returns:
        A dict with "inputs" (one prompt string per document, consisting of
        the fixed instruction inside [INST] tags followed by the document
        text) and "targets" (the matching summaries, unchanged). If the two
        sequences differ in length, the extra items are ignored.
    """
    # NOTE(review): the instruction text is kept exactly as written (including
    # "be be"); confirm against the prompt used for fine-tuning before editing.
    instruction = "Analyze the research article content and get me a summary from the research article. The summary length has to be be within 150 words. Ensure that the summary is well-structured and provides a clear understanding of the paper's purpose and outcomes without unnecessary details."

    paired = list(zip(examples["document"], examples["summary"]))

    return {
        "inputs": [f"<s>[INST] {instruction}\n [/INST] {document}</s>" for document, _ in paired],
        "targets": [reference for _, reference in paired],
    }

# Run the prompt formatter over the dataset in batches. Despite the variable
# name, nothing is tokenized here: the mapped function returns plain strings
# under "inputs" and "targets".
tokenized_dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/371 [00:00<?, ? examples/s]

In [10]:
# Spot-check one formatted record (index 6).
tokenized_dataset[6]

{'document': 'A DATA MINING APPROACH FOR PREDICTION OF HEART DISEASE USING NEURAL NETWORKSBackpropagation Data mining Heart disease Multilayer perceptron neural network Neural Network Heart disease diagnosis is a complex task which requires much experience and knowledge Traditional way of predicting Heart disease is doctor s examination or number of medical tests such as ECG Stress Test and Heart MRI etc Nowadays Health care industry contains huge amount of heath care data which contains hidden information This hidden information is useful for making effective decisions Computer based information along with advanced Data mining techniques are used for appropriate results Neural network is widely used tool for predicting Heart disease diagnosis In this research paper a Heart Disease Prediction system HDPS is developed using Neural network The HDPS system predicts the likelihood of patient getting a Heart disease For prediction the system uses sex blood pressure cholesterol like 13 medic

In [11]:
import tqdm

In [13]:
# Template wrapped around each pre-formatted "inputs" string. The second slot
# is filled with "" so the prompt ends right after "### Response:" and the
# model writes the continuation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction and Input:
{}

### Response:
{}"""

# Generation budget. The prompt limit is derived from the model's context
# length instead of a fixed 512: truncation removes tokens from the end of the
# prompt, so a too-small limit drops the trailing "### Response:" cue.
MAX_NEW_TOKENS = 600
MAX_PROMPT_TOKENS = max_seq_length - MAX_NEW_TOKENS
GENERATION_SEED = 3407

# Put the Unsloth-patched model into inference mode before generating.
FastLanguageModel.for_inference(model)

# Sampling is stochastic; fix the seed so a re-run yields the same predictions.
torch.manual_seed(GENERATION_SEED)

predictions = []

for example in tqdm.tqdm(tokenized_dataset):
    prompt = alpaca_prompt.format(example["inputs"], "")

    # One prompt per step, so no padding is needed.
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    ).to(model.device)

    input_length = inputs.input_ids.shape[-1]

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        use_cache=True,
        # temperature / min_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=1.0,
        min_p=0.1,
    )

    # Keep only the newly generated tokens, i.e. drop the echoed prompt.
    response_only = tokenizer.batch_decode(outputs[:, input_length:], skip_special_tokens=True)[0]

    predictions.append(response_only)

# Make the cell safe to re-run: add_column fails if the column already exists.
if "predictions" in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.remove_columns("predictions")
tokenized_dataset = tokenized_dataset.add_column("predictions", predictions)


100%|██████████| 371/371 [1:43:59<00:00, 16.82s/it]


In [14]:
# Write every column, including the generated "predictions", to a CSV file in
# the working directory.
tokenized_dataset.to_csv("test_predictions.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2341832