In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.8.5-py3-none-any.whl.metadata (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.8.4 (from unsloth)
  Downloading unsloth_zoo-2025.8.4-py3-none-any.whl.metadata (9.4 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.28-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Do

In [3]:
from unsloth import FastLanguageModel
import torch

# Loading options for FastLanguageModel.from_pretrained.
# Maximum sequence length in tokens (RoPE scaling is handled by the loader).
max_seq_length = 2048
# None asks the loader to auto-detect the dtype (float16 on Tesla T4/V100, bfloat16 on Ampere+).
dtype = None
# Load the weights with 4-bit quantization to reduce memory usage; may be set to False.
load_in_4bit = True

# Read the model and its tokenizer from the checkpoint directory on the mounted Drive.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/turkishLLMCompetition/LastModel",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Put the model into Unsloth's faster inference mode.
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.5: Fast Gemma2 patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/7.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/7.86G [00:00<?, ?B/s]

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# Install Gradio, which the web-interface cells of this notebook import as `gr`.
%pip install gradio

## Define prediction function

### Subtask:
Create a Python function that takes user input, formats it as a prompt, uses the model to generate a response, and returns the response.


**Reasoning**:
Define a Python function to handle user input, format it as a prompt, generate a response using the trained model, and return the extracted response.



In [None]:
from unsloth import FastLanguageModel
import torch
import gradio as gr

# Settings passed to the model loader.
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Tokenized inputs are moved to the GPU when one is available, otherwise to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Read the model and its tokenizer from the checkpoint directory on the mounted Drive.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/turkishLLMCompetition/LastModel",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch to Unsloth's faster inference mode; done once here, right after loading.
FastLanguageModel.for_inference(model)

# Alpaca-style prompt template. The first {} receives the instruction and the
# second {} the user input; the text ends right after the "### Response:" header.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context.\n"
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
)

In [None]:


import traceback

def predict(message, history=None):
    """Generate the model's reply to a single user message.

    The message is placed in the ``### Input:`` slot of ``alpaca_prompt`` (the
    instruction slot is filled with a single space), the model generates up to
    256 new tokens, and only the newly generated text is returned.

    Args:
        message: The user's question as plain text.
        history: Earlier chat turns as passed in by ``gr.ChatInterface``. It is
            accepted for signature compatibility but is neither read nor
            modified; it is optional so that single-input callers such as
            ``gr.Interface`` can call ``predict(message)``.

    Returns:
        str: The generated reply with surrounding whitespace stripped. A plain
        string is returned because both ``gr.ChatInterface`` and a
        single-output ``gr.Interface`` expect exactly one response value.
    """
    # The template has exactly two placeholders: instruction and input.
    formatted_input = alpaca_prompt.format(" ", message)
    inputs = tokenizer([formatted_input], return_tensors="pt").to(device)

    output_tokens = model.generate(
        **inputs,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens. Dropping them by
    # length (instead of searching the decoded text for "### Response:") keeps
    # the extraction correct even when the user's message contains that marker.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output_tokens[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()



# Build the chat UI around `predict` and serve it through a public share link.
chat_textbox = gr.Textbox(placeholder="Sorunuzu yazın...")
chat_app = gr.ChatInterface(
    fn=predict,
    title="🩺 Meditron Medical Chat",
    textbox=chat_textbox,
)
chat_app.launch(share=True)


## Create gradio interface

### Subtask:
Use the Gradio library to build a simple web interface that takes text input and displays the model's output using the prediction function.


**Reasoning**:
Create a Gradio interface using the defined predict function.



In [8]:
# Alpaca-style prompt template: two {} slots (instruction, then input), ending
# right after the "### Response:" header so the model continues from there.
alpaca_prompt = "\n".join(
    [
        "Below is an instruction that describes a task, paired with an input that provides further context.",
        "Write a response that appropriately completes the request.",
        "",
        "### Instruction:",
        "{}",
        "",
        "### Input:",
        "{}",
        "",
        "### Response:",
        "",  # yields the trailing newline after the response header
    ]
)


In [None]:
# Stream the model's answer to one sample question (uses the notebook's alpaca_prompt).
FastLanguageModel.for_inference(model)  # put the model in Unsloth's faster inference mode

instruction = "kadin-hastaliklari-ve-dogum"
question = "merhaba doktor bey, eşimle bebek sahibi olmak istiyoruz 2 aydır korunmuyoruz.bu arada vajinal kuruluğum olduğu için bebe yağı kullanıyorduk sanırım spermleri öldürüomuş bebe yağı pressed adında bi jeli gördüm netten önerir misiniz? ya da bu konuda ne önerir siniz? teşekkürler"
# The trailing "" is the (blank) output argument; str.format ignores it when the
# template has no third placeholder.
prompt = alpaca_prompt.format(instruction, question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

from transformers import TextStreamer
# The streamer prints the text to the cell output while generation is running.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)

In [None]:
# Stream the model's answer to one sample question (uses the notebook's alpaca_prompt).
FastLanguageModel.for_inference(model)  # put the model in Unsloth's faster inference mode

instruction = " "
question = "mrb hocam ben panik atak hastasıyım hep vücudumu dinlemekle meşgulum midemde yanma geğirti var birde mide ve karın bölgemde kalp atışı gibi bişey var dışarıdan belli oluyor atışı özlellikle heyacanlandığımda sinirlendiğimde daha hızlı atıyor ne olabilir bu bir hastalık belirtisimi yoksa stresten mi oluyor ne yapacağımı bilmiyorum lütfen bana yardımcı olun"
# The trailing "" is the (blank) output argument; str.format ignores it when the
# template has no third placeholder.
prompt = alpaca_prompt.format(instruction, question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

from transformers import TextStreamer
# The streamer prints the text to the cell output while generation is running.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

In [None]:
# Stream the model's answer to one sample question (uses the notebook's alpaca_prompt).
FastLanguageModel.for_inference(model)  # put the model in Unsloth's faster inference mode

instruction = " "
question = "Ayşe Hocam merhaba, Şuan 23 aylık olan kızıma yaklaşık bir yıl önce epilepsi tedavisi uygulanmaya başlandı. Luminetten adlı ilacı kullanıyoruz. 14 aylıkken yeniden geçirdi ve size danışarak ilaç arttırımına gittik. dirençli bir enfeksiyon döneminde son nöbetinden 9 ay sonra yeniden nöbet geçirmeye başladı. Nöbetler uyku esnasında oluyor ve en uzunu 10 sn sürüyor. Kilosu10.5 şuan sabah 1.5 akşam 1.5 luminetten kullanıyoruz. Görmeniz gerekir mi? Kanındaki fenobarbital verildikten 4saat sonra 31."
# The trailing "" is the (blank) output argument; str.format ignores it when the
# template has no third placeholder.
prompt = alpaca_prompt.format(instruction, question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

from transformers import TextStreamer
# The streamer prints the text to the cell output while generation is running.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

In [15]:
import gradio as gr
# Chat UI around `predict`, with a 500-pixel-high chat panel, served via a public share link.
chat_panel = gr.Chatbot(height=500)
question_box = gr.Textbox(placeholder="Sorunuzu yazın...")
chat_demo = gr.ChatInterface(
    fn=predict,
    title="🩺 Meditron Medical Chat",
    chatbot=chat_panel,
    textbox=question_box,
)
chat_demo.launch(share=True)

  chatbot=gr.Chatbot(height=500),


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://57f6ebb1b8449f7fc8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [13]:
import gradio as gr

# Single-turn form: one text box for the query and one for the model's answer.
query_box = gr.Textbox(label="Enter your query")
response_box = gr.Textbox(label="Model Response")
iface = gr.Interface(
    fn=predict,
    inputs=query_box,
    outputs=response_box,
    title="Medical QA Model",
)

iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c0dbd8d06d667587d1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Launch gradio interface

### Subtask:
Run the Gradio application to make the interface available.


**Reasoning**:
Launch the Gradio interface with the share parameter set to True to make it publicly accessible.



In [7]:
# Launch the `iface` Gradio app and request a public share URL.
iface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ad033c8bcbdb4b1b42.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Summary:

### Data Analysis Key Findings

*   The `gradio` library was already installed in the environment, so no additional setup was needed before building the web interface.
*   A Python function `predict` was successfully defined to handle user input, format it for the model, generate a response, and extract the relevant response text.
*   A Gradio interface was successfully created, linking the `predict` function to text input and output fields, and assigned the title "Medical QA Model".
*   The Gradio interface was successfully launched with `share=True`, making it publicly accessible via a generated URL.

### Insights or Next Steps

*   The current setup provides a functional, temporary interface for the model. For permanent hosting, consider deploying the application to platforms like Hugging Face Spaces as suggested in the launch output.
*   Further refinement of the `predict` function could include error handling for unexpected model outputs or edge cases in extracting the response.
