In [None]:
!pip install litellm

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
judge_prompt="""
Please act as an impartial judge and evaluate the quality of the response provided by two AI assistants to the input prompt. The responses should reflect
knowledge of KNOWLEDGE SOURCE demonstrating specific and knowledgeable
insights from CONTEXT about the query. Avoid positional Biasness. Just declare
which response is better and provide one statement why. User's Query: {prompt}
Assistant A Response: {base_model_response} 
Assistant B Response: {finetuned_model_response} 

You should choose the assistant that produces a better generation. Avoid positional biases and ensure that the order in which the
responses were presented does not influence your decision. Be as objective as possible. After providing your explanation, output your final verdict strictly following
this format: [[A]] if assistant A is better, [[B]] if assistant B is better, and [[C]] for a tie.
"""

In [5]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv('GEMINI_API_KEY')
print(api_key)

None


In [None]:
from litellm import completion

# Use thinking_level for Gemini 3 models
resp = completion(
    model="gemini/gemini-3-pro-preview",
    messages=[{"role": "user", "content": "Solve this complex math problem step by step."}],
    reasoning_effort="high",  # Options: "low" or "high"
)

# Low thinking level for faster, simpler tasks
resp = completion(
    model="gemini/gemini-3-pro-preview",
    messages=[{"role": "user", "content": "What is the weather today?"}],
    reasoning_effort="low",  # Minimizes latency and cost
)




In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.3-70B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "YOUR_HF_TOKEN", # HF Token for gated models
)

## Connect to Google Colab with T4 GPU

This section sets up a connection to Google Colab with T4 GPU access.

In [None]:
# Step 1: Install required dependencies
import subprocess
import sys

def install_package(package):
    """Install a package if not already installed"""
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

# Install required packages
packages = [
    "pyngrok",
    "google-colab",
    "torch",
    "torchvision",
]

print("Installing required packages...")
for package in packages:
    try:
        install_package(package)
        print(f"✓ {package} installed")
    except Exception as e:
        print(f"✗ {package} failed: {e}")

print("\nPackages installation complete!")

In [None]:
# Step 2: Setup ngrok authentication
from pyngrok import ngrok
import getpass

print("=" * 60)
print("NGROK SETUP")
print("=" * 60)
print("\n1. Create a free account at https://ngrok.com")
print("2. Get your authentication token from: https://dashboard.ngrok.com/get-started/your-authtoken")
print("\nPaste your ngrok auth token below:")

ngrok_auth_token = getpass.getpass("ngrok auth token: ")

if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
    print("✓ ngrok authentication configured")
else:
    print("✗ ngrok token not provided")

In [None]:
# Step 3: Check GPU availability
import torch

print("=" * 60)
print("GPU AVAILABILITY")
print("=" * 60)

gpu_available = torch.cuda.is_available()
print(f"\nGPU Available: {gpu_available}")

if gpu_available:
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("\n⚠ No GPU detected. Please run this in Google Colab with GPU enabled:")
    print("  - Go to Runtime → Change runtime type → Hardware accelerator → GPU (T4)")
    print("  - This script is designed to be executed in Colab with T4 GPU")

In [None]:
# Step 4: Start Jupyter server and create ngrok tunnel
import os
import json
from datetime import datetime

print("=" * 60)
print("COLAB JUPYTER CONNECTION SETUP")
print("=" * 60)

# Get Jupyter token (if running in Colab, generate one)
JUPYTER_TOKEN = "colab_session_" + datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"\nJupyter Token: {JUPYTER_TOKEN}")
print("\nStarting ngrok tunnel...")

try:
    # Create tunnel to localhost:8888 (default Jupyter port)
    public_url = ngrok.connect(8888, "http")
    print(f"✓ Tunnel created successfully!")
    print(f"\nPublic URL: {public_url}")
    
    # Extract ngrok URL
    tunnel_url = public_url.public_url
    
    print(f"\n" + "=" * 60)
    print("CONNECTION DETAILS")
    print("=" * 60)
    print(f"\nAccess URL: {tunnel_url}/?token={JUPYTER_TOKEN}")
    print(f"\nUse this link to connect to your Colab notebook from your local machine")
    print(f"Replace 'localhost:8888' with this public URL in your local browser")
    
    # Save connection info for reference
    connection_info = {
        "public_url": tunnel_url,
        "token": JUPYTER_TOKEN,
        "timestamp": datetime.now().isoformat(),
        "full_url": f"{tunnel_url}/?token={JUPYTER_TOKEN}"
    }
    
    print(f"\n" + json.dumps(connection_info, indent=2))
    
except Exception as e:
    print(f"✗ Error creating tunnel: {e}")
    print("Make sure you've configured ngrok auth token in the previous cell")

## How to Run This Script

### Prerequisites
1. **Google Colab Account** - Free GPU access at https://colab.research.google.com
2. **ngrok Account** - Free tunnel service at https://ngrok.com
3. **ngrok Auth Token** - Get it from https://dashboard.ngrok.com/get-started/your-authtoken

### Steps
1. Copy this code to a Google Colab notebook
2. Run cells in order: Install → ngrok Auth → GPU Check → Connect
3. Once tunnel is created, you'll get a public URL
4. Open that URL in your browser to access Jupyter with T4 GPU

### Alternative: Direct Colab Approach
If you just want to use Colab's T4 GPU directly:
- Go to https://colab.research.google.com
- New notebook → Runtime → Change runtime type → GPU (T4)
- Upload your notebook or write code directly
- No tunnel needed!

### Notes
- Keep the tunnel running while using the connection
- T4 GPU has 16GB VRAM (good for most LLMs)
- Colab sessions timeout after 12 hours of inactivity
- Free tier has usage limits

In [None]:
# Quick Utility: Test Connection Status
import socketserver
import threading

def test_gpu_and_connection():
    """Test GPU access and connection status"""
    
    print("=" * 60)
    print("CONNECTION STATUS CHECK")
    print("=" * 60)
    
    # Check GPU
    try:
        import torch
        gpu_status = "✓ GPU Available" if torch.cuda.is_available() else "✗ No GPU"
        print(f"\nGPU Status: {gpu_status}")
        if torch.cuda.is_available():
            print(f"Device: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"GPU Check Error: {e}")
    
    # Check ngrok
    try:
        from pyngrok import ngrok
        tunnels = ngrok.get_tunnels()
        print(f"\nActive Tunnels: {len(tunnels)}")
        for tunnel in tunnels:
            print(f"  - {tunnel.public_url} → {tunnel.config.addr}")
    except Exception as e:
        print(f"Tunnel Check: No active tunnel (run Step 4 first)")
    
    print("\n" + "=" * 60)

# Run the test
test_gpu_and_connection()