In [1]:
!pip install ollama langchain langgraph pydantic fastapi uvicorn pyngrok nest_asyncio langchain_community -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.5/216.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# nest_asyncio patches asyncio so that an event loop can be run while another
# one is already running (the notebook kernel itself runs inside a loop).
# NOTE(review): the server further down is started in its own thread, so this
# patch may not be strictly required -- confirm before removing it.
import nest_asyncio

nest_asyncio.apply()

In [3]:
%%bash
# Install Ollama only when it is missing, so re-running this cell is cheap.
if ! command -v ollama >/dev/null 2>&1; then
    curl -fsSL https://ollama.com/install.sh | sh
fi

if ! command -v ollama >/dev/null 2>&1; then
    echo "Ollama installation failed: 'ollama' is not on PATH." >&2
    exit 1
fi

# Start Ollama as a background process, unless a server is already answering
# on the default local address (avoids a second instance on re-run).
if ! curl -fsS http://127.0.0.1:11434/ >/dev/null 2>&1; then
    nohup ollama serve > ollama.log 2>&1 &
fi

# Wait (up to ~30 s) until the server accepts requests, so the next cell does
# not race the startup when it pulls the model.
for attempt in $(seq 1 30); do
    if curl -fsS http://127.0.0.1:11434/ >/dev/null 2>&1; then
        echo "Ollama server is ready."
        exit 0
    fi
    sleep 1
done

echo "Ollama server did not become ready in time; see ollama.log." >&2
exit 1



>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
#=#=#                                                                         ##O#-#                                                                                                                                                   0.0%                                                                           0.0%                                                                           0.1%                                                                           0.2%                                                                           0.4%                                                                           0.7%                                                                           1.0%#                                                                          1.5%#                                                                          1.9%#                                                 

In [4]:
import ollama

# Pull the llama3:8b model with the Ollama Python client.
# This will take a few minutes.
print("Pulling Llama 3 model...")
ollama.pull('llama3:8b')
print("Model pulled successfully!")

# List the locally available models to confirm the pull succeeded.
# (`ollama list` shows what has been downloaded, not what is currently running.)
print("\nAvailable models:")
!ollama list

Pulling Llama 3 model...
Model pulled successfully!

Available models:
NAME         ID              SIZE      MODIFIED               
llama3:8b    365c0bd3c000    4.7 GB    Less than a second ago    


In [5]:
%%writefile schemas.py

from pydantic import BaseModel
from typing import List

class ChatRequest(BaseModel):
    """Request model for the chat endpoint.

    Attributes:
        message: The new user message to send to the model.
        history: Earlier conversation turns, in order, as plain dicts.
            Defaults to an empty list.
    """
    message: str
    # Entries are untyped dicts: this model does not validate that "role" and
    # "content" are present, so consumers have to check the keys themselves.
    history: List[dict] = [] # e.g., [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]

class ChatResponse(BaseModel):
    """Response model for the chat endpoint.

    Attributes:
        response: The text of the model's reply.
    """
    response: str

Writing schemas.py


In [6]:
%%writefile graph.py

from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import StateGraph, END
from typing import List, TypedDict

# Define the state for our graph: the conversation so far
class GraphState(TypedDict):
    """State passed through the graph; it holds the conversation messages."""
    # Human and AI messages in conversation order. call_model returns this
    # list with the model's reply appended.
    messages: List[HumanMessage | AIMessage]

# Initialize the LLM we'll use: the llama3:8b model (the same tag the notebook
# pulls earlier) via ChatOllama, with temperature 0.3.
# This module-level instance is what call_model invokes.
llm = ChatOllama(model="llama3:8b", temperature=0.3)

# Define the function that calls the LLM
def call_model(state: GraphState):
    """Run the LLM on the conversation stored in ``state``.

    Args:
        state: Graph state whose ``messages`` entry is the conversation so far.

    Returns:
        A state update whose ``messages`` list is the incoming conversation
        followed by the model's reply.
    """
    conversation = state['messages']
    reply = llm.invoke(conversation)
    # Build a new list rather than mutating the incoming one.
    updated_conversation = conversation + [reply]
    return {"messages": updated_conversation}

# Define the graph structure: a single-node graph over GraphState
workflow = StateGraph(GraphState)

# Add the single node to our workflow: call_model, registered under the name "llm"
workflow.add_node("llm", call_model)

# Set the entrypoint and finish point of the graph:
# execution starts at "llm" and ends right after it.
workflow.set_entry_point("llm")
workflow.add_edge("llm", END)

# Compile the graph into a runnable object (this is the name main.py imports)
langgraph_app = workflow.compile()

# Printed whenever this module is executed, including on import
print("LangGraph compiled successfully!")

Writing graph.py


In [9]:
%%writefile main.py

from fastapi import FastAPI
from schemas import ChatRequest, ChatResponse
from graph import langgraph_app
from langchain_core.messages import HumanMessage, AIMessage

# Initialize the FastAPI app. The title, description and version are metadata
# passed to FastAPI; the route handlers below are registered on this object.
app = FastAPI(
    title="Local LLM Chat API",
    description="An API to interact with a local Llama 3 model using LangGraph.",
    version="1.0.0",
)

@app.get("/", tags=["Status"])
def read_root():
    """Health-check endpoint: report that the API is up."""
    status_payload = {"status": "API is running!"}
    return status_payload

@app.post("/chat", response_model=ChatResponse, tags=["Chat"])
def chat_with_llm(request: ChatRequest):
    """
    Receives a message and chat history, gets a response from the LLM,
    and returns it.

    Args:
        request: The new user message plus the earlier turns of the
            conversation as ``{"role": ..., "content": ...}`` dicts.

    Returns:
        A ChatResponse whose ``response`` is the content of the last message
        in the graph's final state (the AI's reply).

    Raises:
        HTTPException: 422 if a "user"/"assistant" history entry has no
            ``content`` (missing or null). Previously this surfaced as an
            unhandled error and an HTTP 500.
    """
    # Local import so this handler does not depend on edits to the module's
    # import block; fastapi is already a dependency of this module.
    from fastapi import HTTPException

    # Format history into LangChain message objects
    history_messages = []
    for index, item in enumerate(request.history):
        role = item.get("role")
        if role == "user":
            message_cls = HumanMessage
        elif role == "assistant":
            message_cls = AIMessage
        else:
            # Entries with any other role are ignored, as before.
            continue

        content = item.get("content")
        if content is None:
            # The schema accepts arbitrary dicts, so validate here and report
            # a client error instead of crashing on the missing key.
            raise HTTPException(
                status_code=422,
                detail=f"history[{index}] is missing the 'content' field.",
            )
        history_messages.append(message_cls(content=content))

    # Add the new user message
    current_messages = history_messages + [HumanMessage(content=request.message)]

    # Invoke the LangGraph with the current conversation
    final_state = langgraph_app.invoke({"messages": current_messages})

    # The final state contains all messages; the last one is the AI's response
    ai_response = final_state['messages'][-1].content

    return ChatResponse(response=ai_response)

Writing main.py


In [11]:
import os
import threading
from getpass import getpass

import uvicorn
from pyngrok import ngrok

# Never hardcode the ngrok authtoken in the notebook. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE: the token that used to be embedded in this cell was published with the
# notebook and should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken (leave blank to skip): ")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)

# Define the port the app will run on
port = 8000

# Open a tunnel to the port
public_url = ngrok.connect(port).public_url
print(f"🚀 FastAPI app is live at: {public_url}")

# Define a function to run the Uvicorn server
def run_app():
    """Serve main:app with Uvicorn on `port`; blocks until the server stops."""
    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=False)

# Run the app in a separate thread to avoid blocking the Colab notebook.
# daemon=True so the server thread cannot keep the interpreter alive on shutdown.
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

🚀 FastAPI app is live at: https://b01ce7d866a8.ngrok-free.app


In [13]:
import requests
import json

# Build the endpoint URL from the tunnel opened in the server cell (public_url)
# instead of pasting the ngrok hostname by hand: it changes on every run.
api_url = f"{public_url}/chat"

# (connect, read) timeouts in seconds. Generation on a local model can be slow,
# but a request must never hang forever.
REQUEST_TIMEOUT = (10, 300)

# Let's start a conversation
chat_history = []

while True:
    # Get user input
    user_message = input("You: ")
    if user_message.strip().lower() in ["exit", "quit"]:
        print("🤖 Goodbye!")
        break
    if not user_message.strip():
        # Nothing to send; ask again instead of posting a blank message.
        continue

    # Prepare the request payload
    payload = {
        "message": user_message,
        "history": chat_history
    }

    print("🤖 Thinking...")

    # Send the request to our API
    try:
        response = requests.post(api_url, json=payload, timeout=REQUEST_TIMEOUT)
        response.raise_for_status() # Raise an exception for bad status codes

        # Extract the JSON response
        api_response = response.json()
        ai_message = api_response.get("response")

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        break

    if not isinstance(ai_message, str):
        # Do not store a missing reply in the history: it would be sent back
        # to the server as a message without content on the next turn.
        print("An error occurred: the API response did not contain a 'response' string.")
        break

    print(f"🤖 AI: {ai_message}")

    # Update chat history for context in the next turn
    chat_history.append({"role": "user", "content": user_message})
    chat_history.append({"role": "assistant", "content": ai_message})

You: Hi how are you?
🤖 Thinking...
INFO:     34.138.222.71:0 - "POST /chat HTTP/1.1" 200 OK
🤖 AI: I'm just an AI, so I don't have feelings or emotions like humans do. However, I'm functioning properly and ready to help with any questions or tasks you may have! It's great that you're reaching out and starting a conversation. How can I assist you today?
You: Tell me about aliens
🤖 Thinking...
An error occurred: HTTPSConnectionPool(host='b01ce7d866a8.ngrok-free.app', port=443): Max retries exceeded with url: /chat (Caused by SSLError(SSLError(1, '[SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2590)')))
