# League of Legends Narrative Generator
### Henry Hu, Suhho Lee, Victor Wei

## 1. Data Preprocessing

In [None]:
import time
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm
import random
import re
import traceback

class LoLChampionScraper:
    def __init__(self):
        """Initialise scraper state and start a Chrome WebDriver session.

        Sets the site URLs, an empty ``champions_data`` list, the Chrome
        options and ``self.driver``.
        """
        self.base_url = "https://universe.leagueoflegends.com"
        self.champions_url = "https://universe.leagueoflegends.com/en_US/champions/"
        self.champions_data = []

        # Command-line switches handed to Chrome, applied in this order.
        browser_flags = (
            "--window-size=1920,1080",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            # Example user agent
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
        )
        self.chrome_options = Options()
        for flag in browser_flags:
            self.chrome_options.add_argument(flag)

        # webdriver_manager resolves the chromedriver binary for the Service.
        chrome_service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=chrome_service, options=self.chrome_options)
        print("WebDriver Initialized")

    def extract_champions_list(self):
        """Collect name, region and profile URL for every champion on the index page.

        Loads ``self.champions_url`` and tries several CSS selectors in turn,
        stopping at the first selector that yields at least one champion.

        Returns:
            list[dict]: ``{'name', 'region', 'url'}`` dicts, one per unique
            upper-cased champion name; empty when no selector produced results.
        """
        self.driver.get(self.champions_url)
        time.sleep(5)  # fixed pause after navigation before probing selectors
        selectors = [
            "li.item_30l8 a",
            ".champsListUl_2Lmb li a",
            "a[href*='/champion/']"
        ]
        champions = []
        seen_names = set()  # O(1) duplicate check instead of rescanning `champions`
        timeout = 10
        for selector in selectors:
            try:
                WebDriverWait(self.driver, timeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
                for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                    try:
                        url = element.get_attribute("href")
                        # Skip anchors without an href or pointing off-site.
                        if not url or not url.startswith(self.base_url):
                            continue
                        # A single lookup per heading tag; an empty list means
                        # the tag is absent, so no second query is needed.
                        name_nodes = element.find_elements(By.CSS_SELECTOR, "h1")
                        region_nodes = element.find_elements(By.CSS_SELECTOR, "h2")
                        name = (name_nodes[0].text or "").strip().upper() if name_nodes else ""
                        region = (region_nodes[0].text or "").strip() if region_nodes else ""
                        if name and name not in seen_names:
                            seen_names.add(name)
                            champions.append({'name': name, 'region': region, 'url': url})
                    except Exception as e:
                        print(f"  Warn: Error processing a champion list element: {e}")
                if champions:
                    print(f"  Successfully extracted champion list using selector: {selector}")
                    break
            except TimeoutException:
                print(f"  Selector {selector} timed out.")
            except Exception as e:
                print(f"  Selector {selector} failed with error: {e}")
        print(f"Found {len(champions)} unique champions")
        return champions

    def extract_champion_details(self, champion_data):
        """Fill ``champion_data`` with fields scraped from the champion's main page.

        Navigates to ``champion_data['url']`` and sets ``role``, ``race``,
        ``quote``, ``short_bio``, ``related_champions``, ``bio_url`` and an
        empty ``story_url`` placeholder. Every field is best-effort: a scraping
        error leaves that field empty rather than aborting the champion.

        Args:
            champion_data (dict): must contain ``name``; ``url`` is the page to load.

        Returns:
            dict: the same dict, mutated in place. It is returned without the
            extra fields when the URL is missing or navigation fails.
        """
        print(f"Extracting details for {champion_data['name']}...")
        if not champion_data.get('url'):
            print(f"  Error: Missing URL for {champion_data['name']}")
            return champion_data
        try:
            self.driver.get(champion_data['url'])
            WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
            time.sleep(1 + random.random())  # randomised pause after each page load
        except Exception as e:
            print(f"  Error navigating to champion page {champion_data['url']}: {e}")
            return champion_data

        # Role, race and quote are optional, so any scraping error maps to "".
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate so the run stays interruptible.
        try:
            role_element = WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".typeDescription_ixWu h6, .playerType_3laO h6")))
            champion_data['role'] = role_element.text.strip()
        except Exception:
            champion_data['role'] = ""
        try:
            race_elements = self.driver.find_elements(By.CSS_SELECTOR, ".ChampionRace_a_Fp h6, .race_3k58 h6")
            champion_data['race'] = race_elements[0].text.strip() if race_elements else ""
        except Exception:
            champion_data['race'] = ""
        try:
            quote_elements = self.driver.find_elements(By.CSS_SELECTOR, ".quote_2507 p, .championQuotes_3FLE p")
            champion_data['quote'] = quote_elements[0].text.strip() if quote_elements else ""
        except Exception:
            champion_data['quote'] = ""

        # Short bio: prefer the first <p>; otherwise fall back to the first
        # non-empty line of the container's own text.
        try:
            bio_elements = self.driver.find_elements(By.CSS_SELECTOR, ".biographyText_3-to p, .biography_3YIe p")
            if bio_elements and bio_elements[0].text.strip():
                champion_data['short_bio'] = bio_elements[0].text.strip()
            else:
                bio_containers = self.driver.find_elements(By.CSS_SELECTOR, ".biographyText_3-to, .biography_3YIe")
                container_text = bio_containers[0].text.strip() if bio_containers else ""
                paragraphs = [p.strip() for p in container_text.split('\n') if p.strip()]
                champion_data['short_bio'] = paragraphs[0] if paragraphs else container_text
        except Exception as e:
            print(f"  Warn: Error extracting short bio: {e}")
            champion_data['short_bio'] = ""

        # Related champions: names are read via textContent, in page order,
        # without duplicates.
        related_champions = []
        try:
            h5_locator = (By.CSS_SELECTOR, "ul.champions_jmhN li.champion_1xlO h5")
            WebDriverWait(self.driver, 3).until(EC.presence_of_element_located(h5_locator))
            related_elements = self.driver.find_elements(*h5_locator)
            for i, elem in enumerate(related_elements):
                try:
                    champion_name = self.driver.execute_script("return arguments[0].textContent;", elem).strip()
                    if champion_name and champion_name not in related_champions:
                        related_champions.append(champion_name)
                except Exception as inner_e:
                    print(f"    Warn: Error processing related champion element {i+1}: {type(inner_e).__name__} - {inner_e}")
        except Exception as e:
            print(f"  Warn: An unexpected error occurred while finding/processing related champions: {type(e).__name__} - {e}")
        champion_data['related_champions'] = related_champions
        print(f"  Assigned related champions list: {champion_data['related_champions']}")

        # Biography URL: prefer a link containing '/story/champion/', then the
        # first matched link, then a URL constructed from the champion name.
        try:
            bio_link_elements = self.driver.find_elements(By.XPATH, "//a[.//button[.//span[contains(text(), 'Read Biography') or contains(text(), 'Read Bio')]]]|//a[contains(@href,'/story/champion/')]")
            found_bio_url = ""
            if bio_link_elements:
                for link_el in bio_link_elements:
                    href = link_el.get_attribute('href')
                    if href and '/story/champion/' in href:
                        found_bio_url = href
                        break
                if not found_bio_url:
                    found_bio_url = bio_link_elements[0].get_attribute('href')
            if found_bio_url:
                champion_data['bio_url'] = found_bio_url
            else:
                clean_name = re.sub(r'[^a-z0-9]', '', champion_data['name'].lower())
                bio_url = f"{self.base_url}/en_US/story/champion/{clean_name}/"
                champion_data['bio_url'] = bio_url
                print(f"  Warn: Could not find bio button/link, constructed fallback URL: {bio_url}")
        except Exception as e:
            print(f"  Warn: Could not find or construct biography URL: {e}")
            champion_data['bio_url'] = ""
        champion_data['story_url'] = ""
        return champion_data

    def extract_page_content(self, container_selector, paragraph_selector):
        """Return the joined paragraph text found inside a container element.

        Looks up the first element matching ``container_selector`` on the
        current page, reads the ``textContent`` of each descendant matching
        ``paragraph_selector`` and joins the non-empty ones with blank lines.

        Returns:
            tuple[str, int]: the joined text and the number of paragraph
            elements matched; ``("", 0)`` when the container is missing.
        """
        joined_text, n_paragraphs = "", 0
        try:
            print(f"  Attempting to find container '{container_selector}' directly in DOM...")
            matches = self.driver.find_elements(By.CSS_SELECTOR, container_selector)
            if len(matches) == 0:
                print(f"  Error: Container '{container_selector}' not found in DOM after interaction attempt.")
                return joined_text, n_paragraphs

            holder = matches[0]
            print(f"  Container '{container_selector}' found in DOM.")

            nodes = holder.find_elements(By.CSS_SELECTOR, paragraph_selector)
            n_paragraphs = len(nodes)
            if n_paragraphs == 0:
                print(f"  Warn: Container '{container_selector}' found, but no paragraphs matched selector '{paragraph_selector}'.")
            else:
                chunks = []
                for position, node in enumerate(nodes, start=1):
                    try:
                        cleaned = self.driver.execute_script(
                            "return arguments[0].textContent;", node
                        ).strip()
                    except Exception as inner_e:
                        # A single unreadable paragraph is reported and skipped.
                        print(f"    Warn: Error processing paragraph {position}: {type(inner_e).__name__} - {inner_e}")
                        continue
                    if cleaned:
                        chunks.append(cleaned)

                joined_text = "\n\n".join(chunks)
                if not joined_text:
                    print(f"  Warn: Found {n_paragraphs} paragraphs in '{container_selector}', but all textContent was empty after processing.")

        except Exception as e:
            print(f"  Error: Exception finding/processing content within '{container_selector}': {type(e).__name__} - {e}")

        return joined_text, n_paragraphs

    def extract_bio_and_story(self, champion_data):
        """Scrape the full biography and work out the champion's story URL.

        Loads ``champion_data['bio_url']``, stores the joined biography
        paragraphs under ``full_biography`` and sets ``story_url`` to a link
        found on the page or, when no link was seen, to a constructed
        "-color-story" fallback. Returns the same dict; it returns early with
        an empty ``full_biography`` when there is no bio URL or navigation fails.
        """
        champion_data['full_biography'] = ""
        if not champion_data.get('bio_url'):
            print(f"  Info: No biography URL available for {champion_data['name']}")
            return champion_data

        print(f"Navigating to biography page for {champion_data['name']}...")
        try:
            self.driver.get(champion_data['bio_url'])
            body_locator = (By.CSS_SELECTOR, "body")
            WebDriverWait(self.driver, 15).until(EC.presence_of_element_located(body_locator))
            time.sleep(1 + random.random())
        except Exception as nav_e:
            print(f"  Error navigating to biography URL '{champion_data['bio_url']}': {nav_e}")
            return champion_data

        # Click the "Scroll to Begin" prompt (if present) and nudge the page down
        # before reading the content container.
        try:
            scroll_button = WebDriverWait(self.driver, 7).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "p.cta_VVdh"))
            )
            print("  'Scroll to Begin' button (p.cta_VVdh) is present.")
            try:
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'nearest'});", scroll_button)
                time.sleep(1.0)
                self.driver.execute_script("arguments[0].click();", scroll_button)
                print("  Attempted click 'Scroll to Begin' button via JavaScript.")
                print("  Performing small scroll down after click...")
                self.driver.execute_script("window.scrollBy(0, 150);")
                time.sleep(0.5)
            except Exception as js_click_e:
                print(f"  Warn: JavaScript click execution failed: {type(js_click_e).__name__} - {js_click_e}")
        except TimeoutException:
            print("  Info: 'Scroll to Begin' button (p.cta_VVdh) not found within timeout.")
        except Exception as scroll_e:
            print(f"  Warn: Error interacting with 'Scroll to Begin' button: {type(scroll_e).__name__} - {scroll_e}")

        container_selector = "#CatchElement"
        bio_text, _ = self.extract_page_content(container_selector, "p.p_1_sJ")
        champion_data['full_biography'] = bio_text

        if not bio_text:
            print(f"  Warn: Failed to extract biography text content from '{container_selector}'.")
        else:
            actual_paragraphs = len(bio_text.split('\n\n'))
            print(f"  Extracted biography text ({actual_paragraphs} non-empty paragraphs joined).")

        # Story link: prefer an href with '/story/' that is not a biography
        # ('/story/champion/'); otherwise take the first matched anchor.
        story_xpath = (
            "//a[.//button[.//span[contains(text(), 'story') or contains(text(), 'Story')]]]|"
            "//a[contains(@href,'/story/')][not(contains(@href, '/story/champion/'))]|"
            "//a[contains(@href,'-color-story')]"
        )
        saw_story_link = False
        try:
            candidates = self.driver.find_elements(By.XPATH, story_xpath)
            chosen_url = ""
            for candidate in candidates:
                href = candidate.get_attribute('href')
                if href and '/story/' in href and '/story/champion/' not in href:
                    chosen_url = href
                    saw_story_link = True
                    break
            if candidates and not chosen_url:
                chosen_url = candidates[0].get_attribute('href')
                saw_story_link = True

            if not chosen_url:
                print(f"  No story link found on bio page.")
                champion_data['story_url'] = ""
            else:
                champion_data['story_url'] = chosen_url
                print(f"  Found story URL on bio page: {champion_data['story_url']}")
        except Exception as e:
            print(f"  Warn: Error finding story link on bio page: {e}")
            champion_data['story_url'] = ""

        # Only when no anchor was seen at all is a URL built from the name.
        if not (saw_story_link or champion_data['story_url']):
            try:
                slug = re.sub(r'[^a-z0-9]', '', champion_data['name'].lower())
                fallback_url = f"{self.base_url}/en_US/story/{slug}-color-story/"
                print(f"  Creating fallback story URL: {fallback_url}")
                champion_data['story_url'] = fallback_url
            except Exception as fallback_e:
                print(f"  Error creating fallback story URL: {fallback_e}")
                champion_data['story_url'] = ""

        return champion_data

    def extract_story_content(self, champion_data):
        """Scrape the story text from ``champion_data['story_url']``.

        Sets ``full_story`` to the joined story paragraphs and returns the same
        dict. ``full_story`` stays empty when the URL is missing, does not start
        with ``self.base_url``, or the page cannot be loaded.
        """
        champion_data['full_story'] = ""
        story_url = champion_data.get('story_url')
        if not (story_url and story_url.startswith(self.base_url)):
            print(f"  Info: No valid story URL available for {champion_data['name']}")
            return champion_data

        print(f"Navigating to story page for {champion_data['name']}...")
        try:
            self.driver.get(champion_data['story_url'])
            body_locator = (By.CSS_SELECTOR, "body")
            WebDriverWait(self.driver, 15).until(EC.presence_of_element_located(body_locator))
            time.sleep(1 + random.random())
        except Exception as nav_e:
            print(f"  Error navigating to story URL '{champion_data['story_url']}': {nav_e}")
            return champion_data

        # Click the "Scroll to Begin" prompt (if present) and nudge the page down
        # before reading the content container.
        try:
            scroll_button = WebDriverWait(self.driver, 7).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "p.cta_VVdh"))
            )
            print("  'Scroll to Begin' button (p.cta_VVdh) is present.")
            try:
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'nearest'});", scroll_button)
                time.sleep(1.0)
                self.driver.execute_script("arguments[0].click();", scroll_button)
                print("  Attempted click 'Scroll to Begin' button via JavaScript.")
                print("  Performing small scroll down after click...")
                self.driver.execute_script("window.scrollBy(0, 150);")
                time.sleep(0.5)
            except Exception as js_click_e:
                print(f"  Warn: JavaScript click execution failed: {type(js_click_e).__name__} - {js_click_e}")
        except TimeoutException:
            print("  Info: 'Scroll to Begin' button (p.cta_VVdh) not found within timeout.")
        except Exception as scroll_e:
            print(f"  Warn: Error interacting with 'Scroll to Begin' button: {type(scroll_e).__name__} - {scroll_e}")

        container_selector = "#CatchElement"
        story_text, _ = self.extract_page_content(container_selector, "p.p_1_sJ")
        champion_data['full_story'] = story_text

        if not story_text:
            print(f"  Warn: Failed to extract story text content from '{container_selector}'.")
        else:
            actual_paragraphs = len(story_text.split('\n\n'))
            print(f"  Extracted story text ({actual_paragraphs} non-empty paragraphs joined).")

        return champion_data

    def scrape_champions(self, limit=None):
        """Scrape every champion (or the first ``limit``) and return the records.

        For each champion the main page, biography page and story page are
        visited in that order, and the accumulated list is written to
        ``data/progress_champions_data.json`` after every champion. The
        WebDriver is always quit before returning.

        Returns:
            list[dict]: records collected so far, also on interruption or
            error; an empty list when the champion index could not be read.
        """
        collected = []
        try:
            roster = self.extract_champions_list()
            if not roster:
                print("Error: Failed to extract champions list. Exiting.")
                return []
            roster = roster[:limit] if limit else roster
            self.champions_data = []

            for entry in tqdm(roster, desc="Scraping champions"):
                # Randomised delay between champions.
                time.sleep(1.5 + random.random() * 2)
                record = {'name': entry['name'], 'url': entry['url'], 'region': entry.get('region','')}
                record = self.extract_champion_details(record)
                record = self.extract_bio_and_story(record)
                record = self.extract_story_content(record)
                collected.append(record)
                self.champions_data = collected
                # Checkpoint after each champion so partial runs keep their data.
                self.save_to_json(self.champions_data, 'data/progress_champions_data.json')

            print(f"\nScraping complete. Processed {len(collected)} champions.")
        except KeyboardInterrupt:
            print("\nScraping interrupted by user.")
        except Exception as e:
            print(f"\nAn critical error occurred during scraping: {type(e).__name__} - {e}")
            traceback.print_exc()
        finally:
            print("Closing WebDriver...")
            if hasattr(self, 'driver'):
                self.driver.quit()
        return collected

    def save_to_csv(self, data_to_save, filename='data/lol_champions_data.csv'):
        """Save the collected data to a CSV file"""
        if not data_to_save:
            print("No champion data provided to save to CSV.")
            return
        try:
            df = pd.DataFrame(data_to_save)
            cols = ['name', 'region', 'role', 'race', 'quote', 'related_champions',
                    'short_bio', 'full_biography', 'full_story', 'url', 'bio_url', 'story_url']
            if 'related_champions' in df.columns:
                df['related_champions'] = df['related_champions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
            df = df.reindex(columns=[col for col in cols if col in df.columns])
            df.to_csv(filename, index=False, encoding='utf-8')
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving data to CSV {filename}: {e}")

    def save_to_json(self, data_to_save, filename='data/lol_champions_data.json'):
        """Save the collected data to a JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data_to_save, f, ensure_ascii=False, indent=4)
            if 'progress' not in filename: print(f"Data saved to {filename}")
            else: pass
        except Exception as e:
            print(f"Error saving data to JSON {filename}: {e}")

# Run the scrape, then persist whatever was collected, whether or not it finished.
scraper = LoLChampionScraper()
final_champion_data = None
try:
    final_champion_data = scraper.scrape_champions()
finally:
    if final_champion_data:
        print("\nSaving final data...")
        scraper.save_to_csv(final_champion_data, 'data/lol_champions_data.csv')
        scraper.save_to_json(final_champion_data, 'data/lol_champions_data.json')
    elif scraper.champions_data:
        # No returned data: fall back to the records accumulated on the scraper.
        print("\nScraping did not complete fully, saving data collected so far...")
        scraper.save_to_csv(scraper.champions_data, 'data/lol_champions_data.csv')
        scraper.save_to_json(scraper.champions_data, 'data/lol_champions_data.json')
    else:
        print("\nNo final data collected to save.")
    if getattr(scraper, 'driver', None):
        scraper.driver.quit()
        print("WebDriver quit confirmed from main.")

Scraping champions: 100%|██████████| 170/170 [19:43<00:00,  6.96s/champion]


## 2. Getting the Model (GPT-2)

In [1]:
import os
import shutil
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Configuration
# Model identifier passed to `from_pretrained` for the download below.
GPT2_ID    = "openai-community/gpt2-xl"
# Local directory the tokenizer and model weights are saved into.
GPT2_LOCAL = "models/gpt2-xl"

def reset_dir(path):
    """Leave ``path`` as an empty directory, deleting any existing tree first."""
    already_present = os.path.exists(path)
    if already_present:
        print(f"Removing existing directory: {path}/")
        shutil.rmtree(path)
    # Recreate the directory, including any missing parents.
    os.makedirs(path, exist_ok=True)

# Download the GPT-2 XL tokenizer and weights and store them under GPT2_LOCAL.
if __name__ == "__main__":
    # Start from an empty target directory so no stale files remain.
    reset_dir(GPT2_LOCAL)

    print(f"Downloading GPT-2 XL tokenizer for {GPT2_ID} …")
    tokenizer = AutoTokenizer.from_pretrained(GPT2_ID)
    tokenizer.save_pretrained(GPT2_LOCAL)
    print(f"✅ GPT-2 XL tokenizer saved to {GPT2_LOCAL}/\n")

    print(f"Downloading GPT-2 XL model for {GPT2_ID} …")
    model = AutoModelForCausalLM.from_pretrained(GPT2_ID)
    # Tokenizer and model share one directory so it can be loaded as a unit.
    model.save_pretrained(GPT2_LOCAL)
    print(f"✅ GPT-2 XL model saved to {GPT2_LOCAL}/\n")

    print("GPT-2 XL download complete.")

Downloading GPT-2 XL tokenizer for openai-community/gpt2-xl …


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

✅ GPT-2 XL tokenizer saved to models/gpt2-xl/

Downloading GPT-2 XL model for openai-community/gpt2-xl …


model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ GPT-2 XL model saved to models/gpt2-xl/

GPT-2 XL download complete.


## 3. Generation with the Original Model

In [18]:
import torch
import modal

In [19]:
# Load the original (not fine-tuned) model from the local directory
# that the download step saved it to.
original_model_path = "models/gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(original_model_path)
model = AutoModelForCausalLM.from_pretrained(original_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# Set the device and turn on evaluation mode
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Inference only: switch the module out of training mode.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [21]:
import modal

# Container image for the remote function: the NVIDIA PyTorch image from
# nvcr.io plus the Python packages installed on top of it.
image = (
    modal.Image.from_registry("nvcr.io/nvidia/pytorch:24.02-py3")  # CUDA 12 + PyTorch 2.2
    .pip_install(
        "transformers==4.40.1",
        "sentencepiece",         
        "accelerate",           
    )
)
# Modal app that the remote generation function is registered on.
app = modal.App("gpt2-xl")

In [22]:
@app.function(
    image=image,
    gpu="H100",
    timeout=300
)
def generate_text(prompt: str, max_length: int = 200):
    """Sample a continuation of ``prompt`` from the base GPT-2 XL model.

    Args:
        prompt: text the generation starts from.
        max_length: upper bound on the total sequence length in tokens
            (prompt included); defaults to the previous fixed value of 200.

    Returns:
        str: the decoded sequence (prompt plus continuation) with special
        tokens removed.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # load from HF hub (fast, streamed) — no need to copy local weights
    model_id = "gpt2-xl"
    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
    # GPT-2 ships without a pad token, so the EOS token doubles as padding.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    model.eval()

    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Pad and EOS share an id, so the attention mask and pad id are passed
    # explicitly instead of leaving `generate` to infer them.
    output = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [23]:
# Run the app and generate a story from a generic opening line.
with app.run():      
    story = generate_text.remote(
        "Once upon a time in the world of League of Legends,"
    )
    print(story)

Once upon a time in the world of League of Legends, the game's community thought it was awesome that one of the game's most popular champions, Jayce, had a very short history, as well as a very long, rich one.

The game's lore went from a simple one-line saying to a detailed one-line summary of Jayce's origins and development as a character over the course of a decade and a half, and it's only gotten bigger since that time. While there are a lot of "what if" questions about how Jayce's history and development would have played out, this article will explore what might have happened if Jayce was the first champion we played.

Jayce's beginnings

At the beginning of League of Legends, Jayce was a support mage. He was played by the player known only as TheOddOne, and his origins were outlined in this guide, which you can read for a lot of the background information on


In [30]:
# Second run with a more specific prompt that ends just before a champion name.
with app.run():          # the app runs while the remote call executes
    story = generate_text.remote(
        "Once upon a time in the world of League of Legends, on the land of Runeterra, there lived a champion named"
    )
    print(story)

Once upon a time in the world of League of Legends, on the land of Runeterra, there lived a champion named Zed. Zed was not an ordinary champion. Zed was a living demon, an unstoppable force of chaos and destruction. He was also a powerful, terrifying human, and he had a knack for getting along with almost everyone he met. Zed and his team were members of the League of Legends, the elite fighting force for which Runeterra had become famous. Zed was a deadly soldier, a master of the assassin's role, and a champion who had earned his place as a legend. And while Zed himself wasn't an ordinary champion, he was an ordinary assassin, and a very ordinary one at that.

The first time Zed saw Elise, she was standing by the side of the river, tending to her garden. Zed had come across her in the jungle, and he had quickly come to her aid as she was being attacked by a nearby camp. Elise, naturally


## 4. Fine-tuning the Model

In [1]:
from pathlib import Path
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

In [2]:
# Load the champion table and build one training text per champion.
data_dir = Path("data")
parquet_path = data_dir / "lol_champions_data.parquet"

df = pd.read_parquet(parquet_path)
# Layout: "<name> — <role>", then race, short bio and full story on separate
# lines. Missing values become empty strings so concatenation never yields NaN.
df["text_to_embed"] = (
    df["name"].fillna("")     + " — " +
    df["role"].fillna("")     + "\n" +
    df["race"].fillna("")     + "\n" +
    df["short_bio"].fillna("")+ "\n" +
    df["full_story"].fillna("")
)
texts = df["text_to_embed"].tolist()

In [8]:
# Tokenizer from the locally saved GPT-2 XL directory.
tokenizer = AutoTokenizer.from_pretrained(
    "models/gpt2-xl"
)
# Reuse the EOS token as the padding token so `padding=` can be used below.
tokenizer.pad_token = tokenizer.eos_token

# One dataset row per champion text.
ds = Dataset.from_dict({"text": texts})

# GPT-2's max context length is 1024
context_length = 1024

def tokenize(batch):
    """Tokenize ``batch["text"]``, truncating and padding to ``context_length``."""
    encode_options = {
        "truncation": True,
        "max_length": context_length,
        "padding": "max_length",
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize in batches and drop the raw text column afterwards.
ds_tok = ds.map(
    tokenize, batched=True, remove_columns=["text"]
)
# Expose the two encoded columns as torch tensors.
ds_tok.set_format(
    type="torch", columns=["input_ids", "attention_mask"]
)
ds_tok

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 170
})

In [25]:
# Save the tokenized dataset to disk
# (this directory is what gets uploaded to the Modal volume in the next step)
output_dir = Path("./data/tokenised_ds")
ds_tok.save_to_disk(output_dir)
print(f"Tokenized dataset saved to {output_dir}")


Saving the dataset (0/1 shards):   0%|          | 0/170 [00:00<?, ? examples/s]

Tokenized dataset saved to data/tokenised_ds


In [35]:
# Upload the locally saved tokenized dataset to the Modal volume named "data",
# creating the volume if it does not exist yet.
vol = modal.Volume.from_name("data", create_if_missing=True)

# Note the spelling change: local "tokenised_ds" is stored as "/tokenized_ds",
# which is the name the training function loads.
with vol.batch_upload() as batch:
    batch.put_directory("./data/tokenised_ds", "/tokenized_ds")

In [51]:
# Modal app, container image and volumes used by the fine-tuning function.
fine_tune_app = modal.App("gpt2-xl-ft")
ft_image = modal.Image.debian_slim().pip_install(
    "transformers", "datasets", "accelerate", 
    "bitsandbytes", "peft", "torch"
)
GPU = "H100"
# "data" holds the uploaded tokenized dataset; "gpt2_ft" receives checkpoints.
vol = modal.Volume.from_name("data")
model_vol = modal.Volume.from_name(
    "gpt2_ft", create_if_missing=True
)

@fine_tune_app.function(
    image=ft_image,
    gpu=GPU,
    timeout = 3600,
    volumes={"/data": vol, "/checkpoints": model_vol}
)
def train_model():
    """Fine-tune GPT-2 XL on the tokenized champion texts.

    Reads the dataset from ``/data/tokenized_ds``, trains a causal language
    model for three epochs and writes per-epoch checkpoints plus the final
    model and tokenizer under ``/checkpoints`` (final copy in
    ``/checkpoints/final``).
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
    from datasets import load_from_disk

    # Load tokenized dataset from volume
    ds_tok = load_from_disk("/data/tokenized_ds")

    # Get the model
    model_name = "gpt2-xl"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 has no pad token; reuse EOS so the collator can pad batches.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # With mlm=False the collator derives `labels` from `input_ids` itself,
    # so no separate label column is added to the dataset.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer      = tokenizer,
        mlm            = False,   # causal, not BERT-style MLM
        return_tensors = "pt"
    )

    args = TrainingArguments(
        # folder where training artifacts go
        output_dir = "/checkpoints",
        # micro-batch size on each GPU
        per_device_train_batch_size = 1,
        # number of micro-batches to accumulate before calling optimizer.step()
        gradient_accumulation_steps = 8,
        # number of epochs
        num_train_epochs = 3,
        # learning rate
        learning_rate = 5e-5,
        # use mixed precision (for faster training)
        bf16 = True, # H100
        # log every 50 steps
        logging_steps = 50,
        # no evaluation loop is configured; save a checkpoint after each epoch
        save_strategy = "epoch"
    )

    print(f"Training model with dataset: {ds_tok}")

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok,
        data_collator=data_collator
    )
    trainer.train()

    # Save model and tokenizer together so the directory loads as a unit.
    trainer.save_model("/checkpoints/final")
    tokenizer.save_pretrained("/checkpoints/final")
    print("Fine-tune complete.  Model saved to /checkpoints/final")

In [52]:
# Run the fine-tuning app and invoke train_model on Modal; the call's return
# value is not used.
with fine_tune_app.run():
    train_model.remote()

## 5. Fine-tuned Model Inference

In [56]:
# Handle to the "gpt2_ft" Modal volume (the one train_model mounts at /checkpoints).
final_vol = modal.Volume.from_name("gpt2_ft")

# Local directory for the fine-tuned model; parents are created as needed and
# an already existing directory is not an error.
local_dir = Path("./models/gpt2-xl-finetuned-full")
local_dir.mkdir(parents=True, exist_ok=True)

In [None]:
## Download the model to local directory
## !modal volume get gpt2_ft /final ./models/gpt2-xl-finetuned-full 

In [66]:
# Hardware and storage for fine-tuned inference.
GPU = "H100"
ft_inference_vol = modal.Volume.from_name("gpt2_ft")  # volume holding the fine-tuned model

# Container image with the inference dependencies installed via pip.
ft_inference_image = modal.Image.debian_slim().pip_install(
    "transformers==4.40.1",
    "sentencepiece",
    "accelerate",
)

ft_inference_app = modal.App("gpt2-xl-ft-inference")

@ft_inference_app.function(
    image=ft_inference_image,
    gpu=GPU,
    timeout=3600,
    volumes={"/model": ft_inference_vol}
)
def ft_inference(prompt, max_length=200):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text the generated story should start from.
        max_length: Upper bound passed to ``generate`` as ``max_length``
            (in transformers this limit counts prompt tokens plus newly
            generated tokens).

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "/model/final"

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, padding_side="left", local_files_only=True)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    model.eval()

    print(f"Model loaded from {model_path}")

    # Keep the attention mask alongside the ids: pad and EOS share one token,
    # so generate() cannot infer the mask from the ids alone.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # Honour the caller's limit (previously hard-coded to 200, which
        # silently ignored the max_length argument).
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

/var/folders/dp/j76v96ld7kgfdxz859wprxkw0000gn/T/ipykernel_8647/864885668.py:16: DeprecationError: 2025-02-03: Modal will stop implicitly adding local Python modules to the Image ("automounting") in a future update. The following modules need to be explicitly added for future compatibility:
* _remote_module_non_scriptable

e.g.:
image_with_source = my_image.add_local_python_source("_remote_module_non_scriptable")

For more information, see https://modal.com/docs/guide/modal-1-0-migration
  def ft_inference(prompt, max_length=200):


In [67]:
# Generate one story from a fixed opening line with the fine-tuned model on Modal.
prompt = "Once upon a time in the world of League of Legends,"
with ft_inference_app.run():
    story = ft_inference.remote(prompt)
    print(story)

Once upon a time in the world of League of Legends, the legendary jungler Xerath was the greatest of the Ascended. An immortal god with unfathomable power and an insatiable hunger for the flesh of others, Xerath rose against the Demacian empire with the goal of killing its most revered and beloved figure, the king Jarvan III. After all the deaths he had caused, and all the suffering he had caused, the thought of killing Jarvan had nearly consumed Xerath. But the Ascended had one final test to endure before he could finish off the hated king, and the final challenge was to slay Jarvan himself...
It was just after seven, and the streets were still bustling with people. People going about their daily routines, meeting friends and family, and making the most of the last few days of summer before the weather turned.

A tall, elegant woman approached a young boy standing in the middle of the crowd. He wore a white dress shirt
