In [1]:
!pip install accelerate==0.27.2

Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m225.3/280.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate==0.27.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate==0.27.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import requests
from bs4 import BeautifulSoup
import tempfile

def scrape_data(links):
    data = ""
    for link in links:
        response = requests.get(link)
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        for paragraph in paragraphs:
            data += paragraph.text + "\n"
    return data

def prepare_dataset(data):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
        temp_file.write(data)
        temp_file_path = temp_file.name

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=temp_file_path,
        block_size=128,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    return dataset, data_collator

def train_model(dataset, data_collator):
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()
    return model

def generate_summary(model, team_name):
    generator = pipeline("text-generation", model=model, tokenizer="gpt2")
    prompt = f"Provide a detailed summary of {team_name}:"
    summary = generator(prompt, max_length=1000, num_return_sequences=1)[0]["generated_text"]
    return summary

In [2]:
links = ["https://en.wikipedia.org/wiki/Arsenal_F.C.",
    "https://www.footballhistory.org/club/arsenal.html",
    "https://www.arsenalinsider.com/club/club-history/",
    "https://www.britannica.com/topic/Arsenal-English-football-club",
    "https://www.bbc.com/storyworks/top-teams-uncovered/arsenal",
    "https://www.arsenalfcyears.com/",
    "https://www.soccermaniak.com/arsenal-history.html"]

In [8]:
links = ["https://en.wikipedia.org/wiki/Manchester_United_F.C.",
    "https://www.footballhistory.org/club/manchester-united.html",
    "https://americanreddevils.com/the-history-of-manchester-united-football-club/",
    "https://theforkball.com/manchester-united-history-a-journey-through-time-and-trophies/",
    "https://spartacus-educational.com/FmanchesterU.htm",
    "https://www.reeditionmagazine.com/to-the-minute/the-story-of-the-man-utd-football-club",
    "https://www.zippia.com/manchester-united-careers-1573651/history/"]

In [9]:
scraped_data = scrape_data(links)
dataset, data_collator = prepare_dataset(scraped_data)
model = train_model(dataset, data_collator)

team_name = "Manchester United"
summary = generate_summary(model, team_name)

output_file = "summary.txt"
with open(output_file, "w") as file:
    file.write(summary)

print(f"Summary saved to {output_file}")

Token indices sequence length is longer than the specified maximum sequence length for this model (23000 > 1024). Running this sequence through the model will result in indexing errors


Step,Training Loss


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Summary saved to summary.txt


In [10]:
!cat summary.txt

Provide a detailed summary of Manchester United: General Statistics from 2002-03

The following table displays the percentage of the Manchester United team that were members of a consortium that received an annual compensation each season from the Football Association. The average cost of a team was reported to the club each year the following year.[34][35][36][37] This includes clubs that have been formed into five different clubs.[38]

Manchester United has had its share of financial misfortunes, most notably the loss of its current home ground with former manager Sir Alex Ferguson.

Football Club

The FA Community Shield

Premier League

Premier League

First Division

Newcastle Stadium
Manchester United has won the FA Community Shield in 14 times and claimed the crown in 12 different trophies: UEFA Super Cup, UEFA Europa League and Intercontinental Champions League. Following the 2010/11 season, the club has always taken part in the United Supporters' Trust of the English Premier L

In [11]:
import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")

messages = [
    {
        "role": "system",
        "content": "You are a chatbot answering football queries in detail",
    },
    {"role": "user", "content": "Tell me about Arsenal's achievements from 2000 to 2024"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

<|system|>
You are a chatbot answering football queries in detail</s>
<|user|>
Tell me about Arsenal's achievements from 2000 to 2024</s>
<|assistant|>
Arsenal, a British football club based in London, has achieved many achievements during its history. Here are some notable ones:

2000-2005:
- The club won the Premier League in 2002, 2003, 2004, and 2005.
- They also won the FA Cup in 2002, 2004, and 2005.
- The club won the Champions League in 2005, which was their first major trophy since 1988.

2006-2019:
- The club won the Premier League again in 2007, 2008, and 2010.
- They won the FA Cup in 2009, 2010, and 2011.
- The club won the Champions League in 2014, which was their third major trophy since 2005.

2020-present:
- The club won the Premier League in 2020.
- They also won
