In [1]:
!ls Articles.csv

Articles.csv


In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
def cleaning(s):
    s = str(s)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

In [4]:
df = pd.read_csv("Articles.csv", encoding="ISO-8859-1") 
df = df.dropna()

In [6]:
text_data = open('Articles.txt', 'w')
for idx, item in df.iterrows():
    article = cleaning(item["Article"])
    text_data.write(article)
text_data.close()

### Training the model

In [9]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

2025-03-10 01:18:06.238195: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741583887.127503   12414 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741583887.301155   12414 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 01:18:08.909108: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [12]:
# Parameters 
train_file_path = "./Articles.txt"
model_name = 'gpt2'
output_dir = './result'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [13]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)


  x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.6603
1000,3.3909
1500,3.1463
2000,3.097
2500,2.9591
3000,2.9298
3500,2.8452
4000,2.8199
4500,2.7666
5000,2.7624


### Inference

In [14]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [15]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "./result"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))


In [17]:
sequence = "oil price"
max_len =  500
generate_text(sequence, max_len) # oil price for July June which had been low at as low as was originally stated Prices have since resumed

oil price rebound was a key driver in the mmodity market," said John Atkinson, chief investment officer at CMC Markets in Sydney."We still see oil peaking at around and a barrel is still very much on the downside, albeit with a very steep downside," he added.The dollar, which had settled at . yen in New York afternoon trade, rose . percent to . yen in early Asian trade."It looks likely a stronger greenback may provide some support to the dollar, but with oil ntinuing to rise, it uld potentially ntinue to weaken," Atkinson added.strong>KARACHI: Pakistan stocks closed Thursday on more cautious steps as the global financial crisis roiled the untry, with Pakistani stock markets revering after the untry´s previous plunge.</strongInternational benchmark Brent futures LCOc were trading at . per barrel at GMT in early Asian trade, up cents from their last close. The dollar edged higher against the greenback, falling on the strength of strong Chinese manufacturing data and worries about the Uni