In [None]:
!pip install transformers datasets pandas




In [None]:
import pandas as pd

# Read the training CSV from the Colab working directory into a DataFrame
data = pd.read_csv("/content/train.csv")

# The 'Name' column is the free-text field used as the corpus
texts = data.loc[:, "Name"]

# Sanity check: print the first five entries of the selected column
print(texts.head(5))


0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object


In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the "gpt2" checkpoint name.
# `preprocess_text` below reads this object as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def preprocess_text(text):
    """Split one text value into sub-word tokens using the global ``tokenizer``.

    Args:
        text: The value to tokenize. Normally a ``str``. Missing values
            (``None`` or a float NaN, which is what an empty CSV cell becomes
            in a pandas column) and other non-string scalars are tolerated
            instead of raising inside the tokenizer.

    Returns:
        list: The tokens returned by ``tokenizer.tokenize``; an empty list
        when ``text`` is a missing value.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return []
        # Any other scalar (e.g. a number) is tokenized via its string form.
        text = str(text)
    return tokenizer.tokenize(text)

# Tokenize every entry of `texts`, keeping one token list per entry (same order)
processed_texts = [preprocess_text(entry) for entry in texts]


In [None]:
from transformers import GPT2LMHeadModel

# Load the pretrained language-modeling model from the "gpt2" checkpoint,
# the same checkpoint name the tokenizer was loaded from.
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [None]:
!pip install transformers[torch] accelerate -U




In [None]:
from google.colab import files
# Open Colab's file-upload dialog and keep its return value in `uploaded`.
# NOTE(review): presumably a mapping of uploaded filename -> file contents; confirm.
# NOTE(review): the recorded output shows the file was saved as "train (1).csv",
# which is not the "/content/train.csv" path read earlier in this notebook.
uploaded = files.upload()


Saving train.csv to train (1).csv


In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U datasets pandas

# Data Preparation
import pandas as pd

# Read the World Cup CSV from the Colab working directory into a DataFrame
data = pd.read_csv("/content/ICC Mens T20 Worldcup.csv")

# Print the available column names so the text column can be picked by name
print(data.columns)

# Use the 'Best Bowler' column as the text corpus for fine-tuning
texts = data.loc[:, "Best Bowler"]

# Pre-processing of Data
from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the "gpt2" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token, so the
# `padding='max_length'` encoding further down has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

def preprocess_text(text):
    """Split one text value into sub-word tokens using the global ``tokenizer``.

    Args:
        text: The value to tokenize. Normally a ``str``. Missing values
            (``None`` or a float NaN, which is what an empty CSV cell becomes
            in a pandas column) and other non-string scalars are tolerated
            instead of raising inside the tokenizer.

    Returns:
        list: The tokens returned by ``tokenizer.tokenize``; an empty list
        when ``text`` is a missing value.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return []
        # Any other scalar (e.g. a number) is tokenized via its string form.
        text = str(text)
    return tokenizer.tokenize(text)

# Tokenize every entry of `texts`, keeping one token list per entry (same order)
processed_texts = [preprocess_text(entry) for entry in texts]

# Selecting a Model Architecture and Loading the Model
from transformers import GPT2LMHeadModel

# Load the pretrained language-modeling model from the "gpt2" checkpoint (the
# same checkpoint name as the tokenizer). This is the object the Trainer
# fine-tunes and the text-generation pipeline later wraps.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Creating a Pipeline
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Convert the processed texts to a Hugging Face Dataset.
# `processed_texts` holds sub-word token pieces, not words. Joining them with
# spaces would insert spaces inside words and leave the tokenizer's internal
# space markers in the text, so the model would be fine-tuned on corrupted
# strings. `convert_tokens_to_string` rebuilds the original text instead.
# Entries that produced no tokens (empty text) are skipped, since they would
# only become padding-only training rows.
dataset = Dataset.from_dict({
    "text": [
        tokenizer.convert_tokens_to_string(tokens)
        for tokens in processed_texts
        if tokens
    ]
})

# Create a TextDataset
def create_text_dataset(dataset, tokenizer, block_size=128):
    """Encode the ``'text'`` field of ``dataset`` into fixed-length inputs.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; each batch
            passed to the function must provide a ``'text'`` entry.
        tokenizer: Callable applied to the batch's ``'text'`` entry.
        block_size: Length every example is truncated or padded to.

    Returns:
        Whatever ``dataset.map`` returns for the encoding function.
    """
    # Every example is cut or padded to exactly `block_size` tokens.
    encode_options = dict(truncation=True, padding='max_length', max_length=block_size)

    def _encode_batch(batch):
        return tokenizer(batch['text'], **encode_options)

    return dataset.map(_encode_batch, batched=True)

# Encode the text dataset into fixed-length model inputs
text_dataset = create_text_dataset(dataset, tokenizer)

# Data collator for language modeling (mlm=False: no masked-language-model objective)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    # The dataset is tiny (the recorded run mapped 55 examples, i.e. about 14
    # steps at batch size 4). Without an explicit interval the loss table
    # printed empty, so log every few steps to make training progress visible.
    logging_steps=5,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer: ties together the model, the arguments, the collator and the data
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=text_dataset,
)

# Train the model
trainer.train()

# Generating Text with the Trained Model
from transformers import pipeline

# Build the generation pipeline on the device the fine-tuned model already
# lives on. Without an explicit `device`, the recorded run warned that the
# model would be placed on CPU even though an accelerator was available.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=model.device)

# Generate text. `truncation=True` makes the prompt truncation explicit; the
# recorded run warned that `max_length` was given without it.
generated_text = generator("Best bowler", max_length=50, num_return_sequences=1, truncation=True)
print(generated_text)

Index(['Match No.', 'Date', 'Venue', '1st Team', '2nd Team', 'Stage',
       'Toss Winning', 'Toss Decision', 'First Innings Score',
       'Fall of wickets First Innings', 'Second Innings Score',
       'Fall of wickets Second Innings', 'Winners', 'Method', 'Won by',
       'Winning Margin', 'Top Scorer', 'Highest Score', 'Best Bowler',
       'Best Bowler Figure(Wickets Taken)',
       'Best Bowler Figure(Runs Recieved)', 'Player Of The Match'],
      dtype='object')


Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Step,Training Loss


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Best bowler, if I should. Please not be."\n\nHe was just like those old people. If any human\'s heart is not like these.\n\nAs he said,\n\nThose who don\'t have heart, have broken.'}]
