## Data Analysis

In [5]:
import pandas as pd
import re

In [6]:
df = pd.read_csv('wikipedia_data10K.csv')
print(df.head())

                                              Title  \
0  Phoenix Wright: Ace Attorney – Spirit of Justice   
1                    Ammonium sulfate precipitation   
2                            Kinki (disambiguation)   
3                     Heartbeat (British TV series)   
4                                             Uolba   

                                                Text  
0  2016 video game 2016 video game Phoenix Wright...  
1  Ammonium sulfate precipitation is one of the m...  
2  Kinki may refer to: Kansai region , Japan; als...  
3  British television drama series (1992–2010) Th...  
4  Selo in Sakha Republic, Russia Uolba Уолба Sel...  


The title and text were both initially stored as strings

In [7]:
print('Type of title: ')
print(type(df['Title'][0]))
print('Type of text: ')
print(type(df['Text'][0]))

Type of title: 
<class 'str'>
Type of text: 
<class 'str'>


Collection algorithm was supposed to collect first 1000 words in each article. The reason the average number of words in an article may be lower is because many articles may have less than 1000 words in an article.

In [8]:
title_word_sum = 0
text_word_sum = 0

for index, row in df.iterrows():
    title_word_sum += len(row['Title'].split())
    text_word_sum += len(row['Text'].split())

title_avg_word_length = title_word_sum / df.shape[0]
text_avg_word_length = text_word_sum / df.shape[0]

print(f'The average number of words in each article title is: {title_avg_word_length}')
print(f'The average number of words in each article is: {text_avg_word_length}')


The average number of words in each article title is: 2.8973
The average number of words in each article is: 599.838


In [9]:
title_avg_char_length = df['Title'].str.len().sum() / df.shape[0]
text_avg_char_length = df['Text'].str.len().sum() / df.shape[0]

print(f'The average title length is: {title_avg_char_length} characters')
print(f'The average text length is: {text_avg_char_length} characters')

The average title length is: 20.0032 characters
The average text length is: 3676.5312 characters


In [10]:
unique_title_words = set()
unique_text_words = set()
unique_title_chars = set()
unique_text_chars = set()

for index, row in df.iterrows():
    for c in row['Title']:
        if c not in unique_title_chars:
            unique_title_chars.add(c)
    for word in row['Title'].split():
        if word not in unique_title_words:
            unique_title_words.add(word)
    for c in row['Text']:
        if c not in unique_text_chars:
            unique_text_chars.add(c)
    for word in row['Text'].split():
        if word not in unique_text_words:
            unique_text_words.add(word)

print(f'The number of unique characters in article titles is: {len(unique_title_chars)}')
print(f'The number of unique words in article titles is: {len(unique_title_words)}')
print(f'The number of unique characters in the article text is: {len(unique_text_chars)}')
print(f'The number of unique words in the article text is: {len(unique_text_words)}')


The number of unique characters in article titles is: 181
The number of unique words in article titles is: 15843
The number of unique characters in the article text is: 4526
The number of unique words in the article text is: 599970


In [11]:
!pip install datasets
!pip install transformers !pip install transformers [torch]
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
[31mERROR: Invalid requirement: '!pip'[0m[31m
[0mCollecting accelerate
  Downloading 

In [12]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2Tokenizer, GPT2LMHeadModel, pipeline, AutoTokenizer
import datasets
from datasets import load_dataset, list_datasets
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [14]:
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)
# Reduce the size of the datasets to 3000 samples each
train_df = train_df.sample(n=1000, random_state=42)
val_df = val_df.sample(n=200, random_state=42)

# Create the AutoTokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the function to encode your data
def encode(batch):
    return tokenizer([x.strip('\n\r') for x in batch['Text']], truncation=True, padding=True)

# Load and preprocess the dataset
dataset = Dataset.from_pandas(train_df)
processed_dataset = dataset.map(encode, batched=True, batch_size=len(dataset))
processed_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

val_dataset = Dataset.from_pandas(val_df)
processed_val_dataset = val_dataset.map(encode, batched=True, batch_size=len(val_dataset))
processed_val_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

# Load and fine-tune the GPT-2 model

training_args = TrainingArguments(
    output_dir='/content/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    logging_steps=100,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_dataset,
    eval_dataset=processed_val_dataset,
)

trainer.train()
# change fine-tuned model name to trc
trainer.save_model('./your_finetuned_model')

OutOfMemoryError: ignored

In [16]:
import torch
torch.cuda.memory_summary(device=None, abbreviated=False)



In [None]:
from transformers import pipeline
gpt2 = pipeline('text-generation', model='gpt2', device=0)
# change fine-tuned model name to trc
trc = pipeline('text-generation', model='your_finetuned_model', device=0)

print(gpt2('Virtual Box'))
print(trc('Virtual Box'))

In [None]:
print(gpt2('The Beginning'))
print(trc('The Beginning'))