In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tqdm import tqdm, trange
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')



# Summaries train and Prompts Train Analysis

In [2]:
PATH = '/kaggle/input/commonlit-evaluate-student-summaries'
summaries_train = pd.read_csv(f'{PATH}/summaries_train.csv')
summaries_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [3]:
summaries_train.shape

(7165, 5)

In [4]:
summaries_train.nunique()

student_id    7165
prompt_id        4
text          7165
content       1134
wording       1134
dtype: int64

In [5]:
summaries_train['prompt_id'].value_counts()

39c16e    2057
3b9047    2009
ebad26    1996
814d6b    1103
Name: prompt_id, dtype: int64

In [6]:
prompts_train = pd.read_csv(f'{PATH}/prompts_train.csv')
prompts_train.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [7]:
prompts_train.shape

(4, 4)

Now let's join these two dataframe based on the prompt id for better know how of the problem.

In [8]:
# Perform the inner join based on the 'ID' column
df = pd.merge(summaries_train, prompts_train,
              on='prompt_id', how='inner')
df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [9]:
df['prompt_id'].value_counts()

39c16e    2057
3b9047    2009
ebad26    1996
814d6b    1103
Name: prompt_id, dtype: int64

This make sure that the merging of the two dataframes has been done without any error. As the number of occurences for each prompt id is same as that of the original one.

In [10]:
df.iloc[0]

student_id                                              000e8c3c7ddb
prompt_id                                                     814d6b
text               The third wave was an experimentto see how peo...
content                                                     0.205683
wording                                                     0.380538
prompt_question    Summarize how the Third Wave developed over su...
prompt_title                                          The Third Wave
prompt_text        Background \r\nThe Third Wave experiment took ...
Name: 0, dtype: object

In [11]:
df.iloc[0].prompt_question

'Summarize how the Third Wave developed over such a short period of time and why the experiment was ended.'

In [12]:
df.iloc[0].text

'The third wave was an experimentto see how people reacted to a new one leader government. It gained popularity as people wanted to try new things. The students follow anything that is said and start turning on eachother to gain higher power. They had to stop the experement as too many people got to radical with it blindly following there leader'

In [13]:
df.iloc[0].prompt_text

'Background \r\nThe Third Wave experiment took place at Cubberley High School in Palo Alto, California during the first week of April 1967. History teacher Ron Jones, finding himself unable to explain to his students how people throughout history followed the crowd even when terrible things were happening, decided to demonstrate it to his students through an experiment. Jones announced that he was starting a movement aimed to eliminate democracy. Jones named the movement “The Third Wave” as a symbol of strength, referring to the mythical belief that the third in a series of waves is the strongest. One of the central points of this movement was that democracy’s main weakness is that it favors the individual over the whole community. Jones emphasized this main point of the movement when he created this catchy motto: “Strength through discipline, strength through community, strength through action, strength through pride.” \r\nThe Experiment \r\nJones started the first day of the experime

In [14]:
prompts = list()
for i in trange(len(df.index)):

    prompt_for_finetune = f'''
    Below is a task to provide scores for the sumamries that is\
    written by a student based on the text or a passage and a\
    question which is provided to him.

    INPUT:
    Passage Title:
    {df.iloc[i].prompt_title}
    
    Passage:
    {df.iloc[i].prompt_text}

    Question:
    {df.iloc[i].prompt_question}

    Summary Written by Student:
    {df.iloc[i].text}
    
    You are supposed to give the output as a python dictionary \
    having keys 'content' and 'wording'.
    Output:
    'content': {df.iloc[i].content}
    'wording': {df.iloc[i].wording}
    '''
    prompts.append(prompt_for_finetune)

df['prompt_for_finetune'] = prompts

100%|██████████| 7165/7165 [00:04<00:00, 1520.02it/s]


In [15]:
features = df['prompt_for_finetune']
target = df[['content']]

X_train, X_val, Y_train, Y_val = train_test_split(features, target,
                                                  random_state=2023,
                                                  test_size=0.2)
X_train.shape, X_val.shape

((5732,), (1433,))

# Trying BERT for the Predictions

In [16]:
!pip install -q -U trl transformers git+https://github.com/huggingface/peft.git
!pip install bitsandbytes
!pip install accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.40.2


In [17]:
df[['prompt_for_finetune']]

Unnamed: 0,prompt_for_finetune
0,\n Below is a task to provide scores for th...
1,\n Below is a task to provide scores for th...
2,\n Below is a task to provide scores for th...
3,\n Below is a task to provide scores for th...
4,\n Below is a task to provide scores for th...
...,...
7160,\n Below is a task to provide scores for th...
7161,\n Below is a task to provide scores for th...
7162,\n Below is a task to provide scores for th...
7163,\n Below is a task to provide scores for th...


In [18]:
from datasets import Dataset
dataset = Dataset.from_pandas(df[['prompt_for_finetune']])
dataset

Dataset({
    features: ['prompt_for_finetune', '__index_level_0__'],
    num_rows: 7165
})

In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer,\
BitsAndBytesConfig, AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00014.bin:   0%|          | 0.00/981M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00014.bin:   0%|          | 0.00/944M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00014.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00014.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00014.bin:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [21]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

In [22]:
import os

os.mkdir('./results')

In [23]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [24]:
import gc
gc.collect()

51

In [25]:
from trl import SFTTrainer

max_seq_length = 1024

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="prompt_for_finetune",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments
)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [26]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.862


In [None]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained("final_finetuned_model")

# Testing the Model

In [None]:
df.iloc[0].prompt_for_finetune[:-92]

In [None]:
batch = tokenizer(df['prompt_for_finetune'].str[:-100], tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=1024)

print('\n\n', tokenizer.decode(output_tokens[0],
                               skip_special_tokens=False))

# Making Predictions

In [None]:
summaries_test = pd.read_csv(f'{PATH}/summaries_test.csv')
prompts_test = pd.read_csv(f'{PATH}/prompts_test.csv')
test = pd.merge(summaries_test, prompts_test, 
                on='prompt_id', how='inner')
test.head()

In [None]:
prompts = list()
for i in trange(len(test.index)):

    prompt_for_finetune = f'''
    Below is a task to provide scores for the sumamries that is written by a student based on the text or a passage and a question which is provided to him.

    INPUT:
    Passage Title:
    {test.iloc[i].prompt_title}
    
    Passage:
    {test.iloc[i].prompt_text}

    Question:
    {test.iloc[i].prompt_question}

    Summary Written by Student:
    {test.iloc[i].text}
    '''
    prompts.append(prompt_for_finetune)

test['prompt_for_preds'] = prompts

In [None]:
batch = tokenizer(test['prompt_for_preds'], tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=False))