In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline, logging, T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split


In [2]:
model_path = "google/flan-t5-small"

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=False)

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, padding=True)
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(model_path, device_map="auto", quantization_config=quant_config)

peft_model_path = "./flan-t5-small-fine-tuned/"

peft_model = PeftModel.from_pretrained(peft_model_base, peft_model_path, is_trainable=False, device_map="auto")
peft_model.print_trainable_parameters()

trainable params: 0 || all params: 79,713,664 || trainable%: 0.0


In [3]:
def generate_summary(row):    
    device = peft_model.device
    inputs = tokenizer(row['question'], return_tensors="pt").to(device)
    outputs = peft_model.generate(**inputs, max_new_tokens=512)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return pd.Series([row['question'], answer], index=['question', 'answer'])

llama_3_data = pd.read_parquet("data/data_cleaned.parquet", engine="pyarrow")
llama_3_data

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
0,1000010.ipynb,import pandas as pd\nimport seaborn as sns\nim...,44,2895,Summarize the following code in two to three s...,The code intends to analyze and visualize happ...
1,1000014.ipynb,# This Python 3 environment comes with many he...,41,1268,Summarize the following code in two to three s...,The code is intended to train a linear model u...
2,1000018.ipynb,# This Python 3 environment comes with many he...,58,3119,Summarize the following code in two to three s...,The code intends to analyze and classify data ...
3,1000025.ipynb,"As the name says, all I am doing here is clean...",122,6354,Summarize the following code in two to three s...,The code is focused on cleaning and preprocess...
4,1000028.ipynb,# This Python 3 environment comes with many he...,22,925,Summarize the following code in two to three s...,The code intends to load and process data from...
...,...,...,...,...,...,...
3495,1025649.ipynb,\nthis is a combination of my learning from ka...,243,9120,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3496,1025658.ipynb,#Interacting with Data\nAn interactive visuali...,62,2567,Summarize the following code in two to three s...,The code is designed to create interactive vis...
3497,1025660.ipynb,\nthis is a combination of my learning from ka...,283,10752,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3498,1025661.ipynb,**Exploring Homicide Reports 1980-2014!**\n\nH...,167,7658,Summarize the following code in two to three s...,The code explores homicide reports from 1980 t...


In [4]:
train_df, test_df = train_test_split(llama_3_data, test_size=0.2, random_state=42) 
test_df

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
1650,1010830.ipynb,# Load packages\nlibrary('ggplot2') # visualiz...,12,436,Summarize the following code in two to three s...,The code loads various packages for data visua...
2456,1016708.ipynb,# This Python 3 environment comes with many he...,16,806,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
2232,1014662.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
1945,1012749.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
309,1001858.ipynb,"# Crime in Chicago\n## What, Where, and When\n...",252,10343,Summarize the following code in two to three s...,The code intends to analyze crime data in Chic...
...,...,...,...,...,...,...
3127,1023076.ipynb,# This Python 3 environment comes with many he...,41,1694,Summarize the following code in two to three s...,The code intends to load and preprocess data f...
744,1004830.ipynb,# NYC Taxi Hot Spots\nimport numpy as np\nimpo...,35,1119,Summarize the following code in two to three s...,The code intends to analyze and visualize data...
631,1004357.ipynb,# Predicting the price of the car using regres...,82,3674,Summarize the following code in two to three s...,The code is designed to predict the price of a...
1557,1010294.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...


In [5]:
tqdm.pandas()

df = pd.DataFrame(columns=['question', 'answer'])
df = test_df.progress_apply(generate_summary, axis=1)
df

  0%|          | 0/700 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3937 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,question,answer
1650,Summarize the following code in two to three s...,# bind training & test data str(full) full
2456,Summarize the following code in two to three s...,# This Python 3 environment comes with many he...
2232,Summarize the following code in two to three s...,# kaggle/python docker image: https://github.c...
1945,Summarize the following code in two to three s...,# kaggle/python docker image: https://github.c...
309,Summarize the following code in two to three s...,"### Crime in Chicago ### What, Where, and When..."
...,...,...
3127,Summarize the following code in two to three s...,# This Python 3 environment comes with many he...
744,Summarize the following code in two to three s...,# NYC Taxi Hot Spots import numpy as np import...
631,Summarize the following code in two to three s...,# Predicting the price of the car using regres...
1557,Summarize the following code in two to three s...,# kaggle/python docker image: https://github.c...


In [6]:
print(df['answer'][2456])

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the "../input/" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output(["ls", "../input"]).decode("utf8")) # Any results you write to the current directory are saved as output. df_train = pd.read_csv('../input/train.csv') df_train.head() plt.hist(df_train['is_duplicate'])


In [7]:
df.to_parquet("data/outputs/flan_t5_small_fine_tuned.parquet", engine="pyarrow")