In [1]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = "google/flan-t5-small"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, padding=True)

In [2]:
def generate_summary(row):    
    inputs = tokenizer(row['question'], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=512)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return pd.Series([row['question'], answer], index=['question', 'answer'])

In [3]:
llama_3_data = pd.read_parquet("data/data_cleaned.parquet", engine="pyarrow")
llama_3_data

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
0,1000010.ipynb,import pandas as pd\nimport seaborn as sns\nim...,44,2895,Summarize the following code in two to three s...,The code intends to analyze and visualize happ...
1,1000014.ipynb,# This Python 3 environment comes with many he...,41,1268,Summarize the following code in two to three s...,The code is intended to train a linear model u...
2,1000018.ipynb,# This Python 3 environment comes with many he...,58,3119,Summarize the following code in two to three s...,The code intends to analyze and classify data ...
3,1000025.ipynb,"As the name says, all I am doing here is clean...",122,6354,Summarize the following code in two to three s...,The code is focused on cleaning and preprocess...
4,1000028.ipynb,# This Python 3 environment comes with many he...,22,925,Summarize the following code in two to three s...,The code intends to load and process data from...
...,...,...,...,...,...,...
3495,1025649.ipynb,\nthis is a combination of my learning from ka...,243,9120,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3496,1025658.ipynb,#Interacting with Data\nAn interactive visuali...,62,2567,Summarize the following code in two to three s...,The code is designed to create interactive vis...
3497,1025660.ipynb,\nthis is a combination of my learning from ka...,283,10752,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3498,1025661.ipynb,**Exploring Homicide Reports 1980-2014!**\n\nH...,167,7658,Summarize the following code in two to three s...,The code explores homicide reports from 1980 t...


In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(llama_3_data, test_size=0.2, random_state=42) # split data, no training for this one, only evaluating base model
test_df

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
1650,1010830.ipynb,# Load packages\nlibrary('ggplot2') # visualiz...,12,436,Summarize the following code in two to three s...,The code loads various packages for data visua...
2456,1016708.ipynb,# This Python 3 environment comes with many he...,16,806,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
2232,1014662.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
1945,1012749.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...
309,1001858.ipynb,"# Crime in Chicago\n## What, Where, and When\n...",252,10343,Summarize the following code in two to three s...,The code intends to analyze crime data in Chic...
...,...,...,...,...,...,...
3127,1023076.ipynb,# This Python 3 environment comes with many he...,41,1694,Summarize the following code in two to three s...,The code intends to load and preprocess data f...
744,1004830.ipynb,# NYC Taxi Hot Spots\nimport numpy as np\nimpo...,35,1119,Summarize the following code in two to three s...,The code intends to analyze and visualize data...
631,1004357.ipynb,# Predicting the price of the car using regres...,82,3674,Summarize the following code in two to three s...,The code is designed to predict the price of a...
1557,1010294.ipynb,# This Python 3 environment comes with many he...,13,679,Summarize the following code in two to three s...,The code imports necessary libraries for data ...


In [5]:
from tqdm.notebook import tqdm
tqdm.pandas()

df = pd.DataFrame(columns=['question', 'answer'])

df = test_df.progress_apply(generate_summary, axis=1)
df

  0%|          | 0/700 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3937 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,question,answer
1650,Summarize the following code in two to three s...,# bind training & test data str(full) full
2456,Summarize the following code in two to three s...,"# python = lambda: map(int, input().split()) d..."
2232,Summarize the following code in two to three s...,"# coding: utf-8 -*- coding: utf-8 -*- """""" Crea..."
1945,Summarize the following code in two to three s...,"# coding: utf-8 -*- coding: utf-8 -*- """""" Crea..."
309,Summarize the following code in two to three s...,"### What, Where, and When ### 2012-2016 import..."
...,...,...
3127,Summarize the following code in two to three s...,# import python # import python # import pytho...
744,Summarize the following code in two to three s...,"# -*- coding: utf-8 -*- """""" Created on Mon Nov..."
631,Summarize the following code in two to three s...,# -*- coding: utf-8 -*- nfolds = 3 bestREG = '...
1557,Summarize the following code in two to three s...,"# coding: utf-8 -*- coding: utf-8 -*- """""" Crea..."


In [6]:
print(df['answer'][2456])

# python = lambda: map(int, input().split()) df_train = lambda: map(int, input().split()) df_train.head()


In [7]:
df.to_parquet("data/outputs/flan_t5_small.parquet", engine="pyarrow")