# This notebook explores the use of the Meta Llama 3 8B Instruct model for summarizing code snippets.

Prompt engineering was carried out in this notebook, and a small set of the first 3,500 rows of `data_filtered.parquet` was summarized and saved as `data_cleaned.parquet`.

In [1]:
import pandas as pd

# Read the filtered notebook corpus and swap its stored index for a fresh 0-based one.
df = pd.read_parquet('data_filtered.parquet', engine='pyarrow').reset_index(drop=True)
df

Unnamed: 0,filename,notebook_data,line_count,char_count
0,1000010.ipynb,import pandas as pd\nimport seaborn as sns\nim...,44,2895
1,1000014.ipynb,# This Python 3 environment comes with many he...,41,1268
2,1000018.ipynb,# This Python 3 environment comes with many he...,58,3119
3,1000025.ipynb,"As the name says, all I am doing here is clean...",122,6354
4,1000028.ipynb,# This Python 3 environment comes with many he...,22,925
...,...,...,...,...
144390,999966.ipynb,# Overview\nAll programmer know the basic cycl...,79,4877
144391,999968.ipynb,Learning \n# This Python 3 environment comes w...,42,1567
144392,999980.ipynb,# Mean-Encoding High Cardinality Categoricals ...,128,5854
144393,999988.ipynb,**<h1>LEAF CLASSIFICATION</h1>**\n# Package Im...,91,3590


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hub identifier of the checkpoint used for both the tokenizer and the model.
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantisation, bfloat16 compute dtype, double quantisation switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Passed as torch_dtype when the model is loaded.
# NOTE(review): this is float16 while the quantisation compute dtype above is bfloat16 — confirm the mix is intended.
compute_dtype = torch.float16

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Load the quantised checkpoint; device_map="auto" leaves device placement to the library.
# Sampling options (temperature / do_sample) are intentionally not passed here: they are
# generation-time settings, and generate_response() requests greedy decoding
# (do_sample=False) on every call, so they only produced a contradictory configuration.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=quant_config,
    low_cpu_mem_usage=True,
)
# NOTE(review): these two settings look like fine-tuning boilerplate — confirm they are
# wanted in an inference-only notebook.
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
def generate_response(prompt):
    """Send one user prompt through the chat model and return the decoded transcript.

    Args:
        prompt: Text of the single user turn.

    Returns:
        The whole decoded sequence as one string: the chat-templated prompt
        followed by the model's reply, with special tokens (``<|eot_id|>`` and
        the header markers) left in so callers can split on them.
    """
    chat = [{'content': prompt, 'role': 'user'}]
    chat_tokens = tokenizer.apply_chat_template(
        chat, tokenize=True, add_generation_prompt=True, return_tensors='pt'
    ).to(model.device)

    # The pad token is set to the EOS token in this notebook, so the mask cannot
    # be inferred from the ids. A single unpadded sequence consists of real
    # tokens only, hence an all-ones mask.
    attention_mask = torch.ones_like(chat_tokens)

    new_chat_tokens = model.generate(
        chat_tokens,
        attention_mask=attention_mask,
        do_sample=False,  # greedy decoding
        max_new_tokens=512,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decode verbatim: the tokenizer's optional clean-up pass can rewrite spacing
    # around punctuation, which would alter the source code echoed in the prompt.
    new_chat_str = tokenizer.decode(new_chat_tokens[0], clean_up_tokenization_spaces=False)
    return new_chat_str

In [5]:
# prompt engineering

# Alternative wording, kept for reference:
# task = "Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, and do nothing else.\n\n"

# Wording in use. Adjacent literals are joined by the parser into one string.
task = (
    "Summarize the following code in two to three sentences. "
    "Focus on the intentions of the code, "
    "emphasize the type of data being handled, "
    "do not give an introductory clause. "
    "do nothing else.\n\n"
)

# Alternative wording, kept for reference:
# task = "Summarize the following code in two to three sentences. Focus on the intentions of the code, detail the data being handled, do not give an introductory clause. do nothing else"

# Sample snippet used to try the prompt out.
input_text = '''chunksize = 100000
df_chunks = []

for df in pd.read_csv(
    crimes_raw_file_path, parse_dates=[2], index_col=[2],
    chunksize=chunksize, iterator=True, low_memory=True
):
    df = df['2016-01-01':'2016-12-31']
    df_chunks.append(df)

crimes2016 = pd.concat(df_chunks)

# Serialising & compressing the data
crimes2016.sort_index(ascending=True).to_csv(crimes_file_path)

crimes2016.head()'''
print(generate_response(f"{task}{input_text}"))

  attn_output = torch.nn.functional.scaled_dot_product_attention(


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, do not give an introductory clause. do nothing else.

chunksize = 100000
df_chunks = []

for df in pd.read_csv(
    crimes_raw_file_path, parse_dates=[2], index_col=[2],
    chunksize=chunksize, iterator=True, low_memory=True
):
    df = df['2016-01-01':'2016-12-31']
    df_chunks.append(df)

crimes2016 = pd.concat(df_chunks)

# Serialising & compressing the data
crimes2016.sort_index(ascending=True).to_csv(crimes_file_path)

crimes2016.head()<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The code reads a large CSV file in chunks, filters the data to a specific date range, and concatenates the chunks into a single DataFrame. The intention is to process and store the data in a compressed format, likely for further analysis or storage. The data being handled is crime-related information,

In [6]:
# Baseline run: the first prompt wording (without the "introductory clause"
# instruction) applied to the same sample snippet held in input_text, instead of
# pasting the snippet a second time. The space before the blank line is part of
# the prompt and is spelled out here so it cannot be lost to whitespace trimming.
baseline_task = "Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, and do nothing else. \n\n"
response = generate_response(baseline_task + input_text)
response

"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nSummarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, and do nothing else. \n\nchunksize = 100000\ndf_chunks = []\n\nfor df in pd.read_csv(\n    crimes_raw_file_path, parse_dates=[2], index_col=[2],\n    chunksize=chunksize, iterator=True, low_memory=True\n):\n    df = df['2016-01-01':'2016-12-31']\n    df_chunks.append(df)\n\ncrimes2016 = pd.concat(df_chunks)\n\n# Serialising & compressing the data\ncrimes2016.sort_index(ascending=True).to_csv(crimes_file_path)\n\ncrimes2016.head()<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe code intends to read a large CSV file, process it in chunks, and then concatenate the chunks into a single DataFrame. The data being handled is crime records from 2016, with dates parsed and used as the index. The processed data is then written to a new CSV file.<|eot_id|>"

In [7]:
# Take the decoded transcript apart to see how the prompt and the reply can be recovered.
# First cut away the opening user header, then separate the user turn from the assistant turn.
parts = (
    response
    .split("<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n")[1]
    .split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
)
# parts[0] is the user turn; parts[1] is the reply, which runs up to its closing <|eot_id|>.
prompt = parts[0].strip()
answer = parts[1].split("<|eot_id|>")[0].strip()
prompt, answer

("Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, and do nothing else. \n\nchunksize = 100000\ndf_chunks = []\n\nfor df in pd.read_csv(\n    crimes_raw_file_path, parse_dates=[2], index_col=[2],\n    chunksize=chunksize, iterator=True, low_memory=True\n):\n    df = df['2016-01-01':'2016-12-31']\n    df_chunks.append(df)\n\ncrimes2016 = pd.concat(df_chunks)\n\n# Serialising & compressing the data\ncrimes2016.sort_index(ascending=True).to_csv(crimes_file_path)\n\ncrimes2016.head()",
 'The code intends to read a large CSV file, process it in chunks, and then concatenate the chunks into a single DataFrame. The data being handled is crime records from 2016, with dates parsed and used as the index. The processed data is then written to a new CSV file.')

In [8]:
# Work on an independent copy of the first 3500 rows
# (3500 samples is approximately 3 hrs and 14 min of runtime on an RTX 4090).
df_test = df.iloc[:3500].copy()

In [9]:
def summarize(row):
    """Ask the model to summarize one notebook and return the prompt/summary pair.

    Args:
        row: A DataFrame row (or any mapping) whose 'notebook_data' entry holds
            the text to summarize.

    Returns:
        pd.Series indexed by 'question' and 'answer'. 'question' is the prompt
        that was sent, stripped of surrounding whitespace. 'answer' is the
        model's reply up to its first ``<|eot_id|>``, stripped; it is an empty
        string when no assistant turn can be located in the transcript.
    """
    prompt = task + row['notebook_data']
    response = generate_response(prompt)

    # Everything after the first assistant header is the model's reply.
    # partition() never raises, so one unexpected transcript yields an empty
    # answer instead of an IndexError that would abort a multi-hour apply().
    assistant_header = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    _, _, reply = response.partition(assistant_header)
    answer = reply.split("<|eot_id|>")[0].strip()

    # Record the prompt that was actually sent rather than re-parsing it out of
    # the decoded transcript, which depends on a lossless decode round trip.
    return pd.Series([prompt.strip(), answer], index=['question', 'answer'])

In [10]:
# Preview the working subset before the summaries are generated.
df_test

Unnamed: 0,filename,notebook_data,line_count,char_count
0,1000010.ipynb,import pandas as pd\nimport seaborn as sns\nim...,44,2895
1,1000014.ipynb,# This Python 3 environment comes with many he...,41,1268
2,1000018.ipynb,# This Python 3 environment comes with many he...,58,3119
3,1000025.ipynb,"As the name says, all I am doing here is clean...",122,6354
4,1000028.ipynb,# This Python 3 environment comes with many he...,22,925
...,...,...,...,...
3495,1025649.ipynb,\nthis is a combination of my learning from ka...,243,9120
3496,1025658.ipynb,#Interacting with Data\nAn interactive visuali...,62,2567
3497,1025660.ipynb,\nthis is a combination of my learning from ka...,283,10752
3498,1025661.ipynb,**Exploring Homicide Reports 1980-2014!**\n\nH...,167,7658


In [11]:
%%time
# Summarize every row; the two-element Series that summarize() returns per row
# fills the new 'question' and 'answer' columns.
df_test[['question', 'answer']] = df_test.apply(summarize, axis=1)



CPU times: total: 2h 51min 21s
Wall time: 3h 19min 17s


In [12]:
# Inspect the frame again now that the 'question' and 'answer' columns are filled in.
df_test

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
0,1000010.ipynb,import pandas as pd\nimport seaborn as sns\nim...,44,2895,Summarize the following code in two to three s...,The code intends to analyze and visualize happ...
1,1000014.ipynb,# This Python 3 environment comes with many he...,41,1268,Summarize the following code in two to three s...,The code is intended to train a linear model u...
2,1000018.ipynb,# This Python 3 environment comes with many he...,58,3119,Summarize the following code in two to three s...,The code intends to analyze and classify data ...
3,1000025.ipynb,"As the name says, all I am doing here is clean...",122,6354,Summarize the following code in two to three s...,The code is focused on cleaning and preprocess...
4,1000028.ipynb,# This Python 3 environment comes with many he...,22,925,Summarize the following code in two to three s...,The code intends to load and process data from...
...,...,...,...,...,...,...
3495,1025649.ipynb,\nthis is a combination of my learning from ka...,243,9120,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3496,1025658.ipynb,#Interacting with Data\nAn interactive visuali...,62,2567,Summarize the following code in two to three s...,The code is designed to create interactive vis...
3497,1025660.ipynb,\nthis is a combination of my learning from ka...,283,10752,Summarize the following code in two to three s...,The code aims to predict who survived the sink...
3498,1025661.ipynb,**Exploring Homicide Reports 1980-2014!**\n\nH...,167,7658,Summarize the following code in two to three s...,The code explores homicide reports from 1980 t...


In [13]:
# Full prompt recorded for the row labelled 0.
df_test.loc[0, 'question']

"Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, do not give an introductory clause. do nothing else.\n\nimport pandas as pd\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n%matplotlib inline\nwh2015 = pd.read_csv('../input/2015.csv')\nwh2015.info()\nwh2016 = pd.read_csv('../input/2016.csv')\nwh2016.info()\n#max happiness score for 2015\nwh2015[wh2015['Happiness Score'] == wh2015['Happiness Score'].max()][['Country','Happiness Score', 'Happiness Rank']]\n#max happiness score for 2015\nwh2016[wh2016['Happiness Score'] == wh2016['Happiness Score'].max()][['Country','Happiness Score', 'Happiness Rank']]\n#min happiness score for 2015\nwh2015[wh2015['Happiness Score'] == wh2015['Happiness Score'].min()][['Country','Happiness Score', 'Happiness Rank']]\n#min happiness score for 2016\nwh2016[wh2016['Happiness Score'] == wh2016['Happiness Score'].min()][['Country','Happiness Score', 'Happiness Rank

In [14]:
# Full prompt recorded for the row labelled 2.
df_test.loc[2, 'question']

'Summarize the following code in two to three sentences. Focus on the intentions of the code, emphasize the type of data being handled, do not give an introductory clause. do nothing else.\n\n# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here\'s several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\nimport scipy\n\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import log_loss\nfrom sklearn.model_selection import train_test_split\n\n# Input data files are available in the "../input/" directory.\n# For example, running this (by clicking run or pressi

In [15]:
# Generated summary recorded for the row labelled 3.
df_test.loc[3, 'answer']

'The code is focused on cleaning and preprocessing the data in the "houseprice" dataset, specifically replacing missing values in the training data. The data being handled is a dataset of house prices, with various features such as LotFrontage, Alley, MasVnrType, and others. The goal is to prepare the data for further exploration and feature engineering, with the intention of building a machine learning model.'

In [16]:
# Persist the summarized subset, including the 'question' and 'answer' columns, to disk.
df_test.to_parquet('data_cleaned.parquet', engine='pyarrow')


In [17]:
# df_clean = pd.read_parquet('data_cleaned.parquet', engine='pyarrow')