In [13]:
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
import json


- Import [BigQuery](https://cloud.google.com/bigquery) to use as your data warehouse.
- Initialize the client to start interacting with the data warehouse, send SQL and retrieve data into the notebook.

In [1]:
from google.cloud import bigquery

In [4]:
# Build the BigQuery client from a service-account key file that is looked up
# relative to the notebook's working directory.
SERVICE_ACCOUNT_KEY_FILE = "qlora-finetuning-cce64209d0bb.json"
bq_client = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT_KEY_FILE)

## Stack Overflow Public Dataset

In [7]:
# Unfiltered SELECT * over the public Stack Overflow questions table.
# It has no WHERE or LIMIT clause; the notebook uses it to show that the
# full result is too large to pull back into a DataFrame.
QUERY_ALL = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
"""

In [8]:
# Submit QUERY_ALL to BigQuery and keep the returned job object; later cells
# call .result() on it to fetch rows.
query_job = bq_client.query(QUERY_ALL)

In [6]:
# List the tables available in the public Stack Overflow dataset.
#
# This cell runs its own metadata query under a separate job name. Iterating
# `query_job` here would walk the unfiltered SELECT * result and print every
# value of every question row instead of the table names.
QUERY_TABLES = """
SELECT
    table_name
FROM
    `bigquery-public-data.stackoverflow.INFORMATION_SCHEMA.TABLES`
"""

tables_job = bq_client.query(QUERY_TABLES)

# Each row carries a single column (table_name); print one name per line.
for row in tables_job:
    for value in row.values():
        print(value)

posts_answers
users
posts_orphaned_tag_wiki
posts_tag_wiki
stackoverflow_posts
posts_questions
comments
posts_tag_wiki_excerpt
posts_wiki_placeholder
posts_privilege_wiki
post_history
badges
post_links
tags
votes
posts_moderator_nomination


In [9]:
# Try to materialise the complete query result as a pandas DataFrame.
# Any failure is caught and reported so the notebook keeps running.
try:
    stack_overflow_df = (
        query_job
        .result()
        .to_arrow()
        .to_pandas()
    )
except Exception as error:
    print('The DataFrame is too large to load into memory.', error)

The DataFrame is too large to load into memory. 403 Response too large to return. Consider specifying a destination table in your job configuration. For more details, see https://cloud.google.com/bigquery/troubleshooting-errors; reason: responseTooLarge, message: Response too large to return. Consider specifying a destination table in your job configuration. For more details, see https://cloud.google.com/bigquery/troubleshooting-errors

Location: US
Job ID: 8ab0684b-9e80-47de-8e84-b48ca95401c9



#### Joining Tables and Query Optimization

- Select questions as `input_text` (column 1), answers as `output_text` (column 2).
- Take the questions from `posts_questions` and answers from `posts_answers`.
- Join each question to its accepted answer by matching the question's `accepted_answer_id` to the answer's `id`.
- Make sure the question is tagged `Python` and that it `has an accepted answer`, and that the accepted answer was posted on or after `2020-01-01`.
- Limit the result to 10,000 rows.

In [10]:
# Question/accepted-answer pairs used as tuning data.
#   - input_text  : question title immediately followed by the question body
#                   (CONCAT inserts no separator between them).
#   - output_text : body of the answer whose id equals the question's
#                   accepted_answer_id.
# Filters:
#   - REGEXP_CONTAINS(q.tags, "python") keeps any question whose tags value
#     contains the substring "python", not only an exact `python` tag.
#   - The date filter is applied to the answer's creation_date.
#   - LIMIT has no ORDER BY, so which 10,000 rows come back is not pinned
#     down by the query itself.
QUERY = """
SELECT
    CONCAT(q.title, q.body) as input_text,
    a.body AS output_text
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
JOIN
    `bigquery-public-data.stackoverflow.posts_answers` a
ON
    q.accepted_answer_id = a.id
WHERE
    q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND
    a.creation_date >= "2020-01-01"
LIMIT
    10000
"""

In [11]:
# Submit the filtered question/answer query. This rebinds `query_job`, so from
# here on it refers to this job rather than the earlier unfiltered one.
query_job = bq_client.query(QUERY)

In [12]:
# Pull the filtered result into pandas by way of an Arrow table, then preview
# the first two rows.
stack_overflow_df = (
    query_job
    .result()
    .to_arrow()
    .to_pandas()
)

stack_overflow_df.head(2)



Unnamed: 0,input_text,output_text
0,ECS task only able to pick one message from SQ...,<p>I forgot to give an answer to that question...
1,When saving a list of LabelEncoders the classe...,<p>I suggest to avoid memory <code>id()</code>...


### Adding Instructions

- Instructions for LLMs have been shown to improve
model performance and generalization to unseen tasks.
- Without the instruction, the input is only a question and an answer, so the model might not understand what it is expected to do.
- With the instructions, the model gets a guideline as to what task to perform.

In [14]:
# Instruction prefix placed in front of every question. It is a plain string
# (no placeholders): two sentences on one line, a blank line, then the
# "Stackoverflow question:" header followed by a newline.
INSTRUCTION_TEMPLATE = (
    "Please answer the following Stackoverflow question on Python. "
    "Answer it like you are a developer answering Stackoverflow questions.\n"
    "\n"
    "Stackoverflow question:\n"
)

In [15]:
# Prepend the instruction (plus a single space) to every question and store
# the result in a new column next to the raw text.
instruction_prefix = INSTRUCTION_TEMPLATE + ' '
stack_overflow_df['input_text_instruct'] = (
    instruction_prefix + stack_overflow_df['input_text']
)

### Dataset for Tuning


In [17]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the same
# input frame always yields the same split.
EVAL_FRACTION = 0.2
SPLIT_SEED = 42

train, evaluation = train_test_split(
    stack_overflow_df, test_size=EVAL_FRACTION, random_state=SPLIT_SEED
)

In [19]:
# Timestamp used to version the exported data file.
# Year-first ordering sorts chronologically, seconds keep runs within the
# same hour distinct, and the absence of ':' keeps the value usable in file
# names on every platform (':' is not allowed in Windows file names).
date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [20]:
# Serialise only the prompt and answer columns of the training split as
# JSON Lines text (one JSON object per row).
cols = ['input_text_instruct', 'output_text']
tune_jsonl = train.loc[:, cols].to_json(orient="records", lines=True)

In [None]:
# Versioning data: the timestamp in `date` is embedded in the file name.
# The name is kept on a single line on purpose: a backslash continuation
# inside the string literal would pull the next line's indentation into the
# file name as a run of spaces.
training_data_filename = f"tune_data_stack_overflow_python_qa-{date}.jsonl"

In [22]:
# Write the JSON Lines text to the versioned file, replacing any existing
# file of the same name.
with open(training_data_filename, mode="w") as jsonl_file:
    jsonl_file.write(tune_jsonl)

## Transform Data as Required by Llama 2

In [28]:

# Input: the JSON Lines file written by the export cell earlier in this
# notebook. Using the variable rather than a hard-coded name of one past run
# lets this cell work on a fresh kernel and on other machines.
file_path = training_data_filename

# Output: the LLaMA 2 formatted records, stored as a single JSON array.
output_file_path = 'transformed_data1.json'

# Convert one instruction/answer record into the LLaMA 2 prompt layout
def transform_to_llama_format(example):
    """Wrap a single record in LLaMA 2 instruction markup.

    Args:
        example: Mapping with the string entries 'input_text_instruct'
            (the prompt) and 'output_text' (the answer).

    Returns:
        A dict with one key, 'text', holding
        '<s>[INST] prompt [/INST] answer </s>' where prompt and answer have
        had leading and trailing whitespace removed.

    Raises:
        KeyError: If either expected key is missing from ``example``.
    """
    prompt, answer = example['input_text_instruct'], example['output_text']
    wrapped = '<s>[INST] {} [/INST] {} </s>'.format(prompt.strip(), answer.strip())
    return {'text': wrapped}

# Read the JSON Lines input one record at a time, convert every parsable
# record, and collect the results in order.
transformed_data = []

with open(file_path, 'r') as source:
    for raw_line in source:
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as err:
            # Report the malformed line and continue with the next one.
            print(f"Skipping invalid line: {raw_line.strip()} - Error: {err}")
        else:
            transformed_data.append(transform_to_llama_format(record))

# Save all converted records together as one indented JSON array.
with open(output_file_path, 'w') as sink:
    json.dump(transformed_data, sink, indent=4)

print(f"Transformation complete! Transformed dataset saved to '{output_file_path}'.")


Transformation complete! Transformed dataset saved to 'transformed_data1.json'.
