In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load a small open-source LLM
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "deepseek-ai/deepseek-coder"
# model_name = "Phind/Phind-CodeLlama-34B"

# model_name = "codellama/CodeLlama-13b-Instruct-hf"
# model_name = "mistralai/Mistral-7B-v0.1"

In [10]:

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="offload",  # add this line
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [12]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
# )

In [13]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [14]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
# )


In [15]:
def load_code(filename):
    with open(filename, 'r') as f:
        return f.read()

In [16]:
code = load_code("data_engineering.py")

In [17]:
prompt = f"""
You are a Python code checker.

Your task is to:
1. Check the given Python code for any syntax errors.
2. Identify bad coding practices or anti-patterns.
3. Detect any logical flaws or unintended behavior.

If any issues are found, provide a corrected and improved version of the code.

Here is the code to check:
```python```
{code} """


In [18]:
prompt

'\nYou are a Python code checker.\n\nYour task is to:\n1. Check the given Python code for any syntax errors.\n2. Identify bad coding practices or anti-patterns.\n3. Detect any logical flaws or unintended behavior.\n\nIf any issues are found, provide a corrected and improved version of the code.\n\nHere is the code to check:\n```python```\n# data_engineering.py\n\nimport pandas as pd\nimport boto3\nimport os\n\ndef load_csv_from_s3(bucket, file_key):\n    s3 = boto3.client(\'s3\')\n    obj = s3.get_object(Bucket=bucket, Key=file_key)\n    df = pd.read_csv(obj[\'Body\'])\n    return df\n\ndef transform_data(df):\n    df[\'total_amount\'] = df[\'quantity\'] * df[\'price\']\n    df[\'order_date\'] = pd.to_datetime(df[\'order_date\'], errors=\'coerce\')\n    df[\'year\'] = df[\'order_date\'].dt.year\n    df[\'month\'] = df.order_date.dt.month\n    df[\'day\'] = df.order_date.dt.day\n    return df\n\ndef save_to_local(df, path):\n    df.to_csv(path, index=False)\n    print(f"Data saved to {p

In [19]:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [20]:
# Generate response
outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [21]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)


In [22]:
print(response)


You are a Python code checker.

Your task is to:
1. Check the given Python code for any syntax errors.
2. Identify bad coding practices or anti-patterns.
3. Detect any logical flaws or unintended behavior.

If any issues are found, provide a corrected and improved version of the code.

Here is the code to check:
```python```
# data_engineering.py

import pandas as pd
import boto3
import os

def load_csv_from_s3(bucket, file_key):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=file_key)
    df = pd.read_csv(obj['Body'])
    return df

def transform_data(df):
    df['total_amount'] = df['quantity'] * df['price']
    df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
    df['year'] = df['order_date'].dt.year
    df['month'] = df.order_date.dt.month
    df['day'] = df.order_date.dt.day
    return df

def save_to_local(df, path):
    df.to_csv(path, index=False)
    print(f"Data saved to {path}")

if __name__ == '__main__':
    BUCKET_NAME = 'my-bu