# CodeLlama Fine-tuning Exploration

This notebook explores the `samaxr/code-summary-java` dataset and runs the base CodeLlama model as a reference point for fine-tuning experiments.

In [None]:
# Import necessary libraries
import json
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Exploration

In [None]:
# Load the Java code-summarization dataset and list the splits it provides
dataset = load_dataset("samaxr/code-summary-java")
print(f"Dataset splits: {dataset.keys()}")

# Report the number of examples in each split, one line per split
for label, split_name in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{label} examples: {len(dataset[split_name])}")

In [None]:
# Print the first three training examples: a 200-character code preview
# followed by the full reference summary
separator = "-" * 50
train_split = dataset['train']
for idx in range(3):
    record = train_split[idx]
    code_preview = record['code'][:200]
    print(f"Example {idx + 1}:")
    print(f"Code: {code_preview}...")
    print(f"Summary: {record['summary']}")
    print(separator)

In [None]:
# Character-length distributions of code and summaries over the training split
train_data = dataset['train']
code_lengths = [len(row['code']) for row in train_data]
summary_lengths = [len(row['summary']) for row in train_data]

# One histogram per field, side by side: (values, title, x-axis label)
panels = (
    (code_lengths, 'Distribution of Code Lengths', 'Code Length (characters)'),
    (summary_lengths, 'Distribution of Summary Lengths', 'Summary Length (characters)'),
)

plt.figure(figsize=(12, 5))
for slot, (values, title, x_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values, bins=50, alpha=0.7)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## Model Loading and Testing

In [None]:
# Base CodeLlama checkpoint, loaded for comparison
model_name = "codellama/CodeLlama-7b-hf"

# Set up the tokenizer first; when it defines no padding token, reuse the
# end-of-sequence token for padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal language model for the same checkpoint
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Test code generation with base model
def generate_code(model, tokenizer, prompt, max_length=100):
    """Sample a continuation of ``prompt`` from ``model`` and decode it to text.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model``; called on the prompt and
            used to decode the generated ids.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``. In
            Transformers this bounds the total sequence length (prompt plus
            newly generated tokens), not only the new tokens.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        Sampling is enabled (``do_sample=True``, ``temperature=0.7``), so
        repeated calls may return different text.
    """
    # Move the encoded prompt to the model's device so generation also works
    # when the model is not on the CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        # Unpack the whole encoding so the attention mask is passed along
        # with the input ids instead of being dropped.
        **inputs,
        max_length=max_length,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test generation: feed the base model the opening of a Java class and
# show the prompt next to the sampled completion
test_prompt = "public class Calculator {"
generated = generate_code(base_model, tokenizer, test_prompt)
print(f"Prompt: {test_prompt}", f"Generated: {generated}", sep="\n")

## Dataset Statistics

In [None]:
# Create a DataFrame for easier analysis.
# Slicing a `datasets.Dataset` returns a dict mapping each column name to a
# list of values (not a list of row dicts), so the columns are read straight
# from the slice rather than by iterating over it.
stats_sample = train_data[:1000]  # Sample for performance
df = pd.DataFrame({
    'code': stats_sample['code'],
    'summary': stats_sample['summary'],
})

# Per-example size metrics: character counts and number of code lines
df['code_length'] = df['code'].str.len()
df['summary_length'] = df['summary'].str.len()
df['code_lines'] = df['code'].str.count('\n') + 1  # newline count + 1

print("Dataset Statistics:")
print(df[['code_length', 'summary_length', 'code_lines']].describe())

In [None]:
# Pairwise correlation between the three size metrics, drawn as an annotated
# heatmap with the colour scale centred on zero
metric_columns = ['code_length', 'summary_length', 'code_lines']
correlation_matrix = df[metric_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
)
plt.title('Correlation Matrix: Code and Summary Metrics')
plt.show()

## Fine-tuning Preparation

In [None]:
# Analyze tokenization on the first 100 training examples.
# Slicing a `datasets.Dataset` returns a dict mapping each column name to a
# list of values, so the columns are taken from the slice directly instead of
# iterating over it (which would yield the column names, not rows).
token_sample = train_data[:100]
sample_codes = token_sample['code']
sample_summaries = token_sample['summary']

# Tokenize samples and record the token count of each one.
# NOTE(review): `encode` is called with its defaults, so any special tokens
# the tokenizer adds by default are included in these counts — confirm that
# is the intended measure.
code_tokens = [len(tokenizer.encode(code)) for code in sample_codes]
summary_tokens = [len(tokenizer.encode(summary)) for summary in sample_summaries]

# Side-by-side histograms of token counts for code and summaries
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(code_tokens, bins=20, alpha=0.7)
plt.title('Distribution of Code Token Counts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(summary_tokens, bins=20, alpha=0.7)
plt.title('Distribution of Summary Token Counts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Summary figures over the sampled examples
print(f"Average code tokens: {sum(code_tokens)/len(code_tokens):.2f}")
print(f"Average summary tokens: {sum(summary_tokens)/len(summary_tokens):.2f}")
print(f"Max code tokens: {max(code_tokens)}")
print(f"Max summary tokens: {max(summary_tokens)}")