In [None]:
import csv
import json
import os

def csv_to_json(csv_files, output_dir):
    """Convert each CSV file into a JSON file of instruction/output records.

    For every path in ``csv_files`` a file named ``<stem>.json`` is written
    into ``output_dir``. It contains a JSON array with one object per CSV row,
    keeping only the ``instruction`` and ``output`` columns.

    Args:
        csv_files: Iterable of paths to CSV files whose header row contains
            ``instruction`` and ``output`` columns.
        output_dir: Existing directory that receives the JSON files.

    Raises:
        KeyError: If a row has no ``instruction`` or ``output`` column.
        OSError: If a CSV cannot be read or a JSON file cannot be written.
    """
    for csv_file in csv_files:
        stem = os.path.splitext(os.path.basename(csv_file))[0]
        json_file = os.path.join(output_dir, stem + '.json')

        # Reading and writing both happen per file, inside the loop, so every
        # CSV gets its own JSON output.
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(csv_file, 'r', newline='') as csvfile:
            data = [
                {'instruction': row['instruction'], 'output': row['output']}
                for row in csv.DictReader(csvfile)
            ]

        with open(json_file, 'w') as jsonfile:
            json.dump(data, jsonfile, indent=4)

# Convert every CSV found directly inside the data folder into a JSON file
# under the output folder (created on demand).
csv_directory = 'Data'
output_directory = 'Data/json'
csv_names = filter(lambda name: name.endswith('.csv'), os.listdir(csv_directory))
csv_files = [os.path.join(csv_directory, name) for name in csv_names]
os.makedirs(output_directory, exist_ok=True)
csv_to_json(csv_files, output_directory)

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments 
from trl import SFTTrainer 
# Load the causal-LM weights and the matching tokenizer from one checkpoint id.
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

def convert_item(item):
    """Merge one record's prompt and output into a single training string.

    Args:
        item: Mapping with string values under ``'prompt'`` and ``'output'``.

    Returns:
        A dict with one key, ``'text'``, holding the prompt, a newline, and
        the output, in that order.
    """
    combined = '\n'.join((item['prompt'], item['output']))
    return {'text': combined}

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    Every sequence is truncated or padded to exactly 512 tokens and the
    encoding is requested as PyTorch tensors.

    Args:
        examples: Mapping whose ``"text"`` entry holds the text to encode.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = dict(
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    return tokenizer(examples["text"], **encode_options)

def add_labels_column(example):
    """Add a ``'labels'`` entry that mirrors ``'input_ids'``.

    The mapping is modified in place and also returned. No copy is made, so
    ``'labels'`` refers to the same object as ``'input_ids'``.

    Args:
        example: Mutable mapping that already contains ``'input_ids'``.

    Returns:
        The same ``example`` mapping, now with a ``'labels'`` key.
    """
    token_ids = example['input_ids']
    example['labels'] = token_ids
    return example


# Fine-tune `model` on each JSON file in `json_dir`, one file after another,
# and save the resulting weights under new_models/<filename> after each file.
# NOTE(review): the same `model` object is reused for every file, so each saved
# model also carries the training from the files processed before it. Confirm
# this sequential behaviour is intended.
json_dir = "Data/json"
EOS_TOKEN = "<|EOT|>"
for filename in os.listdir(json_dir):
    if not filename.endswith(".json"):
        continue

    print(f'training model on: {filename}')

    # Only the read needs the file handle; close it before the long training run.
    with open(os.path.join(json_dir, filename), "r") as f:
        code = json.load(f)

    # A CSV with only a header row yields an empty list; there is nothing to train on.
    if not code:
        print(f'skipping {filename}: no records to train on')
        continue

    # Tag each side of the pair and terminate the answer with the end-of-turn marker.
    outputs = ["### Output" + row['output'] + EOS_TOKEN for row in code]
    prompts = ["### Instruction" + row['instruction'] for row in code]
    records = [{"prompt": s, "output": t} for s, t in zip(prompts, outputs)]

    # One training string per record: tagged prompt, newline, tagged output.
    converted_dataset = [convert_item(item) for item in records]
    dataset = Dataset.from_dict({'text': [item['text'] for item in converted_dataset]})
    text_column_values = dataset['text']

    dataset_dict_train = DatasetDict({'train': dataset})

    # Tokenizing drops the raw text column; it is re-attached further down.
    tokenized_datasets_train = dataset_dict_train.map(
        tokenize_function,
        batched=True,
        num_proc=2,
        remove_columns=["text"],
    )

    lm_datasets_train = tokenized_datasets_train.map(
        add_labels_column,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )

    # Final training table: token ids, attention mask, labels, and the original text.
    target_dataset = lm_datasets_train['train']
    target_dataset = target_dataset.add_column('text', text_column_values)
    target_dataset.set_format(
        type=target_dataset.format["type"],
        columns=['input_ids', 'attention_mask', 'labels', 'text'],
    )

    training_args = TrainingArguments(
        f"{filename}",  # output directory for checkpoints, named after the data file
        per_device_train_batch_size=4,
        learning_rate=2e-5,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        num_train_epochs=3,
        evaluation_strategy="no",
        save_strategy="steps",
        save_steps=100,
        fp16=True,
        optim="adamw_torch",
    )

    # No formatting_func is passed: `target_dataset` has no 'prompt' or 'output'
    # columns for such a function to read, and the 'text' column already holds
    # the fully tagged prompt/output string, so `dataset_text_field` is enough.
    # NOTE(review): the dataset carries both pre-tokenized columns (padded to
    # 512) and raw text while packing uses max_seq_length=256; which of the two
    # the trainer consumes appears to depend on the installed TRL version.
    # Confirm against the pinned version.
    trainer = SFTTrainer(
        model,
        train_dataset=target_dataset,
        packing=True,  # pack samples together for efficient training
        max_seq_length=256,
        args=training_args,
        dataset_text_field='text',
    )

    trainer.train()
    trainer.save_model(f"new_models/{filename}")
        
        