# KYC Risk Classification using LLaMA Model
This notebook fine-tunes a LLaMA model on Know Your Customer (KYC) data and uploads it to the Hugging Face Hub. You can then load the uploaded model for risk-classification tasks in orchestrated workflows such as Airflow.

In [None]:
# Install necessary libraries
!pip install transformers datasets scikit-learn torch huggingface-hub beautifulsoup4 requests

## Step 1: Load the Dataset and Prepare Training Data

In [None]:
import inspect

import pandas as pd
from datasets import Dataset
from huggingface_hub import HfApi, upload_folder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    LlamaForSequenceClassification,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
)

# Toy KYC examples: each free-text company description is paired with the
# decision string at the same position in "label".
data = {
    "text": [
        "ABC Corp is a business in the healthcare sector. No issues detected.",
        "XYZ Inc operates in the cannabis sector with past lawsuits.",
        "DEF Corp in the insurance sector, low income, low credit score.",
    ],
    "label": ["Approved", "Rejected", "Review Required"],
}

# Integer class id assigned to each decision string.
label_map = {'Approved': 0, 'Review Required': 1, 'Rejected': 2}

# Build the frame and replace every decision string with its class id in one pass.
df = pd.DataFrame(data).assign(
    label=lambda frame: frame['label'].map(label_map)
)

# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split identical across runs.
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize input
# "meta-llama/Llama-3-7b" is not a published checkpoint (Llama 3 is released in
# 8B and 70B sizes). Llama 3 tokenizers are shipped as tokenizer.json, which the
# SentencePiece-based LlamaTokenizer cannot load, so AutoTokenizer is used to
# resolve the matching tokenizer class.
# NOTE(review): the meta-llama repos are gated — presumably the runtime must be
# logged in to the Hub with an account that accepted the license; confirm.
MODEL_ID = 'meta-llama/Meta-Llama-3-8B'
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Llama tokenizers define no padding token, so padding=True would raise a
# ValueError; reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Keep the attention mask alongside the token ids so the model can ignore the
# padded positions instead of attending to them.
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels,
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels,
})

## Step 2: Fine-Tune the LLaMA Model

In [None]:
# Load the base checkpoint the tokenizer was loaded from, so the model and the
# tokenizer always share a vocabulary, and attach a classification head with
# one output per entry in label_map.
model = LlamaForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=len(label_map),
)

# LlamaForSequenceClassification classifies from the last non-padding token and
# raises on batches larger than one when config.pad_token_id is unset. Fall back
# to the EOS id when the tokenizer has no dedicated padding token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = (
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    )

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./llama_kyc_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate on val_dataset after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

## Step 3: Save and Upload the Model to Hugging Face

In [None]:
# Save locally
MODEL_DIR = './llama_kyc_model'
# Placeholder: replace with your own Hub namespace before running.
REPO_ID = "your-huggingface-username/llama-kyc-classifier"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Upload to Hugging Face
# NOTE(review): private=False publishes the fine-tuned weights publicly; confirm
# that is acceptable for a model trained on KYC data.
api = HfApi()
# exist_ok=True keeps this cell re-runnable once the repository has been created.
api.create_repo(repo_id=REPO_ID, private=False, exist_ok=True)
# MODEL_DIR is also the Trainer output_dir, so skip intermediate checkpoints
# (optimizer state) and logging runs that the Trainer may have written there.
upload_folder(
    repo_id=REPO_ID,
    folder_path=MODEL_DIR,
    ignore_patterns=["checkpoint-*/*", "runs/*"],
)

print("Model uploaded successfully!")