In [None]:
%%capture
# Install the released Unsloth package; %%capture hides the installer output of this cell.
!pip install unsloth
# Then force-reinstall Unsloth itself from its GitHub repository, without touching
# dependencies (--no-deps) and without using the pip cache (--no-cache-dir).
# NOTE(review): neither install is version-pinned, so a fresh run can pick up a
# different build than the "Unsloth 2025.2.4" shown in the recorded output below.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub. The token is looked up
# under the name HUGGINGFACE_API_TOKEN through Colab's `userdata` helper rather
# than being written into the notebook.
login(
    token=userdata.get('HUGGINGFACE_API_TOKEN'),
)


In [None]:
from unsloth import FastLanguageModel

# Loader settings. `max_seq_length` is also passed to the trainer further down.
max_seq_length = 2048
dtype = None         # no explicit dtype is requested
load_in_4bit = True  # NOTE(review): presumably selects 4-bit quantized weights - confirm in the Unsloth docs

# Download (if needed) and load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=userdata.get('HUGGINGFACE_API_TOKEN'),  # add your HF token
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.4: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.9k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Replace `model` with its PEFT-wrapped version (rank 16, alpha 16, no dropout,
# no bias terms, fixed random state).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2025.2.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Training-time prompt template. It has three positional `{}` slots, in order:
#   1. the financial data shown under "### Financial Data:",
#   2. the text placed between <think> and </think>,
#   3. the final analysis that follows </think>.
# It must therefore always be formatted with exactly three arguments.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the financial data and create a step-by-step chain of analysis to ensure a logical and accurate response.

### Instruction:
You are a financial expert with advanced knowledge in financial forecasting, risk analysis, and strategic financial planning.
Please analyze the following financial data and provide insights.

### Financial Data:
{}

### Analysis:
<think>
{}
</think>
{}"""


In [None]:
# End-of-sequence token string taken from the tokenizer; formatting_prompts_func
# appends it to every formatted training text.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of prompt/response pairs into training texts.

    Args:
        examples: Batch mapping with parallel sequences under the keys
            "prompt" (financial data) and "response" (target analysis).

    Returns:
        A dict with a single key "text" holding one rendered string per
        pair: `train_prompt_style` filled with the prompt, an empty
        <think> section and the response, followed by `EOS_TOKEN`.
    """
    pairs = zip(examples["prompt"], examples["response"])
    # The template has three slots; the middle (<think>) slot is left empty.
    rendered = [
        train_prompt_style.format(financial_data, '', analysis) + EOS_TOKEN
        for financial_data, analysis in pairs
    ]
    return {"text": rendered}

In [None]:
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd

# Location of the fine-tuning data and the columns formatting_prompts_func reads.
DATA_PATH = "/content/finance_dataset_updated.csv"
REQUIRED_COLUMNS = {"prompt", "response"}

# Load the CSV file using pandas
df = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the file does not have the expected schema.
missing_columns = REQUIRED_COLUMNS - set(df.columns)
assert not missing_columns, f"{DATA_PATH} is missing required columns: {sorted(missing_columns)}"

# Convert the DataFrame to a Hugging Face dataset directly in memory. This avoids
# writing a temporary CSV, re-reading it and shelling out to `rm`. The result is
# wrapped in a DatasetDict so that `dataset['train']` keeps working downstream.
dataset = DatasetDict({"train": Dataset.from_pandas(df, preserve_index=False)})

# or if you have multiple files :
# dataset = load_dataset('csv', data_files={'train': 'path/to/train.csv', 'test': 'path/to/test.csv'})

# Add the rendered "text" column used for supervised fine-tuning.
dataset = dataset.map(formatting_prompts_func, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Access the 'train' split of the dataset
train_dataset = dataset['train']

# Choose the mixed-precision mode once: bf16 when the helper reports support, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Hyper-parameters for a short run capped at 60 steps.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Use num_train_epochs = 1, warmup_ratio for full training runs!
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=10,
    report_to="none",  # Use this for WandB etc
)

# Supervised fine-tuning on the pre-rendered "text" column of the train split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,  # Pass the 'train' split
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
)

Map (num_proc=2):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop on the trainer built above and keep the value
# returned by train() in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,2.2172
20,0.5555
30,0.4289
40,0.3787
50,0.3808
60,0.3527


In [None]:
# Inference-time prompt template. It carries the same instructions as
# train_prompt_style but stops right after the opening <think> tag, so generation
# continues from there. It has two positional `{}` slots: the financial data and
# text appended directly after <think>.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the financial data and create a step-by-step chain of analysis to ensure a logical and accurate response.

### Instruction:
You are a financial expert with advanced knowledge in financial forecasting, risk analysis, and strategic financial planning.
Please analyze the following financial data and provide insights.

### Financial Data:
{}

### Analysis:
<think>{}"""


In [None]:
prompt = "Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25"

# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Wrap the raw statement in the instruction template used for fine-tuning.
# Feeding the bare statement gives the model none of the structure it was
# trained on, and it merely continues the text.
inputs = tokenizer([prompt_style.format(prompt, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# The returned sequences start with the prompt tokens; decode only what was
# generated after them so the template is not echoed back.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

# Print only the financial analysis without extra formatting: drop everything up
# to and including the closing </think> tag when the model emitted one.
analysis = response[0].split("</think>", 1)[-1].strip()
print(analysis)


Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25

Cash Flow Statement Operating Activities: $765699.28 Investing Activities: -$358440.9 Financing Activities: $148085.0 Net Cash Flow: $558456.25

Please provide a step-by-step chain of analysis to ensure a logical and accurate chain of analysis.
<think>

Your cash flow statement indicates positive cash flow from operating activities, which is a good sign of operational efficiency. However, the negative cash flow from investing activities suggests high capital expenditures. Ensure that these investments are yielding expected returns. The positive cash flow from financing activities indicates new inflows, possibly from debt or equity financing, which should be managed carefully to avoid over-leverage.


In [None]:
prompt = "Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25"

# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Format the statement with the instruction template; a bare statement carries
# none of the structure used during fine-tuning.
inputs = tokenizer([prompt_style.format(prompt, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

# Skip the prompt tokens at the start of the returned sequence so that only the
# generated continuation is decoded.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

# Print only the response
print(response[0])


Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25
Cash Flow Statement Operating Activities: $484703.68 Investing Activities: -$196700.05 Financing Activities: $122900.03 Net Cash Flow: $344614.85
Balance Sheet Assets: $1032664.89 Liabilities: $579693.35 Equity: $340683.17
Income Statement Revenue: $2017382.13 COGS: $1017206.5 Gross Profit: $995263.7 Operating Expenses: $583672.17 Net Income: $374693.25

### Cash Flow Statement Analysis

Cash Flow from Operating Activities: $558456.25 Investing Activities: -$196700.05 Financing Activities: $148085.0 Net Cash Flow: $344614.85

### Income Statement Analysis

Revenue: $2017382.13 COGS: $1017206.5 Gross Profit: $995263.7 Operating Expenses: $583672.17 Net Income: $374693.25

### Balance Sheet Analysis

Assets: $1032664.89 Liabilities: $579693.35 Equity: $340683.17

Please analyze the financial statements and cash flow statement to provide insights

In [None]:
prompt = "Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25"


FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([prompt_style.format(prompt, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)

# prompt_style introduces the model's answer with "### Analysis:" (it contains no
# "### Response:" header), so that is the marker to cut on. partition() splits at
# the first occurrence only, which keeps everything the model wrote after it.
ANALYSIS_MARKER = "### Analysis:"
_, found_marker, analysis = response[0].partition(ANALYSIS_MARKER)

if found_marker:
    print(analysis)
else:
    # Marker missing (unexpected): fall back to printing the entire decoded text.
    print(response[0])

<｜begin▁of▁sentence｜>Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the financial data and create a step-by-step chain of analysis to ensure a logical and accurate response.

### Instruction:
You are a financial expert with advanced knowledge in financial forecasting, risk analysis, and strategic financial planning.
Please analyze the following financial data and provide insights.

### Financial Data:
Cash Flow Statement Operating Activities: $765699.27 Investing Activities: -$358440.91 Financing Activities: $148085.0 Net Cash Flow: $558456.25

### Analysis:
<think>

</think>
Your cash flow statement indicates positive cash flow from operating activities, which is a good sign of operational efficiency. However, the negative cash flow from investing activities suggests high capital expenditures. Ensure that these investments are yielding 