In [1]:
!pip install transformers torch peft datasets pandas scikit-learn matplotlib requests -q

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import requests
import re

# Run on the GPU when torch can see one, otherwise on the CPU
# (author's note: the GPU allowance may already be used up).
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Remote folder that hosts the exercise data; the train and test CSVs both
# live directly under it.
base_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module8/exercise/'

train_url, test_url = (f"{base_url}{csv_name}" for csv_name in ('train.csv', 'test.csv'))

def download_file(url, filename, timeout=30):
    """Download a file from a URL and save it locally.

    Args:
        url: Address of the file to fetch.
        filename: Local path the response body is written to (binary mode,
            overwriting any existing file).
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled
            connection and hang the notebook cell.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as the CSV.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")

# Fetch both CSVs into the working directory, train first and then test.
for source_url, local_name in ((train_url, 'train.csv'), (test_url, 'test.csv')):
    download_file(source_url, local_name)

Downloaded train.csv
Downloaded test.csv


In [4]:
# Read the two CSVs that the download step wrote to the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

for split_label, split_frame in (("Train", train_data), ("Test", test_data)):
    print(f"{split_label} set size: {len(split_frame)}")

# Number of training problems per category, ordered by category name.
category_counts = train_data['category'].value_counts().sort_index()
print("\nTraining set category distribution:")
print(category_counts)

# Peek at the first ten rows to see what the problems look like.
print("\nSample training problems:")
print(train_data.head(10))

Train set size: 900
Test set size: 100

Training set category distribution:
category
algebra          150
arithmetic       153
fractions        143
geometry         155
percentage       152
word_problems    147
Name: count, dtype: int64

Sample training problems:
   id       category                                            problem  \
0   0     percentage                                Increase 109 by 25%   
1   1     arithmetic                                   What is 76 + 55?   
2   2  word_problems  Sarah has $286. She spends $128. How much mone...   
3   3       geometry  What is the circumference of a circle with rad...   
4   4       geometry   What is the volume of a cube with side length 3?   
5   5     percentage                                 What is 7% of 132?   
6   6  word_problems  John is 10 years old now. How old was he 15 ye...   
7   7      fractions                  What is 1/5 + 2/5? (decimal form)   
8   8     percentage                                What is 2

In [5]:
# This cell sets up the baseline.

# Accuracy with a small numerical tolerance
def check_accuracy(predictions, ground_truth, tolerance=0.01):
    """
    Calculate accuracy with tolerance for floating point comparisons.

    A prediction is counted as correct when it rounds to the same value as
    the truth at 2 decimal places, OR when the absolute difference between
    the two is <= tolerance.

    Args:
        predictions: Sequence of predicted numbers. A ``None`` entry (no
            answer available) is counted as incorrect.
        ground_truth: Sequence of expected numbers, paired with
            ``predictions`` by position.
        tolerance: Largest absolute difference still accepted as a match.

    Returns:
        The fraction of correct predictions, ``correct / len(predictions)``.
        Returns 0.0 when ``predictions`` is empty instead of dividing by zero.
        Pairing stops at the shorter sequence, so predictions without a
        matching truth count as incorrect.
    """
    total = len(predictions)
    if total == 0:
        return 0.0

    def _is_match(pred, truth):
        # A missing value can never match; this also keeps round() from raising.
        if pred is None or truth is None:
            return False
        return round(pred, 2) == round(truth, 2) or abs(pred - truth) <= tolerance

    correct = sum(1 for pred, truth in zip(predictions, ground_truth) if _is_match(pred, truth))
    return correct / total

# The baseline always predicts the mean of the training solutions.
mean_solution = train_data['solution'].mean()
print(f"Dummy model : {mean_solution:.2f}")

# Out of curiosity, score that constant prediction on the training set itself.
train_truth = train_data['solution'].tolist()
baseline_preds = [mean_solution for _ in train_truth]
baseline_acc = check_accuracy(baseline_preds, train_truth)
print(f"precision de baseline sur train: {baseline_acc:.2%}")

Dummy model : 150.79
precision de baseline sur train: 0.00%


In [6]:
# Number extraction
def extract_number_last(text):
    """Return the last number found in ``text`` as a float.

    A number is an optional minus sign, one or more digits, and an optional
    decimal part (for example ``-3``, ``42.`` or ``136.25``).

    Args:
        text: Text to scan, typically a model response.

    Returns:
        The last matching number as a float, or None when ``text`` is not a
        string or contains no number.
    """
    # re.findall raises TypeError on non-string input such as None.
    if not isinstance(text, str):
        return None
    matches = re.findall(r"-?\d+\.?\d*", text)
    if not matches:
        return None
    try:
        return float(matches[-1])
    except ValueError:
        # Only a conversion failure is treated as "no number"; a bare except
        # would also swallow KeyboardInterrupt and real bugs.
        return None

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM



# Reuse the device picked earlier in the notebook; only recompute it when this
# cell runs in a kernel where `device` has not been defined yet.
if 'device' not in globals():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "Qwen/Qwen2.5-Math-1.5B"  # reportedly works well on this task

print(f"Loading {model_name} in Float16 (FP16)...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release
    # (it warns "`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.float16,
    # Weight placement is delegated to the library.
    device_map="auto",
)

# Generation is later called with a pad token id, so make sure the tokenizer
# has a pad token; fall back to the end-of-sequence token when it has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"{model_name} loaded")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M params")

Loading Qwen/Qwen2.5-Math-1.5B in Float16 (FP16)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen/Qwen2.5-Math-1.5B loaded
Model size: 1543.7M params


In [8]:
import re

def extract_number_last(text):
    """Return the last number found in ``text`` as a float.

    A number is an optional minus sign, one or more digits, and an optional
    decimal part (for example ``-3``, ``42.`` or ``136.25``).

    Returns:
        The last matching number as a float, or None when ``text`` is not a
        string or contains no number.
    """
    # re.findall raises TypeError on non-string input such as None.
    if not isinstance(text, str):
        return None
    matches = re.findall(r"-?\d+\.?\d*", text)
    if not matches:
        return None
    try:
        return float(matches[-1])
    except ValueError:
        # Only a conversion failure means "no number"; a bare except would
        # also hide KeyboardInterrupt and real bugs.
        return None


def extract_number_final_tag(text):
    """Return the number that follows the first ``###FINALE###`` tag in ``text``.

    Looking for an explicit tag is more specific than taking the last number,
    so it should be more accurate when the model follows the requested format.
    When no tagged number is present, this falls back to
    ``extract_number_last``.

    Returns:
        The extracted number as a float, or None when ``text`` is not a
        string or contains no number at all.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"###FINALE###\s*(-?\d+\.?\d*)", text)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            return None
    return extract_number_last(text)


# Extractor used by the evaluation and prediction loops.
extract_number = extract_number_final_tag


def generate_answer(problem, prompt_template="simple", max_new_tokens=100, temperature=0.1):
    """Generate the model's free-text answer to a math problem.

    Args:
        problem: The problem statement.
        prompt_template: One of "simple", "instruction", "cot" or "few_shot".
            Any other value sends the problem text unchanged as the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value > 0 enables sampling at
            that temperature; 0 or None switches to greedy decoding.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace.
    """
    max_context_length = 1024

    if prompt_template == "simple":
        prompt = f"{problem}\nAnswer:"

    elif prompt_template == "instruction":
        prompt = f"Solve this math problem and provide only the numerical answer, rounded to 2 decimal places.\n\nProblem: {problem}\nAnswer:"

    elif prompt_template == "cot":
        prompt = (
            f"Solve this math problem step by step. Detail the 'Solution' with all calculation steps.\n"
            f"After your detailed solution, provide the final answer as a single number preceded by the tag '###FINALE###' and rounded to 2 decimal places.\n\n"
            f"Problem: {problem}\n"
            f"Solution:\n"
        )

    elif prompt_template == "few_shot":
        # NOTE: the examples are always the first three training rows, so asking
        # about one of those rows shows the model its own answer in the prompt.
        examples = []
        for i in range(min(3, len(train_data))):
            example_problem = train_data['problem'].iloc[i]
            example_solution = train_data['solution'].iloc[i]

            cot_example = (
                f"Problem: {example_problem}\n"
                f"Solution:\n"
                f"1. Analyse the problem.\n"
                f"2. Calculate: [Détails de calcul...]\n"
                f"###FINALE### {example_solution}"
            )
            examples.append(cot_example)

        examples_text = "\n\n".join(examples)

        prompt = f"{examples_text}\n\nSolve the next math problem step by step.\n\nProblem: {problem}\nSolution:\n"

    else:
        prompt = problem

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_context_length).to(model.device)
    # Number of prompt tokens actually fed to the model (after any truncation).
    prompt_length = inputs["input_ids"].shape[1]

    # Honour the caller's settings; these were previously hard-coded in the call.
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Greedy decoding: temperature is not passed because it has no effect.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens (the earlier
    # startswith(prompt) check relied on that too). Dropping them by length is
    # robust where comparing decoded text is not: truncation or a lossy decode
    # round-trip would otherwise leave the whole prompt in the response.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Smoke-test every prompt template on one training problem.
# NOTE: row 0 is also one of the examples the "few_shot" template puts in the
# prompt, so the few_shot result here is not an independent check.
test_problem = train_data['problem'].iloc[0]
test_solution = train_data['solution'].iloc[0]

separator = "=" * 70

print(f"\nTesting problem: {test_problem}")
print(f"Correct answer: {test_solution}\n")
print(separator)

for template in ("simple", "instruction", "cot", "few_shot"):
    response = generate_answer(test_problem, template)
    extracted = extract_number(response)

    # Same acceptance rule as check_accuracy: equal at 2 decimals or within 0.01.
    if extracted is None:
        is_match = False
    else:
        is_match = (
            round(extracted, 2) == round(test_solution, 2)
            or abs(extracted - test_solution) <= 0.01
        )
    correct = "✓" if is_match else "✗"

    # Only the first 100 characters of the response are shown.
    preview = response[:100] + ('...' if len(response) > 100 else '')

    print(f"{correct} {template}:")
    print(f"  Response: {preview}")
    print(f"  Extracted: {extracted}\n")

print(separator)


Testing problem: Increase 109 by 25%
Correct answer: 136.25

✓ simple:
  Response: 130.75

Increase 109 by 25%
To increase 109 by 25%, we need to find 25% of 109 and then add it to 10...
  Extracted: 136.25

✗ instruction:
  Response: ___________
To solve the problem of increasing 109 by 25%, we can follow these steps:

1. Calculate ...
  Extracted: 0.25

✗ cot:
  Response: 1. Convert 25% to a decimal: 25% = 0.25
2. Multiply 109 by 0.25 to find the increase: 109 * 0.25 = 2...
  Extracted: 136.2

✓ few_shot:
  Response: 1. Analyse the problem.
2. Calculate: [Détails de calcul...]
###FINALE### 136.25
  Extracted: 136.25



In [9]:
# The few-shot template looks promising, so evaluate it on more problems.
best_template = "few_shot"

# The last 50 training rows serve as the validation set.
val_data = train_data.tail(50).copy()
ground_truth = val_data['solution'].tolist()
predictions = []
print(f"Evaluating on {len(val_data)} validation problems using {best_template}...\n")

for n_done, problem in enumerate(val_data['problem'], start=1):
    response = generate_answer(problem, prompt_template=best_template)
    prediction = extract_number(response)

    # A response without any number is scored as 0.0.
    predictions.append(prediction if prediction is not None else 0.0)

    if n_done % 10 == 0:
        print(f"Processed {n_done}/{len(val_data)} problems...")

accuracy = check_accuracy(predictions, ground_truth)
print(f"\nValidation Accuracy: {accuracy:.2%}")


Evaluating on 50 validation problems using few_shot...

Processed 10/50 problems...
Processed 20/50 problems...
Processed 30/50 problems...
Processed 40/50 problems...
Processed 50/50 problems...

Validation Accuracy: 90.00%


In [10]:
# Great: the problem was the answer extraction all along.

In [13]:
import pandas as pd

best_template = "few_shot"

print(f"Part 8: Génération des prédictions sur {len(test_data)} problèmes de test en utilisant '{best_template}'...\n")
test_predictions = []

# Progress is counted with an explicit counter rather than the index label, so
# the messages stay correct whatever index test_data carries (this matches the
# validation loop, which counts processed problems).
for n_done, (idx, row) in enumerate(test_data.iterrows(), start=1):
    problem = row['problem']

    response = generate_answer(problem, prompt_template=best_template)
    prediction = extract_number(response)

    if prediction is None:
        # No number in the response: submit 0.0 and report which problem it was.
        prediction = 0.0
        print(f"⚠️  Warning: No number extracted for problem {idx}: {problem[:50]}...")

    test_predictions.append(prediction)

    if n_done % 10 == 0:
        print(f"Processed {n_done}/{len(test_data)} problems...")

print("\nAll test predictions generated!")

# Build the submission table: one row per test id. Solutions are rounded to
# two decimals (that rounding was missing in an earlier version).
submission = pd.DataFrame(
    {'id': test_data['id'], 'solution': test_predictions}
).assign(solution=lambda frame: frame['solution'].round(2))

# Save as CSV, forcing exactly two decimals on every float.
submission.to_csv('submission.csv', index=False, float_format="%.2f")
print("\nSubmission file created: submission.csv")

print("\nSubmission preview:")
print(submission.head(10))

# Sanity check: count solutions that are missing (NaN) in the submission.
non_numeric = submission['solution'].isna().sum()
if non_numeric == 0:
    print("\n✓ All predictions are numerical")
else:
    print(f"\n⚠️  WARNING: {non_numeric} predictions are not numerical!")
    print("These will result in incorrect answers. Please fix them.")

# Summary statistics of the submitted values.
solution_stats = submission['solution'].describe()
print("\nPrediction statistics:")
print(solution_stats)


Part 8: Génération des prédictions sur 100 problèmes de test en utilisant 'few_shot'...

Processed 10/100 problems...
Processed 20/100 problems...
Processed 30/100 problems...
Processed 40/100 problems...
Processed 50/100 problems...
Processed 60/100 problems...
Processed 70/100 problems...
Processed 80/100 problems...
Processed 90/100 problems...
Processed 100/100 problems...

All test predictions generated!

Submission file created: submission.csv

Submission preview:
   id  solution
0   0     98.10
1   1    314.00
2   2    224.00
3   3     96.50
4   4    102.00
5   5     91.20
6   6     69.44
7   7    400.00
8   8    560.00
9   9    295.50

✓ All predictions are numerical

Prediction statistics:
count     100.000000
mean       94.097800
std       162.716112
min         0.500000
25%         7.000000
50%        39.500000
75%       113.087500
max      1133.560000
Name: solution, dtype: float64


In [14]:
# Notes on what was tried:
# - A "tiny" model was a disaster: roughly 22% accuracy.
# - DeepSeek with chain-of-thought prompting and an extraction function that took the last number in the response was not great either, around 58%. That experiment was deleted from this notebook and has not been redone.
# - In the end, an extraction function that looks for a position marker and returns the number following it gave 90% on the training data.
# - Debug prints were needed to see that the extraction was the problem; they were deleted too and cannot be reproduced, probably for lack of GPU quota.