In [10]:
# ============================================================
# Agentic Reasoning System - Hybrid Symbolic + ML Pipeline
# ============================================================
# Requirements:
# pip install pandas numpy scikit-learn sentence-transformers joblib matplotlib

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sentence_transformers import SentenceTransformer, util
import joblib
import matplotlib.pyplot as plt

# ============================================================
# 1. Load Data
# ============================================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# assume columns: problem, option1, option2, option3, option4, correct
# train = train.dropna(subset=["problem"]) # Removed the dropna line

# ============================================================
# 2. Helper Functions
# ============================================================
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9+\-*/%()., ]", " ", text) # Fixed escape sequence
    return re.sub(r"\s+", " ", text).strip()

def is_math_question(q):
    return bool(re.search(r"\d+|percent|ratio|speed|distance|time|sum|average|mean|profit|loss|cost|simple|interest", q))

def symbolic_solver(question):
    """Evaluate simple arithmetic patterns."""
    try:
        expr = re.findall(r"[0-9+\-*/().% ]+", question) # Fixed escape sequence
        if expr:
            return eval(expr[0])
    except:
        return None
    return None

# ============================================================
# 3. Feature Preparation
# ============================================================
train["text"] = train["problem_statement"].apply(lambda x: clean_text(x)) # Use problem_statement
test["text"] = test["problem_statement"].apply(lambda x: clean_text(x)) # Use problem_statement

# join question + options
option_cols = [col for col in train.columns if 'answer_option' in col]
train["combined"] = train["text"] + " " + train[option_cols].astype(str).agg(' '.join, axis=1) # Use detected option columns and convert to string
test["combined"] = test["text"] + " " + test[option_cols].astype(str).agg(' '.join, axis=1) # Use detected option columns and convert to string

# ============================================================
# 4. TF-IDF + Logistic Regression (baseline)
# ============================================================
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train["combined"])
y_train = train["correct_option_number"] # Use correct_option_number

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

X_test = tfidf.transform(test["combined"])
ml_preds = model.predict(X_test)

# ============================================================
# 5. Semantic Similarity Layer (Sentence Transformer)
# ============================================================
encoder = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_reasoning(question, options):
    q_emb = encoder.encode(question, convert_to_tensor=True)
    opts_emb = encoder.encode(options, convert_to_tensor=True)
    sims = util.cos_sim(q_emb, opts_emb).cpu().numpy()[0]
    return np.argmax(sims) + 1  # option1..4 → 1..4

semantic_preds = []
for _, row in test.iterrows():
    options = [row[col] for col in option_cols]
    semantic_preds.append(semantic_reasoning(row["problem_statement"], options)) # Use problem_statement

# ============================================================
# 6. Rule-based Symbolic Reasoning Layer
# ============================================================
symbolic_preds = []
for _, row in test.iterrows():
    q = row["problem_statement"].lower() # Use problem_statement
    if is_math_question(q):
        try:
            val = symbolic_solver(q)
            if val is not None:
                # choose numeric option closest to result
                opts = []
                for col in option_cols:
                    o = str(row[col]) # Convert to string
                    m = re.sub("[^0-9.\-\\+]", "", o) # Fixed escape sequence and added + for positive numbers
                    opts.append(float(m) if m and m != '-' and m != '+' else 0) # Handle empty string or just '-' or '+'
                idx = int(np.argmin([abs(v - val) for v in opts])) + 1
                symbolic_preds.append(idx)
                continue
        except:
            pass
    symbolic_preds.append(semantic_preds[len(symbolic_preds)])  # fallback

# ============================================================
# 7. Ensemble Voting
# ============================================================
ensemble_preds = []
for i in range(len(test)):
    votes = [ml_preds[i], semantic_preds[i], symbolic_preds[i]]
    final = max(set(votes), key=votes.count)
    ensemble_preds.append(final)

# ============================================================
# 8. Evaluate (if ground truth available)
# ============================================================
if "correct_option_number" in test.columns:
    y_true = test["correct_option_number"]
    f1 = f1_score(y_true, ensemble_preds, average="macro")
    print(f"✅ Ensemble F1 Score: {f1:.4f}")

# ============================================================
# 9. Save Outputs
# ============================================================
out = pd.DataFrame({
    "id": test.index,
    "predicted_option": ensemble_preds
})
out.to_csv("output.csv", index=False)
print("Saved: output.csv")

# save models
joblib.dump(model, "tfidf_model.joblib")
joblib.dump(tfidf, "tfidf_vectorizer.joblib")

# ============================================================
# 10. Pie Chart for Report
# ============================================================
labels = ["TF-IDF ML", "Semantic", "Symbolic"]
sizes = [len([1 for i in range(len(test)) if ensemble_preds[i]==ml_preds[i]]),
         len([1 for i in range(len(test)) if ensemble_preds[i]==semantic_preds[i]]),
         len([1 for i in range(len(test)) if ensemble_preds[i]==symbolic_preds[i]])]

plt.figure(figsize=(5,5))
plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
plt.title("Contribution of Reasoning Components")
plt.savefig("component_pie_chart.png", dpi=200)
plt.close()
print("Saved: component_pie_chart.png")

print("✅ Agentic Reasoning System Complete")

  m = re.sub("[^0-9.\-\\+]", "", o) # Fixed escape sequence and added + for positive numbers


Saved: output.csv
Saved: component_pie_chart.png
✅ Agentic Reasoning System Complete
