In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load your full dataset
df = pd.read_csv("education_policies.csv")  # replace with your dataset file

# Step 2: Split into train and test (or any two parts)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Save them as separate CSV files
train_df.to_csv("train_policies.csv", index=False)
test_df.to_csv("test_policies.csv", index=False)

print("✅ Dataset split and saved successfully!")

In [14]:
# Quantum NLP version of TF-IDF pipeline using Qiskit 2.2.2 (local simulation)

from qiskit import QuantumCircuit
from qiskit.circuit import ParameterVector
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import joblib

# -------------------- Paths --------------------
MODEL_PATH = "quantum_policy_kernel.pkl"
MATRIX_PATH = "quantum_policy_matrix.pkl"

# -------------------- Load datasets --------------------
train_df = pd.read_csv("train_policies.csv")
full_df = pd.read_csv("education_policies.csv")

# -------------------- Preprocess text --------------------
def preprocess(df):
    df = df.copy()
    df["text_for_nlp"] = (
        df["title"].astype(str) + ". " +
        df["full_text"].astype(str) + ". Stakeholders: " +
        df["stakeholders"].astype(str)
    ).str.lower()
    return df

train_df = preprocess(train_df)
full_df = preprocess(full_df)

# -------------------- Classical vectorization --------------------
max_features = 8  # small number for quantum encoding
vectorizer = TfidfVectorizer(max_features=max_features)
X_train_tfidf = vectorizer.fit_transform(train_df["text_for_nlp"]).toarray()
X_full_tfidf = vectorizer.transform(full_df["text_for_nlp"]).toarray()

# Normalize TF-IDF values to [0, π]
X_train_norm = np.pi * (X_train_tfidf / np.max(X_train_tfidf))
X_full_norm = np.pi * (X_full_tfidf / np.max(X_full_tfidf))

# -------------------- Parameterized quantum feature map --------------------
params = ParameterVector('x', max_features)
qc = QuantumCircuit(max_features)
for i in range(max_features):
    qc.ry(params[i], i)
qc.barrier()
for i in range(max_features - 1):
    qc.cx(i, i + 1)

# -------------------- Quantum kernel --------------------
quantum_kernel = FidelityQuantumKernel(feature_map=qc, enforce_psd=True)

# -------------------- Compute kernel matrix --------------------
kernel_matrix = quantum_kernel.evaluate(X_full_norm, X_full_norm)

# -------------------- Save results --------------------
joblib.dump(quantum_kernel, MODEL_PATH)
joblib.dump({"kernel_matrix": kernel_matrix, "df": full_df}, MATRIX_PATH)

print(f"✅ Quantum kernel model saved to {MODEL_PATH} and {MATRIX_PATH}")
print("Quantum similarity matrix shape:", kernel_matrix.shape)

✅ Quantum kernel model saved to quantum_policy_kernel.pkl and quantum_policy_matrix.pkl
Quantum similarity matrix shape: (500, 500)


In [15]:
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from qiskit.circuit import ParameterVector
from qiskit import QuantumCircuit
from qiskit_machine_learning.kernels import FidelityQuantumKernel

# -------------------- Load saved model and matrix --------------------
MODEL_PATH = "quantum_policy_kernel.pkl"
MATRIX_PATH = "quantum_policy_matrix.pkl"

quantum_kernel = joblib.load(MODEL_PATH)
saved_data = joblib.load(MATRIX_PATH)
full_df = saved_data["df"]
kernel_matrix_full = saved_data["kernel_matrix"]

print("✅ Loaded quantum kernel and full dataset")
print("Existing kernel matrix shape:", kernel_matrix_full.shape)

# -------------------- Test data --------------------
test_df = pd.DataFrame({
    "title": ["New Education Reform"],
    "full_text": ["Implement AI in classrooms and digital assessment."],
    "stakeholders": ["students, teachers, government"]
})

# -------------------- Preprocess text --------------------
def preprocess(df):
    df = df.copy()
    df["text_for_nlp"] = (
        df["title"].astype(str) + ". " +
        df["full_text"].astype(str) + ". Stakeholders: " +
        df["stakeholders"].astype(str)
    ).str.lower()
    return df

test_df = preprocess(test_df)

# -------------------- Classical vectorization --------------------
max_features = 8  # must match training
vectorizer = TfidfVectorizer(max_features=max_features)
# Fit on full dataset text to match training
vectorizer.fit(full_df["text_for_nlp"])

X_test_tfidf = vectorizer.transform(test_df["text_for_nlp"]).toarray()
X_full_tfidf = vectorizer.transform(full_df["text_for_nlp"]).toarray()

# Normalize to [0, π]
X_test_norm = np.pi * (X_test_tfidf / np.max(X_test_tfidf))
X_full_norm = np.pi * (X_full_tfidf / np.max(X_full_tfidf))

# -------------------- Compute similarity with full dataset --------------------
test_kernel = quantum_kernel.evaluate(X_test_norm, X_full_norm)

print("✅ Quantum similarity between test policy and full dataset:")
print(test_kernel)

✅ Loaded quantum kernel and full dataset
Existing kernel matrix shape: (500, 500)
✅ Quantum similarity between test policy and full dataset:
[[2.51554145e-04 5.77465322e-03 2.55195490e-03 1.37274161e-04
  2.78641860e-04 9.87692431e-05 4.66558336e-05 9.47823071e-04
  6.49262947e-05 7.76595023e-06 1.55017765e-03 2.04751014e-04
  1.99079496e-04 3.62503063e-03 3.48525668e-03 1.30302977e-04
  5.10248236e-04 6.18982607e-04 8.94210988e-05 7.23575562e-04
  1.82609146e-03 2.28822827e-03 5.77465322e-03 1.35709800e-04
  1.22407888e-05 9.25704257e-05 5.10248236e-04 5.10248236e-04
  3.48525668e-03 9.87692431e-05 1.03504763e-03 8.64189098e-06
  1.22407888e-05 6.15262515e-04 2.56607168e-03 2.28024379e-04
  1.22407888e-05 2.55195490e-03 9.57661432e-04 1.22407888e-05
  3.26997967e-04 3.48525668e-03 6.18982607e-04 1.01288920e-03
  7.10286362e-04 1.43085247e-03 3.18439021e-05 3.28329178e-03
  2.51686378e-04 9.02840255e-04 3.44038479e-03 2.28822827e-03
  9.95847207e-04 6.51068433e-04 9.25704257e-05 1.3022

In [16]:
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from qiskit.circuit import ParameterVector
from qiskit import QuantumCircuit
from qiskit_machine_learning.kernels import FidelityQuantumKernel

# -------------------- Paths --------------------
MODEL_PATH = "quantum_policy_kernel.pkl"
MATRIX_PATH = "quantum_policy_matrix.pkl"

# -------------------- Load trained quantum kernel --------------------
quantum_kernel = joblib.load(MODEL_PATH)
saved_data = joblib.load(MATRIX_PATH)
full_df = saved_data["df"]
kernel_matrix_full = saved_data["kernel_matrix"]

print("✅ Quantum kernel and dataset loaded.")

# -------------------- Preprocess function --------------------
def preprocess(df):
    df = df.copy()
    df["text_for_nlp"] = (
        df["title"].astype(str) + ". " +
        df["full_text"].astype(str) + ". Stakeholders: " +
        df["stakeholders"].astype(str)
    ).str.lower()
    return df

full_df = preprocess(full_df)

# -------------------- Vectorizer --------------------
max_features = 8  # must match training
vectorizer = TfidfVectorizer(max_features=max_features)
vectorizer.fit(full_df["text_for_nlp"])  # fit on training corpus

# -------------------- Function to retrieve top N similar policies --------------------
def retrieve_top_policies(query, top_n=3):
    # Preprocess query
    query_df = pd.DataFrame({"title": ["query"], "full_text": [query], "stakeholders": ["All"]})
    query_df = preprocess(query_df)
    
    # Vectorize and normalize
    X_query_tfidf = vectorizer.transform(query_df["text_for_nlp"]).toarray()
    X_query_norm = np.pi * (X_query_tfidf / np.max(X_query_tfidf))
    
    X_full_tfidf = vectorizer.transform(full_df["text_for_nlp"]).toarray()
    X_full_norm = np.pi * (X_full_tfidf / np.max(X_full_tfidf))
    
    # Compute quantum kernel similarity
    sim_scores = quantum_kernel.evaluate(X_query_norm, X_full_norm)[0]
    
    # Get top N indices
    top_indices = np.argsort(sim_scores)[::-1][:top_n]
    
    # Return top N policies
    return full_df.iloc[top_indices][["policy_id", "title", "sector", "region", "year", "impact_score"]]

# -------------------- Example usage --------------------
user_query = "teacher training programs and technology-enabled learning platforms"
top_policies = retrieve_top_policies(user_query)

print("Top 3 relevant policies for your query:")
print(top_policies)

✅ Quantum kernel and dataset loaded.
Top 3 relevant policies for your query:
    policy_id                                    title           sector  \
21      P1022  National Vocational Education Reform 86  Early Childhood   
317     P1318  National Vocational Education Reform 61       Vocational   
51      P1052     National Primary Education Reform 61       Vocational   

        region  year  impact_score  
21   Rajasthan  2024         0.497  
317     Kerala  2021         0.159  
51      Kerala  2023         0.139  


In [17]:
# -------------------- Retrieve top N policies ranked by similarity --------------------
def retrieve_top_policies_with_ranking(query, top_n=3):
    # Preprocess query
    query_df = pd.DataFrame({"title": ["query"], "full_text": [query], "stakeholders": ["All"]})
    query_df = preprocess(query_df)
    
    # Vectorize and normalize
    X_query_tfidf = vectorizer.transform(query_df["text_for_nlp"]).toarray()
    X_query_norm = np.pi * (X_query_tfidf / np.max(X_query_tfidf))
    
    X_full_tfidf = vectorizer.transform(full_df["text_for_nlp"]).toarray()
    X_full_norm = np.pi * (X_full_tfidf / np.max(X_full_tfidf))
    
    # Compute quantum kernel similarity
    sim_scores = quantum_kernel.evaluate(X_query_norm, X_full_norm)[0]
    
    # Rank policies by similarity
    ranked_indices = np.argsort(sim_scores)[::-1][:top_n]
    
    # Include similarity scores in the returned dataframe
    top_df = full_df.iloc[ranked_indices].copy()
    top_df["similarity_score"] = sim_scores[ranked_indices]
    
    return top_df

# -------------------- Example usage --------------------
user_query = "teacher training programs and technology-enabled learning platforms"
top_policies = retrieve_top_policies_with_ranking(user_query)

print("Top 3 relevant policies for your query ranked by similarity:\n")

for rank, row in enumerate(top_policies.itertuples(), start=1):
    paragraph = (
        f"Rank {rank}\n"
        f"Policy ID: {row.policy_id}\n"
        f"Title: {row.title}\n"
        f"Sector: {row.sector}, Region: {row.region}, Year: {row.year}\n"
        f"Impact Score: {row.impact_score:.3f}, Similarity Score: {row.similarity_score:.3f}\n"
        f"Summary: {row.summary}\n"
        f"Goals: {row.goals}\n"
        "------------------------------------------------------"
    )
    print(paragraph)

Top 3 relevant policies for your query ranked by similarity:

Rank 1
Policy ID: P1022
Title: National Vocational Education Reform 86
Sector: Early Childhood, Region: Rajasthan, Year: 2024
Impact Score: 0.497, Similarity Score: 0.078
Summary: Policy to enhance infrastructure and safety standards using public-private partnerships. Includes capacity-building, monitoring and community engagement.
Goals: Increase infrastructure score by 22% within 5 years and reduce disparities between rural and urban areas.
------------------------------------------------------
Rank 2
Policy ID: P1318
Title: National Vocational Education Reform 61
Sector: Vocational, Region: Kerala, Year: 2021
Impact Score: 0.159, Similarity Score: 0.078
Summary: Policy to enhance digital access and assessment quality using scholarship schemes. Includes capacity-building, monitoring and community engagement.
Goals: Increase digital access score by 8% within 2 years and reduce disparities between rural and urban areas.
----

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit.circuit import QuantumCircuit, ParameterVector

# -------------------- Load dataset --------------------
full_df = pd.read_csv("education_policies.csv")  # load your full dataset

# -------------------- Preprocess text --------------------
def preprocess(df):
    df = df.copy()
    df["text_for_nlp"] = (
        df["title"].astype(str) + ". " +
        df["full_text"].astype(str) + ". Stakeholders: " +
        df["stakeholders"].astype(str)
    ).str.lower()
    return df

full_df = preprocess(full_df)

# -------------------- Create dummy labels --------------------
# For demonstration, using 'sector' as a label
full_df['label'] = full_df['sector'].factorize()[0]  # converts sectors into integer labels

# -------------------- Vectorization --------------------
max_features = 8
vectorizer = TfidfVectorizer(max_features=max_features)
X = vectorizer.fit_transform(full_df["text_for_nlp"]).toarray()
X_norm = np.pi * (X / np.max(X))  # normalize for quantum kernel

y = full_df['label'].values

# -------------------- Train/test split --------------------
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

# -------------------- Load quantum kernel --------------------
quantum_kernel = joblib.load("quantum_policy_kernel.pkl")

# -------------------- Train SVM with quantum kernel --------------------
svc = SVC(kernel=quantum_kernel.evaluate)  # using quantum kernel
svc.fit(X_train, y_train)

# -------------------- Predict --------------------
y_pred = svc.predict(X_test)

# -------------------- Evaluation metrics --------------------
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print("Confusion Matrix:")
print(cm)