In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from collections import defaultdict

# --- Load -------------------------------------------------------------------
df = pd.read_csv("final_flat_dataset.csv")

# --- Keep usable (diagnosis, drug) pairs, one row per distinct pair ---------
# A row is usable when the drug is neither missing nor the empty string and
# the diagnosis is not missing.
has_drug = df['drug'].notna() & (df['drug'] != "")
has_diagnosis = df['diagnosis'].notna()
df_expanded = df.loc[has_drug & has_diagnosis, ['diagnosis', 'drug']].drop_duplicates()

# --- Binary diagnosis x drug matrix (1 = pair observed, 0 = not observed) ---
binary_df = (
    df_expanded
    .assign(value=1)
    .pivot_table(index='diagnosis', columns='drug', values='value', fill_value=0)
)

# --- Back to long format for Surprise ---------------------------------------
# Only the observed pairs (value > 0) survive the filter below.
# NOTE(review): as a consequence every rating handed to Surprise equals 1.
# A similarity-weighted average of neighbours' ratings presumably cannot
# differ from 1 in that case, which would make the predicted scores
# uninformative for ranking — confirm whether the zero cells were meant to
# be kept.
binary_long = (
    binary_df
    .reset_index()
    .melt(id_vars='diagnosis', var_name='drug', value_name='value')
    .loc[lambda long_df: long_df['value'] > 0]
)

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(binary_long[['diagnosis', 'drug', 'value']], reader)

# --- Hold out 20% of the pairs; fixed seed for a repeatable split -----------
trainset, testset = train_test_split(data, test_size=0.2, random_state=123)

# --- Item-based collaborative filtering (IBCF) with cosine similarity -------
# user_based=False makes Surprise compute similarities between items (drugs)
# rather than between users (diagnoses).
sim_options = dict(name="cosine", user_based=False)

model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

# Generate top-N predictions
def get_top_n(predictions, n=10):
    """Return, for every user, the ids of their ``n`` highest-scored items.

    Parameters
    ----------
    predictions : iterable
        Records that unpack into exactly five fields
        ``(uid, iid, true_r, est, details)``. Only ``uid``, ``iid`` and
        ``est`` are used.
    n : int, default 10
        Maximum number of item ids kept per user.

    Returns
    -------
    collections.defaultdict
        Maps each ``uid`` to a list of item ids ordered by ``est`` from
        highest to lowest. Items with equal estimates keep the order in
        which they appeared in ``predictions``.
    """
    candidates = defaultdict(list)
    for record in predictions:
        user_id, item_id, _true_r, estimate, _details = record
        candidates[user_id].append((item_id, estimate))

    # Snapshot the keys so each entry can be replaced while looping.
    for user_id in list(candidates):
        ranked = sorted(candidates[user_id], key=lambda pair: pair[1], reverse=True)
        candidates[user_id] = [item_id for item_id, _score in ranked[:n]]
    return candidates

# Score every held-out (diagnosis, drug) pair, then keep the best-scored
# drugs per diagnosis at each cutoff.
# NOTE(review): the candidates ranked here are only the pairs in `testset`,
# which appears to hold observed (positive) pairs exclusively. A top-N list
# built this way cannot contain an irrelevant drug, which is consistent with
# the recorded output showing FP = 0 and precision = 1.0 at every cutoff —
# confirm whether unseen drugs should be ranked as well.
predictions = model.test(testset)
topn_preds = dict(
    (cutoff, get_top_n(predictions, n=cutoff)) for cutoff in (1, 3, 5, 10)
)

# Evaluate: binary relevance metrics
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``0`` when the denominator is 0."""
    return numerator / denominator if denominator else 0


def calc_confusion(preds_dict, true_items_dict, all_items):
    """Summarise top-N recommendations as pooled confusion-matrix metrics.

    For each cutoff in ``preds_dict`` and each user that has a recommendation
    list at that cutoff, every item of ``all_items`` is classified as

    * TP - recommended and relevant
    * FP - recommended but not relevant
    * FN - relevant but not recommended
    * TN - neither recommended nor relevant

    The counts are summed over users before the ratios are computed.

    Parameters
    ----------
    preds_dict : dict
        Maps a cutoff ``n`` to ``{user id: list of recommended item ids}``.
    true_items_dict : dict
        Maps a user id to the collection of item ids relevant for that user.
        A user missing from this mapping is treated as having no relevant
        items.
    all_items : iterable
        The item universe to classify. It is materialised once, so a one-shot
        iterable (generator, iterator) is handled correctly. Item ids must be
        hashable.

    Returns
    -------
    pandas.DataFrame
        One row per cutoff, with columns ``n``, ``TP``, ``FP``, ``FN``,
        ``TN``, ``precision``, ``recall``, ``F1``, ``FPR`` and ``TPR``
        (``TPR`` is the same value as ``recall``). A ratio whose denominator
        is zero is reported as ``0``.

    Notes
    -----
    Only users present in the per-cutoff prediction mapping are evaluated;
    a user that appears only in ``true_items_dict`` contributes nothing.
    Recommended or relevant items that are not in ``all_items`` are ignored.
    """
    # The universe is re-scanned for every user and cutoff. Materialising it
    # once keeps a generator/iterator from being exhausted by the first pass.
    item_universe = list(all_items)

    metrics = []
    for n, preds in preds_dict.items():
        TP = FP = FN = TN = 0
        for uid, preds_n in preds.items():
            # Sets give O(1) membership tests inside the per-item loop.
            recommended = set(preds_n)
            relevant = set(true_items_dict.get(uid, []))
            for item in item_universe:
                is_recommended = item in recommended
                is_relevant = item in relevant
                if is_recommended and is_relevant:
                    TP += 1
                elif is_recommended:
                    FP += 1
                elif is_relevant:
                    FN += 1
                else:
                    TN += 1

        precision = _safe_ratio(TP, TP + FP)
        recall = _safe_ratio(TP, TP + FN)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)
        fpr = _safe_ratio(FP, FP + TN)
        metrics.append({'n': n, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
                        'precision': precision, 'recall': recall, 'F1': f1,
                        'FPR': fpr, 'TPR': recall})
    return pd.DataFrame(metrics)

# Ground truth: for each diagnosis, the held-out drugs with a positive rating.
true_items_dict = defaultdict(list)
for uid, iid, true_r in testset:
    if true_r > 0:
        true_items_dict[uid].append(iid)

# Item universe used for the confusion counts: every drug in the filtered data.
all_items = set(df_expanded['drug'])

df_metrics = calc_confusion(topn_preds, true_items_dict, all_items)
df_metrics.to_csv("ibcf_summary_metrics.csv", index=False)

# Report only what this cell actually wrote: the metrics CSV. The model,
# similarity matrix and binary matrix are not exported here.
print("✅ IBCF evaluation metrics saved to ibcf_summary_metrics.csv.")
print(df_metrics)

Computing the cosine similarity matrix...
Done computing similarity matrix.
✅ CF model, similarity matrix, and binary matrix exported successfully.
    n   TP  FP   FN     TN  precision    recall        F1  FPR       TPR
0   1   67   0  587  39010        1.0  0.102446  0.185853  0.0  0.102446
1   3  195   0  459  39010        1.0  0.298165  0.459364  0.0  0.298165
2   5  301   0  353  39010        1.0  0.460245  0.630366  0.0  0.460245
3  10  482   0  172  39010        1.0  0.737003  0.848592  0.0  0.737003


In [3]:
import pickle

# Persist the fitted KNN model to disk so it can be reused without refitting.
with open("knn_ibcf_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# To restore it later (e.g. in app.py). Unpickling executes code embedded in
# the file, so only load pickles from a trusted source.
# with open("knn_ibcf_model.pkl", "rb") as model_file:
#     model = pickle.load(model_file)

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the CF matrix; the first CSV column becomes the row index.
# Presumably rows are diagnoses and columns are drugs — TODO confirm.
# NOTE(review): no visible cell in this notebook writes this file; confirm
# where it is produced so a fresh Restart & Run All can succeed.
cf_matrix = pd.read_csv("cf_diagnosis_drug_matrix.csv", index_col=0)

# Transposing puts one matrix column per row, so the pairwise cosine
# similarity is computed between columns (item-based CF).
drug_labels = cf_matrix.columns
similarity_matrix = cosine_similarity(cf_matrix.transpose())

# Binary .npy dump: compact, but carries no row/column labels.
np.save("cf_similarity_matrix.npy", similarity_matrix)

# Labelled, human-readable copy of the same matrix.
sim_df = pd.DataFrame(similarity_matrix, index=drug_labels, columns=drug_labels)
sim_df.to_csv("cf_similarity_matrix.csv")

print("✅ Saved: cf_similarity_matrix.npy and cf_similarity_matrix.csv")

✅ Saved: cf_similarity_matrix.npy and cf_similarity_matrix.csv
