In [37]:
import pandas as pd

# Load dataset
df = pd.read_csv('OTC-Data.csv')

# Basic information
print("Dataset Overview:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())

# Check categorical columns
print("\nCategorical column details:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"\n{col} unique values:\n", df[col].unique())


Dataset Overview:
                                            Best OTC  \
0  Among those OTC products you just rated above,...   
1                                Electric stimulator   
2                            Acetaminophen (Tylenol)   
3                  Ibuprofen (Advil, Motrin, Nuprin)   
4                               Knee brace with heat   

                                         OTC PrePain  \
0  Please rate the level of knee joint pain you e...   
1                                                  6   
2                                                  7   
3                                                  5   
4                                                  5   

                                        OTC PostPain  \
0  Please rate the level of knee joint pain you e...   
1                                                 10   
2                                                  9   
3                                                  7   
4                           

In [43]:
# preprocessing.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

# 1. Load CSV, strip whitespace, rename columns
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})

# 2. Numeric conversion & drop rows missing numeric inputs
num_cols = ['otc_prepain', 'otc_postpain', 'age', 'height', 'weight']
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.dropna(subset=num_cols)

# 3. Drop rows missing the target
df = df[df['best_otc'].notna()]

# 4. Split features/target and do train/test split (no stratify)
X = df.drop(columns=['best_otc'])
y = df['best_otc']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Build preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_cols = [
    'otc_sleep', 'otc_cause', 'otc_pain_location',
    'otc_pain_time', 'otc_cocomt_symptom',
    'gender', 'ethnicity', 'race'
]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# 6. Fit & transform training data, then save
X_train_prepared = preprocessor.fit_transform(X_train)
joblib.dump(preprocessor, 'otc_preprocessor.pkl')

# 7. Diagnostic prints
print("✅ Preprocessor saved to otc_preprocessor.pkl")
print(f"Dataset shape after cleaning: {df.shape}")
print(f"X_train shape before prep: {X_train.shape}")
print(f"X_train shape after prep : {X_train_prepared.shape}")


✅ Preprocessor saved to otc_preprocessor.pkl
Dataset shape after cleaning: (1601, 14)
X_train shape before prep: (1280, 13)
X_train shape after prep : (1280, 700)


In [46]:
# model_training_adasyn.py

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Reload & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'])

# 2. (Optional) drop singleton classes if you still have any
vc = df['best_otc'].value_counts()
singletons = vc[vc == 1].index
if len(singletons):
    print(f"Dropping singleton-target classes: {list(singletons)}")
    df = df[~df['best_otc'].isin(singletons)]

# 3. Split raw features
X = df.drop(columns=['best_otc'])
y = df['best_otc']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Load & apply preprocessor
preprocessor = joblib.load('otc_preprocessor.pkl')
X_train_prep = preprocessor.transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

# 5. Apply ADASYN
adasyn = ADASYN(random_state=42)
X_res, y_res = adasyn.fit_resample(X_train_prep, y_train)
print(f"After ADASYN: {X_res.shape[0]} samples (was {X_train_prep.shape[0]})")

# 6. Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_res, y_res)

# 7. Evaluate
y_pred = clf.predict(X_test_prep)
print(f"\nAccuracy on test set: {accuracy_score(y_test, y_pred):.3%}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))




Dropping singleton-target classes: ['Cortex daphnes (zushima patches) ', '31', 'Foam roller']


ValueError: No samples will be generated with the provided ratio settings.

In [47]:
from collections import Counter
print("Before ADASYN:", Counter(y_train))


Before ADASYN: Counter({'Acetaminophen (Tylenol)': 193, '   Ibuprofen (Advil, Motrin, Nuprin)': 190, 'Voltaren Gel (Diclofenac gel)': 172, '   Naproxen (Aleve, Anaprox, Naprosyn)': 133, 'Heating pad': 62, 'Biofreeze Pain Relief Gel': 62, 'Ice pack': 51, 'Icy Hot Cream/gel': 45, 'Aspercreme with lidocaine (cream)': 40, 'Orthosis (brace, arch support, strap, etc.)': 40, 'Salonpas Lidocane Paine Relieving Gel-patch ': 38, 'Acetaminophen & aspirin combination (Excedrin)': 32, '   Glucosamine (e.g., Move Free)': 27, 'Turmeric/curcumin': 20, '   Tiger balm (camphor, menthol) ': 19, 'Massager with heat': 15, 'Electric stimulator': 14, 'Blue Emu arthritis cream': 13, 'Bengay Cream': 12, 'Aspirin (Bayer)': 12, 'The product I liked best is not listed above. Please specify below:': 9, 'Capsaicin creams, gels, or patches (Capzasin)': 9, 'Collagen': 9, 'Fish oil': 9, 'Knee brace with magnets': 8, 'Knee brace with heat': 7, 'Hyaluronic Acid': 7, 'Chondroitin': 6, 'Massager with infrared light': 5, '

In [51]:
from collections import Counter
from imblearn.over_sampling import ADASYN

# after loading & cleaning, splitting, and preprocessing:
# X_train_prep, y_train

counts = Counter(y_train)
majority = max(counts.values())           # e.g. ~193
target_ratio = 0.5                        # bring each minority to 50% of majority
sampling_strategy = {
    cls: int(majority * target_ratio)
    for cls, cnt in counts.items()
    if cnt < majority * target_ratio
}

print("Pre-ADASYN class counts:", counts)
print("Sampling strategy dict:", sampling_strategy)

ada = ADASYN(
    sampling_strategy=sampling_strategy,
    n_neighbors=1,   # only need 1 neighbor so tiny classes (≥2 samples) work
    random_state=42
)

X_res, y_res = ada.fit_resample(X_train_prep, y_train)
print("Post-ADASYN class counts:", Counter(y_res))


Pre-ADASYN class counts: Counter({'Acetaminophen (Tylenol)': 193, '   Ibuprofen (Advil, Motrin, Nuprin)': 190, 'Voltaren Gel (Diclofenac gel)': 172, '   Naproxen (Aleve, Anaprox, Naprosyn)': 133, 'Heating pad': 62, 'Biofreeze Pain Relief Gel': 62, 'Ice pack': 51, 'Icy Hot Cream/gel': 45, 'Aspercreme with lidocaine (cream)': 40, 'Orthosis (brace, arch support, strap, etc.)': 40, 'Salonpas Lidocane Paine Relieving Gel-patch ': 38, 'Acetaminophen & aspirin combination (Excedrin)': 32, '   Glucosamine (e.g., Move Free)': 27, 'Turmeric/curcumin': 20, '   Tiger balm (camphor, menthol) ': 19, 'Massager with heat': 15, 'Electric stimulator': 14, 'Blue Emu arthritis cream': 13, 'Bengay Cream': 12, 'Aspirin (Bayer)': 12, 'The product I liked best is not listed above. Please specify below:': 9, 'Capsaicin creams, gels, or patches (Capzasin)': 9, 'Collagen': 9, 'Fish oil': 9, 'Knee brace with magnets': 8, 'Knee brace with heat': 7, 'Hyaluronic Acid': 7, 'Chondroitin': 6, 'Massager with infrared li

In [53]:
# model_training_adasyn_custom.py

import pandas as pd
import joblib
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, top_k_accuracy_score

# 1. Reload & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'])

# 2. (Optional) Drop any singleton classes
vc = df['best_otc'].value_counts()
singletons = vc[vc == 1].index
if len(singletons):
    print(f"Dropping singleton classes: {list(singletons)}")
    df = df[~df['best_otc'].isin(singletons)]

# 3. Split into train/test (raw features)
X = df.drop(columns=['best_otc'])
y = df['best_otc']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Preprocess
preprocessor = joblib.load('otc_preprocessor.pkl')
X_train_prep = preprocessor.transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

# 5. Build ADASYN sampling_strategy to 50% of majority
counts   = Counter(y_train)
majority = max(counts.values())
target_ratio = 0.5  # 50% of majority
sampling_strategy = {
    cls: int(majority * target_ratio)
    for cls, cnt in counts.items()
    if cnt < majority * target_ratio
}
print("Pre-ADASYN counts:", counts)
print("Sampling strategy:", sampling_strategy)

# 6. Apply ADASYN with n_neighbors=1
ada = ADASYN(
    sampling_strategy=sampling_strategy,
    n_neighbors=1,
    random_state=42
)
X_res, y_res = ada.fit_resample(X_train_prep, y_train)
print("Post-ADASYN counts:", Counter(y_res))

# 7. Train RandomForest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_res, y_res)

# 8. Evaluate
y_pred  = clf.predict(X_test_prep)
y_proba = clf.predict_proba(X_test_prep)

print(f"\nOverall accuracy: {accuracy_score(y_test, y_pred):.3%}")
print("Classification report:\n", classification_report(y_test, y_pred))

# Top-3 accuracy needs the full label set
top3 = top_k_accuracy_score(
    y_test,
    y_proba,
    k=3,
    labels=clf.classes_
)
print(f"Top-3 accuracy: {top3:.3%}")



Dropping singleton classes: ['Cortex daphnes (zushima patches) ', '31', 'Foam roller']
Pre-ADASYN counts: Counter({'Acetaminophen (Tylenol)': 193, '   Ibuprofen (Advil, Motrin, Nuprin)': 190, 'Voltaren Gel (Diclofenac gel)': 172, '   Naproxen (Aleve, Anaprox, Naprosyn)': 133, 'Heating pad': 62, 'Biofreeze Pain Relief Gel': 62, 'Ice pack': 51, 'Icy Hot Cream/gel': 45, 'Aspercreme with lidocaine (cream)': 40, 'Orthosis (brace, arch support, strap, etc.)': 40, 'Salonpas Lidocane Paine Relieving Gel-patch ': 38, 'Acetaminophen & aspirin combination (Excedrin)': 32, '   Glucosamine (e.g., Move Free)': 27, 'Turmeric/curcumin': 20, '   Tiger balm (camphor, menthol) ': 19, 'Massager with heat': 15, 'Electric stimulator': 14, 'Blue Emu arthritis cream': 13, 'Bengay Cream': 12, 'Aspirin (Bayer)': 12, 'The product I liked best is not listed above. Please specify below:': 9, 'Capsaicin creams, gels, or patches (Capzasin)': 9, 'Collagen': 9, 'Fish oil': 9, 'Knee brace with magnets': 8, 'Knee brace 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
# model_training_catboost_grouped.py

import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, top_k_accuracy_score

# 1. Load & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})

# 2. Convert numeric columns and drop rows missing them or the target
for c in ['otc_prepain', 'otc_postpain', 'age', 'height', 'weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.dropna(subset=['otc_prepain', 'otc_postpain', 'age', 'height', 'weight', 'best_otc'])

# 3. Group rare classes (<5 samples) into "Other"
counts = df['best_otc'].value_counts()
rare_labels = counts[counts < 5].index
df['best_otc'] = df['best_otc'].apply(lambda x: 'Other' if x in rare_labels else x)

# 4. Feature engineering
# 4a. BMI
df['bmi'] = (df['weight'] * 0.453592) / ((df['height'] * 0.0254) ** 2)
# 4b. Symptom count
df['symptom_count'] = df['otc_cocomt_symptom'].apply(
    lambda x: 0 if pd.isna(x) or 'None' in x else len(x.split(','))
)
# 4c. Drop raw symptom column
df = df.drop(columns=['otc_cocomt_symptom'])

# 5. Split into train/test
X = df.drop(columns=['best_otc'])
y = df['best_otc']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Identify categorical features
cat_features = [
    'otc_sleep', 'otc_cause', 'otc_pain_location',
    'otc_pain_time', 'gender', 'ethnicity', 'race'
]

# 7. Build CatBoost Pools
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool  = Pool(X_test,  label=y_test,  cat_features=cat_features)

# 8. Train CatBoost
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)
model.fit(train_pool, eval_set=test_pool)

# 9. Evaluate
y_pred  = model.predict(test_pool)
y_proba = model.predict_proba(test_pool)

print(f"\nOverall accuracy: {accuracy_score(y_test, y_pred):.3%}")
print("Classification report:\n", classification_report(y_test, y_pred))

top3 = top_k_accuracy_score(
    y_test,
    y_proba,
    k=3,
    labels=model.classes_
)
print(f"Top-3 accuracy: {top3:.3%}")




0:	learn: 0.1484375	test: 0.1464174	best: 0.1464174 (0)	total: 66.1ms	remaining: 33s
100:	learn: 0.4453125	test: 0.1401869	best: 0.1588785 (22)	total: 10.6s	remaining: 42s
200:	learn: 0.7671875	test: 0.1619938	best: 0.1713396 (116)	total: 22.2s	remaining: 33s
300:	learn: 0.9320312	test: 0.1495327	best: 0.1713396 (116)	total: 34.2s	remaining: 22.6s
400:	learn: 0.9914063	test: 0.1495327	best: 0.1713396 (116)	total: 45.7s	remaining: 11.3s
499:	learn: 0.9992188	test: 0.1401869	best: 0.1713396 (116)	total: 57.2s	remaining: 0us

bestTest = 0.1713395639
bestIteration = 116

Shrink model to first 117 iterations.

Overall accuracy: 17.134%
Classification report:
                                                                precision    recall  f1-score   support

                                Glucosamine (e.g., Move Free)       0.00      0.00      0.00         2
                            Ibuprofen (Advil, Motrin, Nuprin)       0.16      0.66      0.26        47
                          N

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
# kfold_adasyn_safe.py

import pandas as pd
import joblib
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# ADASYN for oversampling
from imblearn.over_sampling import ADASYN

# 1. Load & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})
# Numeric conversion
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'], inplace=True)

# 2. Global grouping: any class with <5 into "Other"
vc = df['best_otc'].value_counts()
rare = vc[vc < 5].index
df['best_otc'] = df['best_otc'].apply(lambda x: 'Other' if x in rare else x)

# 3. Prepare X, y
X = df.drop(columns=['best_otc'])
y = df['best_otc']

# 4. Load the preprocessor you saved earlier
preprocessor = joblib.load('otc_preprocessor.pkl')

# 5. Stratified 5-fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []
fold_top3 = []

print("Starting 5-Fold CV with ADASYN (safe mode)\n")

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    # Split
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_te, y_te = X.iloc[test_idx],  y.iloc[test_idx]
    
    # Preprocess
    X_tr_p = preprocessor.fit_transform(X_tr)
    X_te_p = preprocessor.transform(X_te)
    
    # Build a 50%-of-majority sampling_strategy
    counts = Counter(y_tr)
    majority = max(counts.values())
    sampling_strategy = {
        cls: int(majority * 0.5)
        for cls, cnt in counts.items()
        if cnt < majority * 0.5 and cnt >= 2
    }
    
    # Try ADASYN; if it complains "no samples to generate", skip it
    X_res, y_res = X_tr_p, y_tr
    if sampling_strategy:
        ada = ADASYN(sampling_strategy=sampling_strategy,
                     n_neighbors=1,
                     random_state=42)
        try:
            X_res, y_res = ada.fit_resample(X_tr_p, y_tr)
        except ValueError as e:
            print(f"  Fold {fold}: ADASYN skipped ({e})")
    
    # Train & evaluate
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_res, y_res)
    
    y_pred  = clf.predict(X_te_p)
    y_proba = clf.predict_proba(X_te_p)
    
    acc = accuracy_score(y_te, y_pred)
    t3  = top_k_accuracy_score(y_te, y_proba, k=3, labels=clf.classes_)
    
    print(f"Fold {fold}: Accuracy = {acc:.3%}, Top-3 = {t3:.3%}")
    fold_accuracies.append(acc)
    fold_top3.append(t3)

# 6. Summary
import numpy as np
print("\n5-Fold CV Summary:")
print(f"Mean Acc    : {np.mean(fold_accuracies):.3%} ± {np.std(fold_accuracies):.3%}")
print(f"Mean Top-3  : {np.mean(fold_top3):.3%} ± {np.std(fold_top3):.3%}")


Starting 5-Fold CV with ADASYN (safe mode)

  Fold 1: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 1: Accuracy = 14.330%, Top-3 = 43.614%
  Fold 2: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 2: Accuracy = 18.125%, Top-3 = 40.312%
  Fold 3: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 3: Accuracy = 16.562%, Top-3 = 38.125%
  Fold 4: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 4: Accuracy = 16.250%, Top-3 = 40.625%
  Fold 5: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 5: Accuracy = 13.438%, Top-3 = 39.375%

5-Fold CV Summary:
Mean Acc    : 15.741% ± 1.668%
Mean Top-3  : 40.410% ± 1.823%


In [75]:
# kfold_adasyn_safe.py

import pandas as pd
import joblib
from collections import Counter
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# ADASYN for oversampling
from imblearn.over_sampling import ADASYN

# 1. Load & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'], inplace=True)

# 2. Global grouping of rare classes (<5) into "Other"
vc = df['best_otc'].value_counts()
rare = vc[vc < 5].index
df['best_otc'] = df['best_otc'].apply(lambda x: 'Other' if x in rare else x)

# 3. Prepare features/target
X = df.drop(columns=['best_otc'])
y = df['best_otc']

# 4. Load preprocessor pipeline
preprocessor = joblib.load('otc_preprocessor.pkl')

# 5. 5‐fold CV (safe ADASYN)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_acc, fold_top3 = [], []

print("Starting 5-Fold CV with ADASYN (safe mode)\n")
for fold, (tr_idx, te_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_te, y_te = X.iloc[te_idx], y.iloc[te_idx]

    X_tr_p = preprocessor.fit_transform(X_tr)
    X_te_p = preprocessor.transform(X_te)

    counts   = Counter(y_tr)
    majority = max(counts.values())
    strat    = {cls: int(majority*0.5) for cls,cnt in counts.items() if cnt>=2 and cnt<majority*0.5}

    X_res, y_res = X_tr_p, y_tr
    if strat:
        ada = ADASYN(sampling_strategy=strat, n_neighbors=1, random_state=42)
        try:
            X_res, y_res = ada.fit_resample(X_tr_p, y_tr)
        except ValueError as e:
            print(f"  Fold {fold}: ADASYN skipped ({e})")

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_res, y_res)

    y_pred  = clf.predict(X_te_p)
    y_proba = clf.predict_proba(X_te_p)

    acc = accuracy_score(y_te, y_pred)
    t3  = top_k_accuracy_score(y_te, y_proba, k=3, labels=clf.classes_)
    print(f"Fold {fold}: Accuracy = {acc:.3%}, Top-3 = {t3:.3%}")

    fold_acc.append(acc)
    fold_top3.append(t3)

import numpy as np
print("\n5-Fold CV Summary:")
print(f"Mean Acc    : {np.mean(fold_acc):.3%} ± {np.std(fold_acc):.3%}")
print(f"Mean Top-3  : {np.mean(fold_top3):.3%} ± {np.std(fold_top3):.3%}")

# 6. Train final model on ALL data

# Preprocess full training set
X_all_p = preprocessor.transform(X)
# Build final sampling strategy (bring every minority up to majority)
counts   = Counter(y)
majority = max(counts.values())
final_strat = {cls: (majority - cnt) for cls,cnt in counts.items() if cnt>=2 and cnt<majority}

X_res_all, y_res_all = X_all_p, y
if final_strat:
    ada_all = ADASYN(sampling_strategy=final_strat, n_neighbors=1, random_state=42)
    try:
        X_res_all, y_res_all = ada_all.fit_resample(X_all_p, y)
        print(f"\nResampled full dataset: {X_all_p.shape[0]} → {X_res_all.shape[0]}")
    except ValueError as e:
        print(f"\nFinal ADASYN skipped: {e}")

final_clf = RandomForestClassifier(n_estimators=200, random_state=42)
final_clf.fit(X_res_all, y_res_all)

# 7. Save final model
joblib.dump(final_clf, 'otc_final_model.pkl')
print("✅ Saved final model to otc_final_model.pkl")


Starting 5-Fold CV with ADASYN (safe mode)

  Fold 1: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 1: Accuracy = 14.330%, Top-3 = 43.614%
  Fold 2: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 2: Accuracy = 18.125%, Top-3 = 40.312%
  Fold 3: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 3: Accuracy = 16.562%, Top-3 = 38.125%
  Fold 4: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 4: Accuracy = 16.250%, Top-3 = 40.625%
  Fold 5: ADASYN skipped (No samples will be generated with the provided ratio settings.)
Fold 5: Accuracy = 13.438%, Top-3 = 39.375%

5-Fold CV Summary:
Mean Acc    : 15.741% ± 1.668%
Mean Top-3  : 40.410% ± 1.823%

Final ADASYN skipped: With over-sampling methods, the number of samples in a class should be greater or equal to the original number of samples. Originally, there is 237 samples and 10 samples are asked.

In [76]:
# refit_preprocessor.py

import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 1. Load & clean exactly as in your training scripts
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC': 'best_otc',
    'OTC PrePain': 'otc_prepain',
    'OTC PostPain': 'otc_postpain',
    'OTCSleep': 'otc_sleep',
    'OTC Cause': 'otc_cause',
    'OTC PainLocation': 'otc_pain_location',
    'OTC PainTime': 'otc_pain_time',
    'OTC CocomtSymptom': 'otc_cocomt_symptom',
    'Gender': 'gender',
    'Age': 'age',
    'Height': 'height',
    'Weight': 'weight',
    'Ethnicity': 'ethnicity',
    'Race': 'race'
})
# Numeric conversion & drop any rows missing core inputs
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'])

# Global group rares (<5) into "Other"
vc = df['best_otc'].value_counts()
rare = vc[vc < 5].index
df['best_otc'] = df['best_otc'].apply(lambda x: 'Other' if x in rare else x)

# 2. Define feature columns exactly as in preprocessing.py
numeric_cols = ['otc_prepain','otc_postpain','age','height','weight']
categorical_cols = [
    'otc_sleep','otc_cause','otc_pain_location',
    'otc_pain_time','otc_cocomt_symptom',
    'gender','ethnicity','race'
]

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# 3. Fit on the entire feature set
X_full = df[numeric_cols + categorical_cols]
preprocessor.fit(X_full)

# 4. Save it
joblib.dump(preprocessor, 'otc_preprocessor_full.pkl')
print("✅ Saved full‐data preprocessor to otc_preprocessor_full.pkl")


✅ Saved full‐data preprocessor to otc_preprocessor_full.pkl


In [80]:
import pandas as pd
import joblib
from collections import Counter

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN

# 1) Load & clean
df = pd.read_csv('OTC-Data.csv', skipinitialspace=True)
df.columns = df.columns.str.strip()
df = df.rename(columns={
    'Best OTC':'best_otc','OTC PrePain':'otc_prepain','OTC PostPain':'otc_postpain',
    'OTCSleep':'otc_sleep','OTC Cause':'otc_cause','OTC PainLocation':'otc_pain_location',
    'OTC PainTime':'otc_pain_time','OTC CocomtSymptom':'otc_cocomt_symptom',
    'Gender':'gender','Age':'age','Height':'height','Weight':'weight',
    'Ethnicity':'ethnicity','Race':'race'
})
for c in ['otc_prepain','otc_postpain','age','height','weight']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df.dropna(subset=['otc_prepain','otc_postpain','age','height','weight','best_otc'], inplace=True)

# 2) Group rares (<5) → "Other"
vc = df['best_otc'].value_counts()
rare = vc[vc < 5].index
df['best_otc'] = df['best_otc'].apply(lambda x: 'Other' if x in rare else x)

# 3) Define feature lists
numeric_cols = ['otc_prepain','otc_postpain','age','height','weight']
categorical_cols = [
    'otc_sleep','otc_cause','otc_pain_location',
    'otc_pain_time','otc_cocomt_symptom',
    'gender','ethnicity','race'
]

# 4) Preprocessor
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler',  StandardScaler())
    ]), numeric_cols),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot',  OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols),
])

# 5) Transform entire dataset
X = df[numeric_cols + categorical_cols]
y = df['best_otc']
X_p = preprocessor.fit_transform(X)

# 6) ADASYN oversample once
counts   = Counter(y)
majority = max(counts.values())
strategy = {cls: majority for cls,cnt in counts.items() if cnt < majority}
X_res, y_res = ADASYN(sampling_strategy=strategy, n_neighbors=1, random_state=42).fit_resample(X_p, y)

# 7) Train classifier
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_res, y_res)

# 8) Save both
joblib.dump(preprocessor, 'otc_preprocessor_final.pkl')
joblib.dump(clf, 'otc_classifier_final.pkl')
print("✅ Saved preprocessor and classifier.")


ValueError: No samples will be generated with the provided ratio settings.