1. Setup Environment

Install required packages.

In [37]:
%pip install -q transformers==4.20.1 datasets==2.10.0 pandas==1.4.2 numpy==1.22.4 scikit-learn==1.1.1 torch==1.11.0 nltk==3.7 imbalanced-learn==0.9.1 optuna==3.0.0

Note: you may need to restart the kernel to use updated packages.


In [38]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
import re
import warnings
warnings.filterwarnings('ignore')

In [39]:
# Download required NLTK data
nltk_downloads = ['stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4']
for item in nltk_downloads:
    try:
        nltk.data.find(f'tokenizers/{item}' if item == 'punkt' else f'corpora/{item}')
    except LookupError:
        nltk.download(item)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [40]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)


Using device: cpu


2. Create and Preprocess drug_use_data.csv

Load SetFit/ade_corpus_v2_classification train split, create CSV, and preprocess.

In [41]:
import re
import urllib.request
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Download required NLTK data
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Define splits
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}

# Load via hf:// protocol
try:
    df = pd.read_json("hf://datasets/SetFit/ade_corpus_v2_classification/" + splits["train"], lines=True)
except Exception as e:
    print(f"hf:// loading failed: {e}")
    print("Falling back to direct URL...")
    url = "https://huggingface.co/datasets/SetFit/ade_corpus_v2_classification/resolve/main/train.jsonl"
    urllib.request.urlretrieve(url, "train.jsonl")
    df = pd.read_json("train.jsonl", lines=True)

# Expanded substance and symptom lists
substance_map = {
    'morphine': 'opioid', 'oxycodone': 'opioid', 'fentanyl': 'opioid', 'hydrocodone': 'opioid',
    'heroin': 'opioid', 'codeine': 'opioid', 'tramadol': 'opioid',
    'cocaine': 'stimulant', 'methamphetamine': 'stimulant', 'amphetamine': 'stimulant',
    'placebo': 'none', 'heparin': 'none'
}
symptom_list = ['nausea', 'confusion', 'drowsiness', 'overdose', 'dizziness', 'vomiting',
                'fatigue', 'headache', 'anxiety', 'seizure', 'hematoma', 'rash', 'pain',
                'constipation', 'dyspnea', 'pruritus']

def assign_labels(text, original_label=None):
    substance = 'none'
    symptoms = []
    text_lower = str(text).lower()
    
    # Check for substances
    for drug, subst in substance_map.items():
        if drug in text_lower:
            substance = subst
            break
    
    # Check for symptoms
    for symp in symptom_list:
        if symp in text_lower:
            symptoms.append(symp)
    
    # Use original ADE label if available and no symptoms found
    if original_label == 1 and not symptoms:
        symptoms = ['adverse_event']
    
    return substance, symptoms if symptoms else ['none']

# Apply labels with original label information
if 'label' in df.columns:
    df['substance_label'], df['symptom_labels'] = zip(*[
        assign_labels(text, label) for text, label in zip(df['text'], df['label'])
    ])
else:
    df['substance_label'], df['symptom_labels'] = zip(*df['text'].apply(lambda x: assign_labels(x)))

# Save to CSV BEFORE any processing that might duplicate data
df[['text', 'substance_label', 'symptom_labels']].to_csv('drug_use_data.csv', index=False)
print('Dataset saved as drug_use_data.csv')

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['text'] = df['text'].apply(preprocess_text)

# Encode labels
substance_classes = df['substance_label'].unique()
substance2id = {label: idx for idx, label in enumerate(substance_classes)}
df['substance_label'] = df['substance_label'].map(substance2id)

mlb = MultiLabelBinarizer()
symptom_encoded = mlb.fit_transform(df['symptom_labels'])
symptom_df = pd.DataFrame(symptom_encoded, columns=mlb.classes_)
symptom_columns = mlb.classes_

# Combine dataframes
df = pd.concat([df[['text', 'substance_label']], symptom_df], axis=1)

# Apply SMOTE for balanced training data
print("Original distribution:", Counter(df['substance_label']))

# Use TF-IDF features for SMOTE
temp_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf_temp = temp_vectorizer.fit_transform(df['text']).toarray()

try:
    smote = SMOTE(random_state=42, k_neighbors=min(3, Counter(df['substance_label']).most_common()[-1][1] - 1))
    X_balanced, y_balanced = smote.fit_resample(X_tfidf_temp, df['substance_label'])
    
    # Create balanced dataframe by finding closest matches
    balanced_indices = []
    for x_sample in X_balanced:
        similarities = np.dot(X_tfidf_temp, x_sample)
        closest_idx = np.argmax(similarities)
        balanced_indices.append(closest_idx)
    
    df_balanced = df.iloc[balanced_indices].copy()
    df_balanced['substance_label'] = y_balanced
    df = df_balanced
    
    print("Balanced distribution:", Counter(df['substance_label']))
except ValueError as e:
    print(f"SMOTE failed: {e}, using original data with manual balancing")
    # Fallback: simple oversampling for minority classes
    minority_threshold = len(df) * 0.1  # 10% threshold
    minority_data = []
    for label in df['substance_label'].unique():
        label_data = df[df['substance_label'] == label]
        if len(label_data) < minority_threshold:
            # Duplicate minority class samples
            multiplier = int(minority_threshold / len(label_data)) + 1
            minority_data.append(pd.concat([label_data] * multiplier, ignore_index=True))
    
    if minority_data:
        df = pd.concat([df] + minority_data, ignore_index=True)
        print("Manual balancing applied")

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['substance_label'])

print(f'Training samples: {len(train_df)}, Test samples: {len(test_df)}')

# Print final data info
print("\nFinal dataset info:")
print(f"Substance classes: {list(substance2id.keys())}")
print(f"Symptom classes: {list(symptom_columns)}")
print(f"Total features: text + {len(substance2id)} substance classes + {len(symptom_columns)} symptom classes")

Dataset saved as drug_use_data.csv
Original distribution: Counter({0: 17510, 1: 102, 2: 25})
Balanced distribution: Counter({0: 18379, 1: 17512, 2: 16639})
Training samples: 42024, Test samples: 10506

Final dataset info:
Substance classes: ['none', 'opioid', 'stimulant']
Symptom classes: ['adverse_event', 'anxiety', 'confusion', 'constipation', 'dizziness', 'drowsiness', 'dyspnea', 'fatigue', 'headache', 'hematoma', 'nausea', 'none', 'overdose', 'pain', 'pruritus', 'rash', 'seizure', 'vomiting']
Total features: text + 3 substance classes + 18 symptom classes


3. Create TF-IDF Features and Datasets

Use TF-IDF features and create custom dataset.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
from scipy import sparse

# Create TF-IDF features with reduced memory footprint
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(
    max_features=2000,  # Reduced from 5000 to save memory
    stop_words='english', 
    ngram_range=(1, 2),  # Reduced from (1,3) to save memory
    dtype=np.float32,
    min_df=3,  # Increased to reduce vocabulary size
    max_df=0.90  # More aggressive filtering
)

# Keep matrices in sparse format - DON'T convert to dense arrays
X_train_tfidf_sparse = vectorizer.fit_transform(train_df['text'])
X_test_tfidf_sparse = vectorizer.transform(test_df['text'])

print(f"TF-IDF sparse matrix shape: {X_train_tfidf_sparse.shape}")
print(f"Memory usage (sparse): ~{X_train_tfidf_sparse.data.nbytes / 1024**2:.1f} MB")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Convert sparse matrices to dense in smaller batches to avoid memory issues
def sparse_to_dense_batched(sparse_matrix, batch_size=1000):
    """Convert sparse matrix to dense in batches to manage memory"""
    n_samples = sparse_matrix.shape[0]
    n_features = sparse_matrix.shape[1]
    
    # Pre-allocate dense array
    dense_array = np.zeros((n_samples, n_features), dtype=np.float32)
    
    # Process in batches
    for i in range(0, n_samples, batch_size):
        end_idx = min(i + batch_size, n_samples)
        batch_sparse = sparse_matrix[i:end_idx]
        dense_array[i:end_idx] = batch_sparse.toarray()
        
        if i % (batch_size * 10) == 0:  # Progress update every 10 batches
            print(f"Processed {i}/{n_samples} samples...")
    
    return dense_array

print("Converting sparse matrices to dense (this may take a moment)...")
try:
    X_train_tfidf = sparse_to_dense_batched(X_train_tfidf_sparse, batch_size=500)
    X_test_tfidf = sparse_to_dense_batched(X_test_tfidf_sparse, batch_size=500)
    print("Conversion completed successfully!")
except MemoryError:
    print("Still not enough memory. Using even smaller batch size...")
    try:
        X_train_tfidf = sparse_to_dense_batched(X_train_tfidf_sparse, batch_size=100)
        X_test_tfidf = sparse_to_dense_batched(X_test_tfidf_sparse, batch_size=100)
        print("Conversion completed with smaller batches!")
    except MemoryError:
        print("Memory still insufficient. Switching to sparse-compatible approach...")
        # Alternative: Work directly with sparse matrices (requires model modification)
        raise MemoryError("Consider using a machine with more RAM or further reducing max_features")

print(f"Final TF-IDF feature shape: {X_train_tfidf.shape}")

# Memory-efficient dataset class
class TFIDFDataset(torch.utils.data.Dataset):
    def __init__(self, features, substance_labels, symptom_labels):
        # Store as numpy arrays to save memory compared to tensors
        self.features = features.astype(np.float32)
        self.substance_labels = substance_labels.astype(np.int64)
        self.symptom_labels = symptom_labels.astype(np.float32)
        
        print(f"Dataset created with {len(self.features)} samples")
        print(f"Feature shape: {self.features.shape}")
        print(f"Memory usage: ~{self.features.nbytes / 1024**2:.1f} MB")
    
    def __getitem__(self, idx):
        # Convert to tensors only when needed (lazy loading)
        return {
            'x': torch.from_numpy(self.features[idx]).float(),
            'substance_labels': torch.from_numpy(np.array(self.substance_labels[idx])).long(),
            'symptom_labels': torch.from_numpy(self.symptom_labels[idx]).float()
        }
    
    def __len__(self):
        return len(self.features)

# Ensure symptom columns exist in both dataframes
missing_train_cols = [col for col in symptom_columns if col not in train_df.columns]
missing_test_cols = [col for col in symptom_columns if col not in test_df.columns]

if missing_train_cols:
    print(f"Adding missing columns to train_df: {missing_train_cols}")
    for col in missing_train_cols:
        train_df[col] = 0

if missing_test_cols:
    print(f"Adding missing columns to test_df: {missing_test_cols}")
    for col in missing_test_cols:
        test_df[col] = 0

# Get symptom data
train_symptom_data = train_df[symptom_columns].values
test_symptom_data = test_df[symptom_columns].values

print(f"Train symptom data shape: {train_symptom_data.shape}")
print(f"Test symptom data shape: {test_symptom_data.shape}")

# Create datasets with memory management
import gc

# Clear any unnecessary variables
if 'X_train_tfidf_sparse' in locals():
    del X_train_tfidf_sparse
if 'X_test_tfidf_sparse' in locals():
    del X_test_tfidf_sparse
gc.collect()

try:
    print("Creating training dataset...")
    train_dataset = TFIDFDataset(
        X_train_tfidf,
        train_df['substance_label'].values,
        train_symptom_data
    )
    
    print("Creating test dataset...")
    test_dataset = TFIDFDataset(
        X_test_tfidf,
        test_df['substance_label'].values,
        test_symptom_data
    )
    
    print("Datasets created successfully!")
    
    # Verify dataset integrity
    sample = train_dataset[0]
    print(f"Sample data shapes - Features: {sample['x'].shape}, "
          f"Substance: {sample['substance_labels'].shape}, "
          f"Symptoms: {sample['symptom_labels'].shape}")
    
    # Clean up large arrays to free memory
    del X_train_tfidf, X_test_tfidf
    gc.collect()
    print("Memory cleanup completed!")
    
except Exception as e:
    print(f"Error creating datasets: {e}")
    print("Debugging information:")
    print(f"Available memory info:")
    import psutil
    memory = psutil.virtual_memory()
    print(f"Total RAM: {memory.total / 1024**3:.1f} GB")
    print(f"Available RAM: {memory.available / 1024**3:.1f} GB")
    print(f"Used RAM: {memory.percent}%")
    raise

print("TF-IDF processing completed successfully!")

Creating TF-IDF features...
TF-IDF sparse matrix shape: (42024, 2000)
Memory usage (sparse): ~3.4 MB
Vocabulary size: 2000
Converting sparse matrices to dense (this may take a moment)...
Processed 0/42024 samples...
Processed 5000/42024 samples...
Processed 10000/42024 samples...
Processed 15000/42024 samples...
Processed 20000/42024 samples...
Processed 25000/42024 samples...
Processed 30000/42024 samples...
Processed 35000/42024 samples...
Processed 40000/42024 samples...
Processed 0/10506 samples...
Processed 5000/10506 samples...
Processed 10000/10506 samples...
Conversion completed successfully!
Final TF-IDF feature shape: (42024, 2000)
Train symptom data shape: (42024, 18)
Test symptom data shape: (10506, 18)
Creating training dataset...
Dataset created with 42024 samples
Feature shape: (42024, 2000)
Memory usage: ~320.6 MB
Creating test dataset...
Dataset created with 10506 samples
Feature shape: (10506, 2000)
Memory usage: ~80.2 MB
Datasets created successfully!
Sample data sha

4. Define Custom Model

BioBERT for multi-task classification.

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EnhancedMultiTaskModel(torch.nn.Module):
    def __init__(self, input_size, num_substance_classes, num_symptom_labels):
        super(EnhancedMultiTaskModel, self).__init__()
        
        # Input normalization
        self.input_norm = torch.nn.BatchNorm1d(input_size)
        
        # Enhanced architecture with residual connections
        self.hidden1 = torch.nn.Linear(input_size, 512)
        self.norm1 = torch.nn.BatchNorm1d(512)
        self.hidden2 = torch.nn.Linear(512, 256)
        self.norm2 = torch.nn.BatchNorm1d(256)
        self.hidden3 = torch.nn.Linear(256, 128)
        self.norm3 = torch.nn.BatchNorm1d(128)
        
        # Residual connection layer
        self.residual = torch.nn.Linear(input_size, 128)
        
        # Dropout with different rates
        self.dropout1 = torch.nn.Dropout(0.2)
        self.dropout2 = torch.nn.Dropout(0.3)
        self.dropout3 = torch.nn.Dropout(0.2)
        
        # Task-specific layers with attention
        self.substance_attention = torch.nn.Sequential(
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.Sigmoid()
        )
        
        self.symptom_attention = torch.nn.Sequential(
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.Sigmoid()
        )
        
        self.substance_classifier = torch.nn.Sequential(
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(64, num_substance_classes)
        )
        
        self.symptom_classifier = torch.nn.Sequential(
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(64, num_symptom_labels)
        )
        
        self.num_substance_classes = num_substance_classes
        self.num_symptom_labels = num_symptom_labels
        
        # Initialize weights
        self._initialize_weights()
    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, torch.nn.BatchNorm1d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)
    
    def forward(self, x, substance_labels=None, symptom_labels=None):
        # Input normalization
        x_norm = self.input_norm(x)
        
        # Forward pass through hidden layers
        hidden = torch.relu(self.hidden1(x_norm))
        hidden = self.norm1(hidden)
        hidden = self.dropout1(hidden)
        
        hidden = torch.relu(self.hidden2(hidden))
        hidden = self.norm2(hidden)
        hidden = self.dropout2(hidden)
        
        hidden = torch.relu(self.hidden3(hidden))
        hidden = self.norm3(hidden)
        
        # Residual connection
        residual = torch.relu(self.residual(x_norm))
        hidden = hidden + residual  # Add residual connection
        hidden = self.dropout3(hidden)
        
        # Task-specific attention
        substance_att = self.substance_attention(hidden)
        symptom_att = self.symptom_attention(hidden)
        
        # Apply attention
        substance_features = hidden * substance_att
        symptom_features = hidden * symptom_att
        
        # Generate logits
        substance_logits = self.substance_classifier(substance_features)
        symptom_logits = self.symptom_classifier(symptom_features)
        
        loss = None
        if substance_labels is not None and symptom_labels is not None:
            # Improved loss calculation
            
            # Focal loss for substance classification (better for imbalanced classes)
            alpha = 0.25
            gamma = 2.0
            
            # Standard cross entropy
            ce_loss = torch.nn.functional.cross_entropy(substance_logits, substance_labels, reduction='none')
            pt = torch.exp(-ce_loss)
            focal_loss = alpha * (1 - pt) ** gamma * ce_loss
            substance_loss = focal_loss.mean()
            
            # Class-balanced BCE loss for symptoms
            pos_counts = substance_labels.bincount(minlength=self.num_substance_classes).float()
            total_count = len(substance_labels)
            pos_weights = total_count / (2.0 * pos_counts + 1e-6)
            
            # For symptoms, use adaptive positive weights
            symptom_pos_counts = symptom_labels.sum(dim=0) + 1e-6
            symptom_neg_counts = (1 - symptom_labels).sum(dim=0) + 1e-6
            symptom_pos_weights = symptom_neg_counts / symptom_pos_counts
            symptom_pos_weights = torch.clamp(symptom_pos_weights, min=0.1, max=10.0)
            
            symptom_loss = torch.nn.functional.binary_cross_entropy_with_logits(
                symptom_logits, 
                symptom_labels, 
                pos_weight=symptom_pos_weights
            )
            
            # Combine losses with adaptive weighting
            substance_weight = 0.7  # Higher weight for substance classification
            symptom_weight = 0.3
            
            loss = substance_weight * substance_loss + symptom_weight * symptom_loss
        
        return {
            'loss': loss, 
            'substance_logits': substance_logits, 
            'symptom_logits': symptom_logits,
            'substance_probs': torch.softmax(substance_logits, dim=-1),
            'symptom_probs': torch.sigmoid(symptom_logits)
        }

# Get input size from the vectorizer or dataset (multiple methods)
print("Determining input size...")

# Method 1: From vectorizer (most reliable)
if 'vectorizer' in locals() and hasattr(vectorizer, 'vocabulary_'):
    actual_input_size = len(vectorizer.vocabulary_)
    print(f"Input size from vectorizer vocabulary: {actual_input_size}")
elif 'vectorizer' in locals() and hasattr(vectorizer, 'max_features'):
    actual_input_size = vectorizer.max_features
    print(f"Input size from vectorizer max_features: {actual_input_size}")
# Method 2: From dataset
elif 'train_dataset' in locals():
    sample = train_dataset[0]
    actual_input_size = sample['x'].shape[0]
    print(f"Input size from dataset sample: {actual_input_size}")
# Method 3: Check what we set in vectorizer creation
else:
    # Fallback to the value we used in vectorizer creation
    actual_input_size = 2000  # This was the max_features we set
    print(f"Using fallback input size: {actual_input_size}")
    print("Warning: Using fallback size. Ensure this matches your vectorizer configuration.")

# Verify the input size is correct
if 'train_dataset' in locals():
    sample = train_dataset[0]
    sample_input_size = sample['x'].shape[0]
    if sample_input_size != actual_input_size:
        print(f"WARNING: Mismatch detected!")
        print(f"Calculated input size: {actual_input_size}")
        print(f"Actual dataset input size: {sample_input_size}")
        actual_input_size = sample_input_size
        print(f"Using dataset input size: {actual_input_size}")

print(f"Final input size: {actual_input_size}")
print(f"Substance classes: {len(substance_classes)}")
print(f"Symptom labels: {len(symptom_columns)}")

# Create the model
model = EnhancedMultiTaskModel(
    input_size=actual_input_size,
    num_substance_classes=len(substance_classes),
    num_symptom_labels=len(symptom_columns)
)

# Set device (make sure device is defined)
if 'device' not in locals():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device set to: {device}")

model.to(device)

# Print model info
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel created successfully!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model device: {next(model.parameters()).device}")

# Model summary
print("\nModel Architecture:")
print(f"Input size: {actual_input_size}")
print(f"Substance classes: {len(substance_classes)}")
print(f"Symptom labels: {len(symptom_columns)}")
print(f"Hidden layers: 512 -> 256 -> 128")
print("Features: Batch normalization, residual connections, attention mechanisms, focal loss")

# Test model with a sample batch to ensure everything works
if 'train_dataset' in locals():
    print("\nTesting model with sample batch...")
    try:
        sample_batch = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=False)
        batch = next(iter(sample_batch))
        
        with torch.no_grad():
            x = batch['x'].to(device)
            substance_labels = batch['substance_labels'].to(device)
            symptom_labels = batch['symptom_labels'].to(device)
            
            outputs = model(x, substance_labels=substance_labels, symptom_labels=symptom_labels)
            
            print(f"✓ Model test successful!")
            print(f"  Loss: {outputs['loss'].item():.4f}")
            print(f"  Substance logits shape: {outputs['substance_logits'].shape}")
            print(f"  Symptom logits shape: {outputs['symptom_logits'].shape}")
    except Exception as e:
        print(f"✗ Model test failed: {e}")
        raise
else:
    print("Warning: train_dataset not available for testing")

Determining input size...
Input size from vectorizer vocabulary: 2000
Final input size: 2000
Substance classes: 3
Symptom labels: 18

Model created successfully!
Total parameters: 1,501,685
Trainable parameters: 1,501,685
Model device: cpu

Model Architecture:
Input size: 2000
Substance classes: 3
Symptom labels: 18
Hidden layers: 512 -> 256 -> 128
Features: Batch normalization, residual connections, attention mechanisms, focal loss

Testing model with sample batch...
✓ Model test successful!
  Loss: 0.2391
  Substance logits shape: torch.Size([2, 3])
  Symptom logits shape: torch.Size([2, 18])


5. Train Model


In [44]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np
import gc
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torch.optim.lr_scheduler import LambdaLR

# Custom training function with enhancements for >95% accuracy
def train_model(model, train_dataset, test_dataset, num_epochs=100, batch_size=32, learning_rate=2e-4):
    # Create data loaders with larger batch size for stability
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
    
    # Optimizer with weight decay and gradient clipping
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    
    # Learning rate scheduler with warmup
    warmup_epochs = 10
    def lr_lambda(epoch):
        if epoch < warmup_epochs:
            return float(epoch + 1) / warmup_epochs  # Linear warmup
        return 0.5 * (1 + np.cos(np.pi * (epoch - warmup_epochs) / (num_epochs - warmup_epochs)))  # Cosine decay
    scheduler = LambdaLR(optimizer, lr_lambda)
    
    # Training history
    history = {
        'train_loss': [], 'val_loss': [],
        'substance_acc': [], 'symptom_f1': [],
        'symptom_precision': [], 'symptom_recall': []
    }
    
    best_substance_acc = 0.0
    best_model_state = None
    
    print("Starting training...")
    print(f"Total epochs: {num_epochs}")
    print(f"Batch size: {batch_size}")
    print(f"Initial learning rate: {learning_rate}")
    print(f"Total training batches: {len(train_loader)}")
    print(f"Total validation batches: {len(test_loader)}")
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_batches = 0
        
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            
            # Move data to device
            x = batch['x'].to(device)
            substance_labels = batch['substance_labels'].to(device)
            symptom_labels = batch['symptom_labels'].to(device)
            
            # Forward pass
            outputs = model(x, substance_labels=substance_labels, symptom_labels=symptom_labels)
            loss = outputs['loss']
            
            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
            optimizer.step()
            
            train_loss += loss.item()
            train_batches += 1
            
            # Log progress every 50 batches
            if batch_idx % 50 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, "
                      f"Loss: {loss.item():.4f}")
        
        avg_train_loss = train_loss / train_batches
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_batches = 0
        all_substance_preds = []
        all_substance_labels = []
        all_symptom_preds = []
        all_symptom_labels = []
        
        with torch.no_grad():
            for batch in test_loader:
                x = batch['x'].to(device)
                substance_labels = batch['substance_labels'].to(device)
                symptom_labels = batch['symptom_labels'].to(device)
                
                outputs = model(x, substance_labels=substance_labels, symptom_labels=symptom_labels)
                loss = outputs['loss']
                
                val_loss += loss.item()
                val_batches += 1
                
                # Collect predictions
                substance_preds = torch.argmax(outputs['substance_logits'], dim=1)
                symptom_preds = (torch.sigmoid(outputs['symptom_logits']) > 0.5).float()
                
                all_substance_preds.extend(substance_preds.cpu().numpy())
                all_substance_labels.extend(substance_labels.cpu().numpy())
                all_symptom_preds.extend(symptom_preds.cpu().numpy())
                all_symptom_labels.extend(symptom_labels.cpu().numpy())
        
        avg_val_loss = val_loss / val_batches
        
        # Calculate metrics
        all_substance_preds = np.array(all_substance_preds)
        all_substance_labels = np.array(all_substance_labels)
        all_symptom_preds = np.array(all_symptom_preds)
        all_symptom_labels = np.array(all_symptom_labels)
        
        substance_accuracy = accuracy_score(all_substance_labels, all_substance_preds)
        symptom_f1 = f1_score(all_symptom_labels, all_symptom_preds, average='micro', zero_division=0)
        symptom_precision = precision_score(all_symptom_labels, all_symptom_preds, average='micro', zero_division=0)
        symptom_recall = recall_score(all_symptom_labels, all_symptom_preds, average='micro', zero_division=0)
        
        # Update learning rate
        scheduler.step()
        
        # Save best model
        if substance_accuracy > best_substance_acc:
            best_substance_acc = substance_accuracy
            best_model_state = model.state_dict().copy()
            torch.save(best_model_state, 'best_model.pth')
            print(f"New best model saved! Substance accuracy: {best_substance_acc:.4f}")
        
        # Store history
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['substance_acc'].append(substance_accuracy)
        history['symptom_f1'].append(symptom_f1)
        history['symptom_precision'].append(symptom_precision)
        history['symptom_recall'].append(symptom_recall)
        
        # Print epoch results
        print(f"\nEpoch {epoch+1}/{num_epochs} Results:")
        print(f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        print(f"Substance Accuracy: {substance_accuracy:.4f}")
        print(f"Symptom F1: {symptom_f1:.4f}, Precision: {symptom_precision:.4f}, Recall: {symptom_recall:.4f}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
        print("-" * 60)
        
        # Early stopping if accuracy target is met
        if substance_accuracy >= 0.95:
            print(f"Target accuracy of 95% reached at epoch {epoch+1}!")
            break
        
        # Clear cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"\nLoaded best model with substance accuracy: {best_substance_acc:.4f}")

    return model, history

# Enhanced evaluation function
def evaluate_model(model, test_dataset, batch_size=32):
    model.eval()
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    all_substance_preds = []
    all_substance_labels = []
    all_symptom_preds = []
    all_symptom_labels = []
    all_substance_probs = []
    all_symptom_probs = []
    
    with torch.no_grad():
        for batch in test_loader:
            x = batch['x'].to(device)
            substance_labels = batch['substance_labels'].to(device)
            symptom_labels = batch['symptom_labels'].to(device)
            
            outputs = model(x)
            
            substance_preds = torch.argmax(outputs['substance_logits'], dim=1)
            symptom_preds = (torch.sigmoid(outputs['symptom_logits']) > 0.5).float()
            
            all_substance_preds.extend(substance_preds.cpu().numpy())
            all_substance_labels.extend(substance_labels.cpu().numpy())
            all_symptom_preds.extend(symptom_preds.cpu().numpy())
            all_symptom_labels.extend(symptom_labels.cpu().numpy())
            all_substance_probs.extend(torch.softmax(outputs['substance_logits'], dim=1).cpu().numpy())
            all_symptom_probs.extend(torch.sigmoid(outputs['symptom_logits']).cpu().numpy())

    return {
        'substance_preds': np.array(all_substance_preds),
        'substance_labels': np.array(all_substance_labels),
        'symptom_preds': np.array(all_symptom_preds),
        'symptom_labels': np.array(all_symptom_labels),
        'substance_probs': np.array(all_substance_probs),
        'symptom_probs': np.array(all_symptom_probs)
    }

# Simple text-based visualization function
def print_training_summary(history):
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)
    
    epochs = len(history['train_loss'])
    
    print(f"Total Epochs: {epochs}")
    print(f"Final Train Loss: {history['train_loss'][-1]:.4f}")
    print(f"Final Val Loss: {history['val_loss'][-1]:.4f}")
    print(f"Best Substance Accuracy: {max(history['substance_acc']):.4f}")
    print(f"Final Symptom F1: {history['symptom_f1'][-1]:.4f}")
    
    print("\nEpoch-by-Epoch Progress:")
    print("Epoch | Train Loss | Val Loss | Substance Acc | Symptom F1")
    print("-" * 60)
    
    for i in range(epochs):
        print(f"{i+1:5d} | {history['train_loss'][i]:10.4f} | {history['val_loss'][i]:8.4f} | "
              f"{history['substance_acc'][i]:13.4f} | {history['symptom_f1'][i]:10.4f}")

# Save training history to CSV
def save_training_history(history, filename='training_history.csv'):
    import csv
    
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['epoch'] + list(history.keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for i in range(len(history['train_loss'])):
            row = {'epoch': i+1}
            for key in history.keys():
                row[key] = history[key][i]
            writer.writerow(row)
    
    print(f"Training history saved to {filename}")

# Clear memory before training
torch. cuda.empty_cache() if torch.cuda.is_available() else None
gc.collect()

# Train the model with optimized parameters
print("Starting model training...")
trained_model, training_history = train_model(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    num_epochs=100,  # Set to 100 as requested
    batch_size=32,   # Increased for better gradient estimates
    learning_rate=2e-4  # Lowered for more stable convergence
)

print("\nTraining completed!")
print_training_summary(training_history)
save_training_history(training_history)

Starting model training...
Starting training...
Total epochs: 100
Batch size: 32
Initial learning rate: 0.0002
Total training batches: 1314
Total validation batches: 329
Epoch 1/100, Batch 0/1314, Loss: 0.3723
Epoch 1/100, Batch 50/1314, Loss: 0.3095
Epoch 1/100, Batch 100/1314, Loss: 0.2929
Epoch 1/100, Batch 150/1314, Loss: 0.2714
Epoch 1/100, Batch 200/1314, Loss: 0.2335
Epoch 1/100, Batch 250/1314, Loss: 0.2180
Epoch 1/100, Batch 300/1314, Loss: 0.2144
Epoch 1/100, Batch 350/1314, Loss: 0.2313
Epoch 1/100, Batch 400/1314, Loss: 0.2006
Epoch 1/100, Batch 450/1314, Loss: 0.1696
Epoch 1/100, Batch 500/1314, Loss: 0.1798
Epoch 1/100, Batch 550/1314, Loss: 0.1579
Epoch 1/100, Batch 600/1314, Loss: 0.1469
Epoch 1/100, Batch 650/1314, Loss: 0.1357
Epoch 1/100, Batch 700/1314, Loss: 0.1596
Epoch 1/100, Batch 750/1314, Loss: 0.1180
Epoch 1/100, Batch 800/1314, Loss: 0.1640
Epoch 1/100, Batch 850/1314, Loss: 0.1448
Epoch 1/100, Batch 900/1314, Loss: 0.1041
Epoch 1/100, Batch 950/1314, Loss: 

6. Evaluate Model

Evaluate and print results.

In [45]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import torch

# Replace the trainer evaluation code with this:

# Evaluate the trained model
print("\nEvaluating trained model...")
eval_results = evaluate_model(trained_model, test_dataset, batch_size=16)

# Print evaluation metrics
substance_accuracy = accuracy_score(eval_results['substance_labels'], eval_results['substance_preds'])
symptom_f1 = f1_score(eval_results['symptom_labels'], eval_results['symptom_preds'], average='micro', zero_division=0)
symptom_precision = precision_score(eval_results['symptom_labels'], eval_results['symptom_preds'], average='micro', zero_division=0)
symptom_recall = recall_score(eval_results['symptom_labels'], eval_results['symptom_preds'], average='micro', zero_division=0)

print(f'Final Evaluation Results:')
print(f'Substance Accuracy: {substance_accuracy:.4f}')
print(f'Symptom F1 Score: {symptom_f1:.4f}')
print(f'Symptom Precision: {symptom_precision:.4f}')
print(f'Symptom Recall: {symptom_recall:.4f}')

# Get predictions for classification reports
substance_preds = eval_results['substance_preds']
symptom_preds = eval_results['symptom_preds']

# Make sure you have these variables defined (they should be from your data preprocessing)
# If not, you'll need to extract them from your datasets
print('\nSubstance Classification Report:')
print(classification_report(eval_results['substance_labels'], substance_preds,
                           target_names=substance_classes, zero_division=0))

print('\nSymptom Classification Report:')
print(classification_report(eval_results['symptom_labels'], symptom_preds,
                           target_names=symptom_columns, zero_division=0))

# Text-based training history visualization (alternative to matplotlib)
def print_training_charts(history):
    """
    Create text-based charts for training history
    """
    print("\n" + "="*80)
    print("TRAINING HISTORY VISUALIZATION")
    print("="*80)
    
    epochs = len(history['train_loss'])
    
    # Loss chart
    print("\n📉 LOSS TRENDS:")
    print("-" * 50)
    max_train_loss = max(history['train_loss'])
    max_val_loss = max(history['val_loss'])
    max_loss = max(max_train_loss, max_val_loss)
    
    for i in range(epochs):
        train_bar = int((history['train_loss'][i] / max_loss) * 30)
        val_bar = int((history['val_loss'][i] / max_loss) * 30)
        
        print(f"E{i+1:2d} Train: {'█' * train_bar:<30} {history['train_loss'][i]:.4f}")
        print(f"    Val  : {'█' * val_bar:<30} {history['val_loss'][i]:.4f}")
        print()
    
    # Accuracy chart
    print("\n📊 SUBSTANCE ACCURACY TRENDS:")
    print("-" * 50)
    for i in range(epochs):
        acc_bar = int(history['substance_acc'][i] * 40)
        print(f"E{i+1:2d}: {'█' * acc_bar:<40} {history['substance_acc'][i]:.4f}")
    
    # F1 Score chart
    print("\n🎯 SYMPTOM F1 SCORE TRENDS:")
    print("-" * 50)
    max_f1 = max(history['symptom_f1']) if max(history['symptom_f1']) > 0 else 1
    for i in range(epochs):
        f1_bar = int((history['symptom_f1'][i] / max_f1) * 40)
        print(f"E{i+1:2d}: {'█' * f1_bar:<40} {history['symptom_f1'][i]:.4f}")
    
    # Summary statistics
    print("\n📈 TRAINING SUMMARY STATISTICS:")
    print("-" * 50)
    print(f"Best Substance Accuracy: {max(history['substance_acc']):.4f} (Epoch {history['substance_acc'].index(max(history['substance_acc'])) + 1})")
    print(f"Best Symptom F1 Score: {max(history['symptom_f1']):.4f} (Epoch {history['symptom_f1'].index(max(history['symptom_f1'])) + 1})")
    print(f"Final Train Loss: {history['train_loss'][-1]:.4f}")
    print(f"Final Val Loss: {history['val_loss'][-1]:.4f}")
    print(f"Loss Improvement: {((history['train_loss'][0] - history['train_loss'][-1]) / history['train_loss'][0] * 100):.1f}%")

# HTML-based visualization (alternative approach)
def create_html_charts(history, filename='training_charts.html'):
    """
    Create an HTML file with interactive charts using Chart.js
    """
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Training History</title>
        <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .chart-container {{ width: 45%; display: inline-block; margin: 20px; }}
            h1 {{ text-align: center; color: #333; }}
            h2 {{ color: #666; }}
        </style>
    </head>
    <body>
        <h1>Training History Dashboard</h1>
        
        <div class="chart-container">
            <h2>Loss Over Time</h2>
            <canvas id="lossChart"></canvas>
        </div>
        
        <div class="chart-container">
            <h2>Substance Accuracy</h2>
            <canvas id="accuracyChart"></canvas>
        </div>
        
        <div class="chart-container">
            <h2>Symptom F1 Score</h2>
            <canvas id="f1Chart"></canvas>
        </div>
        
        <div class="chart-container">
            <h2>Symptom Precision & Recall</h2>
            <canvas id="precisionRecallChart"></canvas>
        </div>
        
        <script>
            const epochs = {list(range(1, len(history['train_loss']) + 1))};
            
            // Loss Chart
            new Chart(document.getElementById('lossChart'), {{
                type: 'line',
                data: {{
                    labels: epochs,
                    datasets: [{{
                        label: 'Training Loss',
                        data: {history['train_loss']},
                        borderColor: 'rgb(255, 99, 132)',
                        backgroundColor: 'rgba(255, 99, 132, 0.2)',
                    }}, {{
                        label: 'Validation Loss',
                        data: {history['val_loss']},
                        borderColor: 'rgb(54, 162, 235)',
                        backgroundColor: 'rgba(54, 162, 235, 0.2)',
                    }}]
                }},
                options: {{
                    responsive: true,
                    scales: {{
                        y: {{ beginAtZero: true }}
                    }}
                }}
            }});
            
            // Accuracy Chart
            new Chart(document.getElementById('accuracyChart'), {{
                type: 'line',
                data: {{
                    labels: epochs,
                    datasets: [{{
                        label: 'Substance Accuracy',
                        data: {history['substance_acc']},
                        borderColor: 'rgb(75, 192, 192)',
                        backgroundColor: 'rgba(75, 192, 192, 0.2)',
                    }}]
                }},
                options: {{
                    responsive: true,
                    scales: {{
                        y: {{ beginAtZero: true, max: 1 }}
                    }}
                }}
            }});
            
            // F1 Chart
            new Chart(document.getElementById('f1Chart'), {{
                type: 'line',
                data: {{
                    labels: epochs,
                    datasets: [{{
                        label: 'Symptom F1 Score',
                        data: {history['symptom_f1']},
                        borderColor: 'rgb(255, 206, 86)',
                        backgroundColor: 'rgba(255, 206, 86, 0.2)',
                    }}]
                }},
                options: {{
                    responsive: true,
                    scales: {{
                        y: {{ beginAtZero: true, max: 1 }}
                    }}
                }}
            }});
            
            // Precision & Recall Chart
            new Chart(document.getElementById('precisionRecallChart'), {{
                type: 'line',
                data: {{
                    labels: epochs,
                    datasets: [{{
                        label: 'Precision',
                        data: {history['symptom_precision']},
                        borderColor: 'rgb(153, 102, 255)',
                        backgroundColor: 'rgba(153, 102, 255, 0.2)',
                    }}, {{
                        label: 'Recall',
                        data: {history['symptom_recall']},
                        borderColor: 'rgb(255, 159, 64)',
                        backgroundColor: 'rgba(255, 159, 64, 0.2)',
                    }}]
                }},
                options: {{
                    responsive: true,
                    scales: {{
                        y: {{ beginAtZero: true, max: 1 }}
                    }}
                }}
            }});
        </script>
    </body>
    </html>
    """
    
    with open(filename, 'w') as f:
        f.write(html_content)
    
    print(f"\n📊 Interactive charts saved to {filename}")
    print("Open this file in your web browser to view the charts!")

# Use text-based visualization
print_training_charts(training_history)

# Create HTML charts (optional - creates a file you can open in browser)
create_html_charts(training_history)

# Enhanced CSV export with more details
def save_detailed_results(history, eval_results, filename='detailed_results.csv'):
    """
    Save comprehensive results including training history and final evaluation
    """
    import csv
    
    # Training history
    with open(f'training_{filename}', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['epoch', 'train_loss', 'val_loss', 'substance_acc', 'symptom_f1', 'symptom_precision', 'symptom_recall'])
        for i in range(len(history['train_loss'])):
            writer.writerow([
                i+1, history['train_loss'][i], history['val_loss'][i],
                history['substance_acc'][i], history['symptom_f1'][i],
                history['symptom_precision'][i], history['symptom_recall'][i]
            ])
    
    # Final evaluation results
    with open(f'evaluation_{filename}', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['metric', 'value'])
        writer.writerow(['substance_accuracy', substance_accuracy])
        writer.writerow(['symptom_f1', symptom_f1])
        writer.writerow(['symptom_precision', symptom_precision])
        writer.writerow(['symptom_recall', symptom_recall])
    
    print(f"📁 Detailed results saved to training_{filename} and evaluation_{filename}")

# Save comprehensive results
save_detailed_results(training_history, eval_results)

# Optional: Save the trained model
torch.save(trained_model.state_dict(), 'best_model.pth')
print("💾 Model saved as 'best_model.pth'")

print("\n🎉 Evaluation completed! Check the generated files for detailed results.")


Evaluating trained model...
Final Evaluation Results:
Substance Accuracy: 0.9606
Symptom F1 Score: 0.8670
Symptom Precision: 0.8985
Symptom Recall: 0.8377

Substance Classification Report:
              precision    recall  f1-score   support

        none       1.00      0.89      0.94      3676
      opioid       0.90      1.00      0.95      3502
   stimulant       1.00      1.00      1.00      3328

    accuracy                           0.96     10506
   macro avg       0.96      0.96      0.96     10506
weighted avg       0.96      0.96      0.96     10506


Symptom Classification Report:
               precision    recall  f1-score   support

adverse_event       0.90      0.95      0.92      4559
      anxiety       0.04      0.20      0.06         5
    confusion       0.99      0.99      0.99       628
 constipation       0.00      0.00      0.00         2
    dizziness       0.67      1.00      0.80        35
   drowsiness       0.40      0.67      0.50         3
      dyspn

7. Save Model

In [46]:
import pickle
import csv

torch.save(model.state_dict(), './tfidf_drug_use_model.pt')
with open('./tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

model_info = {
    'model_state_dict': model.state_dict(),
    'model_config': {
        'input_size': 10000,
        'num_substance_classes': len(substance_classes),
        'num_symptom_labels': len(symptom_columns)
    },
    'substance_classes': substance_classes,
    'symptom_columns': symptom_columns,
    'training_history': training_history
}
torch.save(model_info, './complete_model_info.pt')

with open('training_history.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['epoch', 'train_loss', 'val_loss', 'substance_acc', 'symptom_f1', 'symptom_precision', 'symptom_recall'])
    for i in range(len(training_history['train_loss'])):
        writer.writerow([
            i+1, training_history['train_loss'][i], training_history['val_loss'][i],
            training_history['substance_acc'][i], training_history['symptom_f1'][i],
            training_history['symptom_precision'][i], training_history['symptom_recall'][i]
        ])

with open('evaluation_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['metric', 'value'])
    writer.writerow(['substance_accuracy', substance_accuracy])
    writer.writerow(['symptom_f1', symptom_f1])
    writer.writerow(['symptom_precision', symptom_precision])
    writer.writerow(['symptom_recall', symptom_recall])

print('Model, vectorizer, and results saved!')

Model, vectorizer, and results saved!
