In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datetime import datetime



class HealthcareRAG:
    def __init__(self):
        # Initialize embedding model - runs on CPU
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Initialize ChromaDB
        self.chroma_client = chromadb.Client()
        
        # Create collections for different data types
        self.patient_collection = self.chroma_client.create_collection("patient_data")
        self.schemes_collection = self.chroma_client.create_collection("schemes_data")
        
        # Initialize tokenizer and model (example with Phi-2)
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
        self.model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            torch_dtype=torch.float32,
            device_map="cpu"
        )

    def process_patient_data(self, data_csv):
        df = pd.read_csv(data_csv)
        
        # Process each category
        for _, row in df.iterrows():
            # Create time series analysis for each category
            category_data = {
                'category': row['Category'],
                'category_name': row['Category Name'],
                'data_code': row['Data Code'],
                'data_name': row['Data Name']
            }
            
            # Calculate trend
            time_series = row.iloc[4:].astype(float)  # Starting from April 2023
            category_data['trend'] = self.calculate_trend(time_series)
            
            # Store in ChromaDB
            self.patient_collection.add(
                documents=[str(category_data)],
                metadatas=[category_data],
                ids=[f"patient_{row['Category']}_{row['Data Code']}"]
            )

    def process_schemes_data(self, schemes_csv):
        df = pd.read_csv(schemes_csv)
        
        for _, row in df.iterrows():
            scheme_data = row.to_dict()
            
            # Store in ChromaDB
            self.schemes_collection.add(
                documents=[str(scheme_data)],
                metadatas=[scheme_data],
                ids=[f"scheme_{row['Category']}_{row['Scheme Name']}"]
            )

    def calculate_trend(self, time_series):
        # Calculate basic trend indicators
        mean = time_series.mean()
        std = time_series.std()
        growth_rate = (time_series.iloc[-1] - time_series.iloc[0]) / time_series.iloc[0] * 100
        
        return {
            'mean': mean,
            'std': std,
            'growth_rate': growth_rate,
            'trend_direction': 'increasing' if growth_rate > 0 else 'decreasing'
        }

    def analyze_category(self, category_name):
        # Retrieve relevant patient data
        patient_results = self.patient_collection.query(
            query_texts=[category_name],
            n_results=5
        )
        
        # Retrieve relevant schemes
        scheme_results = self.schemes_collection.query(
            query_texts=[category_name],
            n_results=5
        )
        
        # Prepare prompt for analysis
        prompt = f"""
        Analyze the following healthcare category: {category_name}
        
        Patient Data Trends:
        {patient_results}
        
        Related Government Schemes:
        {scheme_results}
        
        Please provide:
        1. Summary of patient trends
        2. Analysis of existing schemes' effectiveness
        3. Suggestions for new schemes based on trends
        """
        
        # Generate analysis using local LLM
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(
            inputs.input_ids,
            max_length=1000,
            temperature=0.7,
            num_return_sequences=1
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def suggest_new_scheme(self, category_name, trend_analysis):
        # Create prompt for scheme suggestion
        prompt = f"""
        Based on the following trend analysis for {category_name}:
        {trend_analysis}
        
        Suggest a new government scheme that addresses:
        1. Current gaps in coverage
        2. Emerging trends in patient needs
        3. Implementation strategy
        """
        
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(
            inputs.input_ids,
            max_length=800,
            temperature=0.8,
            num_return_sequences=1
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
# Initialize the system
rag_system = HealthcareRAG()

# Process your data
rag_system.process_patient_data('Data.csv')
rag_system.process_schemes_data('schemes.csv')

# Analyze a specific category
analysis = rag_system.analyze_category('Pregnancy')
new_scheme = rag_system.suggest_new_scheme('Pregnancy', analysis)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`