In [1]:
pip install google-generativeai


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import google.generativeai as genai

# Set your API key
os.environ["GEMINI_API_KEY"] = "AIzaSyDMQqWo7lFadAOXpvee5YSvm3RMVApBi1g"
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Create the model configuration
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,  # Maximum number of tokens in the output
    "response_mime_type": "text/plain",
}

# Initialize the Generative Model
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-002",  # Use the correct model name as needed
    generation_config=generation_config,
)

# Start a chat session
chat_session = model.start_chat(history=[])

# Send a message (prompt) to the model
try:
    response = chat_session.send_message("Explain the impact of machine learning in healthcare.")
    # Print the response from the Gemini model
    print(response.text)

except genai.generation_types.StopCandidateException as e:
    # Handle the exception, log or print the citation if needed
    print("Recitation detected in the response:")
    print(e.candidate.citation_metadata)



Machine learning (ML) is rapidly transforming healthcare, offering the potential to improve diagnostics, treatments, and overall patient care.  Here's a breakdown of its impact:

**1. Disease Diagnosis and Prediction:**

* **Image Analysis:** ML algorithms excel at analyzing medical images like X-rays, CT scans, and MRIs, helping to detect diseases like cancer, pneumonia, and eye conditions earlier and more accurately than traditional methods. They can also identify subtle patterns that might be missed by human eyes.
* **Risk Prediction:** By analyzing patient data (medical history, genetics, lifestyle), ML models can predict the likelihood of developing certain diseases like diabetes, heart disease, or Alzheimer's. This allows for early interventions and preventive measures.

**2. Personalized Treatment and Medicine:**

* **Drug Discovery and Development:** ML accelerates the process of discovering and developing new drugs by identifying potential drug candidates, predicting their eff

In [3]:
import pandas as pd
import time
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load your dataset
df = pd.read_csv('labeled_comments_cleaned.csv')
df.fillna('', inplace=True)

# Function to generate zero-shot prompt
def create_zero_shot_prompt(comment):
    prompt = f"""
    Classify the following comment into the appropriate categories for the hierarchical levels:

    Comment: "{comment}"

    Predict the following levels:
    - Level 0: [Information Exchange, Modification, Social Communication, Other]
    - Level 1: [Requested, Execution, Discussion, Provided, Feedback, Acknowledgment]
    - Level 2: [Requesting Confirmation, Promise, Content, Reference, Context, Thread, Done, Format, Asking Details]
    - Level 3: [Potential Change, Not Potential Change, Explicit, Not Explicit]
    - Level 4: [Change, Add, Delete]

    Return the predicted categories for each level as 'Level 0: category, Level 1: category, ...'.
    """
    return prompt

# Function to parse Gemini's response to extract hierarchical levels
def parse_gemini_response(response_text):
    levels = {}
    for line in response_text.split("\n"):
        if "Level 0" in line:
            levels['level_0'] = line.split(":")[1].strip()
        elif "Level 1" in line:
            levels['level_1'] = line.split(":")[1].strip()
        elif "Level 2" in line:
            levels['level_2'] = line.split(":")[1].strip()
        elif "Level 3" in line:
            levels['level_3'] = line.split(":")[1].strip()
        elif "Level 4" in line:
            levels['level_4'] = line.split(":")[1].strip()
    return levels

# Processing comments with Zero-shot learning
batch_size = 500# Adjust the batch size as needed
gemini_responses = []

for start_idx in range(0, len(df), batch_size):
    end_idx = min(start_idx + batch_size, len(df))
    batch = df['comment_full_text'].iloc[start_idx:end_idx]  # Get the current batch of comments

    print(f"Processing comments {start_idx+1} to {end_idx}...")

    for idx, comment in enumerate(batch):
        print(f"Processing comment {start_idx + idx + 1}/{len(df)}: {comment[:50]}...")

        try:
            # Generate zero-shot prompt
            prompt = create_zero_shot_prompt(comment)
            
            # Send prompt to Gemini (Assuming chat_session is already initialized)
            response = chat_session.send_message(prompt)
            gemini_responses.append(response.text.strip())  # Store response
        except Exception as e:
            print(f"Error processing comment {start_idx + idx + 1}: {e}")
            gemini_responses.append("Error")

    # Save progress after every batch
    temp_df = df.iloc[:len(gemini_responses)].copy()
    temp_df['gemini_responses'] = gemini_responses
    temp_df.to_csv(f'gemini_responses_progress_{end_idx}.csv', index=False)

    print(f"Batch {start_idx+1} to {end_idx} complete! Saved progress to 'gemini_responses_progress_{end_idx}.csv'.")

    # Optional: Pause between batches to avoid rate limits (adjust sleep time as needed)
    time.sleep(30)  # Wait 30 seconds between batches

# Ensure all comments are processed, if not extend the list
if len(gemini_responses) < len(df):
    gemini_responses.extend(["Error"] * (len(df) - len(gemini_responses)))

# Add the generated responses to the DataFrame
df['gemini_responses'] = gemini_responses

# Save the final results to a CSV file
df.to_csv('gemini_classified_comments_zero_shot.csv', index=False)

print("All batches processed! Final results saved to 'gemini_classified_comments_zero_shot.csv'.")

# Function to evaluate and calculate metrics
def evaluate_metrics(df):
    # True labels
    y_true = df[['level_0', 'level_1', 'level_2', 'level_3', 'level_4']].values
    
    # Parse the responses from Gemini into hierarchical levels
    y_pred = [parse_gemini_response(resp) for resp in df['gemini_responses']]
    
    # Separate the true and predicted values for each level
    metrics_results = {}
    for i in range(5):  # For levels 0 through 4
        level = f"level_{i}"
        y_true_level = [row[i].strip().lower() for row in y_true]
        y_pred_level = [row[level].strip().lower() for row in y_pred if level in row]

        # Calculate F1-score, Precision, Recall, and Accuracy for the level
        f1 = f1_score(y_true_level, y_pred_level, average='weighted')
        precision = precision_score(y_true_level, y_pred_level, average='weighted')
        recall = recall_score(y_true_level, y_pred_level, average='weighted')
        accuracy = accuracy_score(y_true_level, y_pred_level)

        metrics_results[level] = {
            'f1_score': f1,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy
        }
        
        print(f"{level} - F1 Score: {f1}, Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")

    return metrics_results

# Evaluate the model's performance on the test data
metrics_results = evaluate_metrics(df)

# Optionally save the metrics results to a file
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv('gemini_zero_shot_metrics.csv', index=False)

print("Evaluation metrics saved to 'gemini_zero_shot_metrics.csv'.")


Processing comments 1 to 500...
Processing comment 1/4991: Ok?...
Processing comment 2/4991: This has been discussed in the Executive Summary v...
Processing comment 3/4991: This has been discussed in the Executive Summary v...
Processing comment 4/4991: This has been discussed in the Executive Summary v...
Processing comment 5/4991: CODING

Code qualitative data for WAVGUAGE03A...
Processing comment 6/4991: The PROPOSAL section is not to be bound within the...
Processing comment 7/4991: Insert most recent fixed index price....
Processing comment 8/4991: *  This clause is likely to be contentious. It's s...
Processing comment 9/4991: *  This clause is likely to be contentious. It's s...
Processing comment 10/4991: *  This clause is likely to be contentious. It's s...
Processing comment 11/4991: This section was moved from the previous rule 206....
Processing comment 12/4991: A good plan...
Processing comment 13/4991: Moved from E...
Processing comment 14/4991: “Performs routine and spe

IndexError: list index out of range

In [4]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Function to parse Gemini's response to extract hierarchical levels
def parse_gemini_response(response_text):
    levels = {'level_0': 'Unknown', 'level_1': 'Unknown', 'level_2': 'Unknown', 'level_3': 'Unknown', 'level_4': 'Unknown'}  # Default values

    for line in response_text.split("\n"):
        if "Level 0" in line:
            levels['level_0'] = line.split(":")[1].strip() if len(line.split(":")) > 1 else "Unknown"
        elif "Level 1" in line:
            levels['level_1'] = line.split(":")[1].strip() if len(line.split(":")) > 1 else "Unknown"
        elif "Level 2" in line:
            levels['level_2'] = line.split(":")[1].strip() if len(line.split(":")) > 1 else "Unknown"
        elif "Level 3" in line:
            levels['level_3'] = line.split(":")[1].strip() if len(line.split(":")) > 1 else "Unknown"
        elif "Level 4" in line:
            levels['level_4'] = line.split(":")[1].strip() if len(line.split(":")) > 1 else "Unknown"
    return levels

# Function to evaluate and calculate metrics
def evaluate_metrics(df):
    # True labels
    y_true = df[['level_0', 'level_1', 'level_2', 'level_3', 'level_4']].values
    
    # Parse the responses from Gemini into hierarchical levels
    y_pred = [parse_gemini_response(resp) for resp in df['gemini_responses']]
    
    # Separate the true and predicted values for each level
    metrics_results = {}
    for i in range(5):  # For levels 0 through 4
        level = f"level_{i}"
        y_true_level = [row[i].strip().lower() for row in y_true]
        y_pred_level = [row[level].strip().lower() for row in y_pred if level in row]

        # Calculate F1-score, Precision, Recall, and Accuracy for the level
        f1 = f1_score(y_true_level, y_pred_level, average='weighted', zero_division=1)
        precision = precision_score(y_true_level, y_pred_level, average='weighted', zero_division=1)
        recall = recall_score(y_true_level, y_pred_level, average='weighted', zero_division=1)
        accuracy = accuracy_score(y_true_level, y_pred_level)

        metrics_results[level] = {
            'f1_score': f1,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy
        }
        
        print(f"{level} - F1 Score: {f1}, Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")

    return metrics_results

# Load the partially classified dataset
df = pd.read_csv('gemini_responses_progress_1500.csv')  # Your file where 1500 comments are classified

# Evaluate the model's performance on the 1500 test data
metrics_results = evaluate_metrics(df)

# Optionally save the metrics results to a file
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv('gemini_zero_shot_metrics_1500.csv', index=False)

print("Evaluation metrics saved to 'gemini_zero_shot_metrics_1500.csv'.")


level_0 - F1 Score: 0.0, Precision: 1.0, Recall: 0.0, Accuracy: 0.0


AttributeError: 'float' object has no attribute 'strip'