In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import os

# Create a 'visualizations' folder if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

# Load the JSON files
with open('medical_features_finetuned.json', 'r') as f:
    finetuned_data = json.load(f)

with open('medical_features_baseline.json', 'r') as f:
    baseline_data = json.load(f)

# Function to extract words and activations
def extract_words_activations(data, word_type):
    words = []
    activations = []
    for key in data:
        for item in data[key][word_type]:
            words.append(item['word'])
            activations.append(item['activation'])
    return words, activations

# Extract data
finetuned_medical_words, finetuned_medical_activations = extract_words_activations(finetuned_data, 'medical_words')
finetuned_top_words, finetuned_top_activations = extract_words_activations(finetuned_data, 'top_words')
baseline_medical_words, baseline_medical_activations = extract_words_activations(baseline_data, 'medical_words')
baseline_top_words, baseline_top_activations = extract_words_activations(baseline_data, 'top_words')

# Create DataFrames
df_finetuned_medical = pd.DataFrame({'word': finetuned_medical_words, 'activation': finetuned_medical_activations, 'type': 'finetuned'})
df_baseline_medical = pd.DataFrame({'word': baseline_medical_words, 'activation': baseline_medical_activations, 'type': 'baseline'})
df_medical = pd.concat([df_finetuned_medical, df_baseline_medical])

# 1. Activation value distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=df_medical, x='activation', hue='type', kde=True)
plt.xlabel('Activation')
plt.ylabel('Count')
plt.savefig('visualizations/activation_distribution.png')
plt.close()

# 2. Top 10 medical words by activation
top_10_medical = df_medical.groupby('word')['activation'].mean().nlargest(10).reset_index()
plt.figure(figsize=(12, 6))
sns.barplot(data=top_10_medical, x='word', y='activation')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('visualizations/top_10_medical_words.png')
plt.close()

# 3. Comparison of unique words
unique_finetuned = set(finetuned_medical_words)
unique_baseline = set(baseline_medical_words)
common_words = unique_finetuned.intersection(unique_baseline)
only_finetuned = unique_finetuned - unique_baseline
only_baseline = unique_baseline - unique_finetuned

plt.figure(figsize=(10, 6))
venn2([unique_finetuned, unique_baseline], ('Finetuned', 'Baseline'))
plt.savefig('visualizations/unique_words_comparison.png')
plt.close()

print(f"Number of common words: {len(common_words)}")
print(f"Words only in finetuned: {len(only_finetuned)}")
print(f"Words only in baseline: {len(only_baseline)}")

# 4. Number of medical matches
def count_medical_matches(data):
    counts = {}
    for key in data:
        num_matches = len(data[key]['medical_words'])
        counts[num_matches] = counts.get(num_matches, 0) + 1
    return counts

finetuned_matches = count_medical_matches(finetuned_data)
baseline_matches = count_medical_matches(baseline_data)

plt.figure(figsize=(12, 6))
x = list(set(list(finetuned_matches.keys()) + list(baseline_matches.keys())))
plt.bar([i-0.2 for i in x], [finetuned_matches.get(i, 0) for i in x], width=0.4, label='Finetuned', align='center')
plt.bar([i+0.2 for i in x], [baseline_matches.get(i, 0) for i in x], width=0.4, label='Baseline', align='center')
plt.xlabel('Number of Medical Matches')
plt.ylabel('Count')
plt.legend()
plt.xticks(x)
plt.savefig('visualizations/medical_matches_comparison.png')
plt.close()

print("Analysis complete. Check the 'visualizations' folder for the generated PNG files.")

Number of common words: 101
Words only in finetuned: 9
Words only in baseline: 53
Analysis complete. Check the 'visualizations' folder for the generated PNG files.


In [4]:
import numpy as np

In [7]:
print(f"Finetuned Activation Value Mean: {round(np.mean(finetuned_medical_activations), 3)}")
print(f"Baseline Activation Value Mean: {round(np.mean(baseline_medical_activations), 3)}")

Finetuned Activation Value Mean: 0.794
Baseline Activation Value Mean: 2.166


In [9]:
print(f"Finetuned Activation Value Variance: {round(np.var(finetuned_medical_activations), 3)}")
print(f"Baseline Activation Value Variance: {round(np.var(baseline_medical_activations), 3)}")

Finetuned Activation Value Variance: 0.018
Baseline Activation Value Variance: 0.633


In [2]:
only_finetuned

{'stroke',
 'Ġaspirin',
 'Ġbowel',
 'Ġmedically',
 'Ġnursery',
 'Ġquarantine',
 'Ġstroke',
 'Ġstrokes',
 'Ġtransplant'}

In [3]:
only_baseline

{'Blood',
 'Doctor',
 'Heart',
 'Medical',
 'blood',
 'brain',
 'clinical',
 'hearted',
 'hospital',
 'immune',
 'medical',
 'ĠAddiction',
 'ĠAlzheimer',
 'ĠAnxiety',
 'ĠBlood',
 'ĠClinical',
 'ĠGenetic',
 'ĠHeart',
 'ĠHearth',
 'ĠHearts',
 'ĠMedical',
 'ĠNurse',
 'ĠPTSD',
 'ĠPediatrics',
 'ĠPhysicians',
 'ĠPsychiatry',
 'Ġaddiction',
 'Ġantidepressant',
 'Ġantidepressants',
 'Ġanxiety',
 'Ġautoimmune',
 'Ġbiomedical',
 'Ġblood',
 'Ġbloodstream',
 'Ġbreastfeeding',
 'Ġchronically',
 'Ġclinic',
 'Ġclinical',
 'Ġclinically',
 'Ġclinicians',
 'Ġclinics',
 'Ġheartbeat',
 'Ġhearts',
 'Ġhospital',
 'Ġhospitalized',
 'Ġhospitals',
 'Ġinsulin',
 'Ġnurse',
 'Ġnurses',
 'Ġpsychiatry',
 'Ġsteroid',
 'Ġsteroids',
 'Ġultrasound'}

In [4]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import os

# Create a 'visualizations' folder if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

# Load the JSON files
with open('medical_features_finetuned.json', 'r') as f:
    finetuned_data = json.load(f)

with open('medical_features_baseline.json', 'r') as f:
    baseline_data = json.load(f)

# Function to extract words and activations
def extract_words_activations(data, word_type):
    words = []
    activations = []
    for key in data:
        for item in data[key][word_type]:
            words.append(item['word'])
            activations.append(item['activation'])
    return words, activations

# Extract data
finetuned_medical_words, finetuned_medical_activations = extract_words_activations(finetuned_data, 'medical_words')
finetuned_top_words, finetuned_top_activations = extract_words_activations(finetuned_data, 'top_words')
baseline_medical_words, baseline_medical_activations = extract_words_activations(baseline_data, 'medical_words')
baseline_top_words, baseline_top_activations = extract_words_activations(baseline_data, 'top_words')

# Create DataFrames
df_finetuned_medical = pd.DataFrame({'word': finetuned_medical_words, 'activation': finetuned_medical_activations, 'type': 'finetuned'})
df_baseline_medical = pd.DataFrame({'word': baseline_medical_words, 'activation': baseline_medical_activations, 'type': 'baseline'})
df_medical = pd.concat([df_finetuned_medical, df_baseline_medical])

# ... (previous visualization code remains the same)

# New function for unique words analysis and visualization
def analyze_unique_words(finetuned_words, baseline_words, title, filename):
    unique_finetuned = set(finetuned_words)
    unique_baseline = set(baseline_words)
    common_words = unique_finetuned.intersection(unique_baseline)
    only_finetuned = unique_finetuned - unique_baseline
    only_baseline = unique_baseline - unique_finetuned

    plt.figure(figsize=(10, 6))
    venn2([unique_finetuned, unique_baseline], ('Finetuned', 'Baseline'))
    plt.title(title)
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

    print(f"\n{title}:")
    print(f"Number of common words: {len(common_words)}")
    print(f"Words only in finetuned: {len(only_finetuned)}")
    print(f"Words only in baseline: {len(only_baseline)}")

    return common_words, only_finetuned, only_baseline

# Analyze unique words for both medical and top words
medical_common, medical_only_finetuned, medical_only_baseline = analyze_unique_words(
    finetuned_medical_words, baseline_medical_words,
    "Comparison of Unique Medical Words", "unique_medical_words_comparison"
)

top_common, top_only_finetuned, top_only_baseline = analyze_unique_words(
    finetuned_top_words, baseline_top_words,
    "Comparison of Unique Top Words", "unique_top_words_comparison"
)

# Additional analysis: Top 10 unique words by activation
def top_unique_words(words, activations, unique_set, n=10):
    df = pd.DataFrame({'word': words, 'activation': activations})
    df = df[df['word'].isin(unique_set)].nlargest(n, 'activation')
    return df

# Visualize top unique words
def plot_top_unique_words(df, title, filename):
    plt.figure(figsize=(12, 6))
    sns.barplot(data=df, x='word', y='activation')
    plt.title(title)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

# Top unique medical words
top_unique_finetuned_medical = top_unique_words(finetuned_medical_words, finetuned_medical_activations, medical_only_finetuned)
top_unique_baseline_medical = top_unique_words(baseline_medical_words, baseline_medical_activations, medical_only_baseline)

plot_top_unique_words(top_unique_finetuned_medical, "Top 10 Unique Medical Words in Finetuned Model", "top_unique_finetuned_medical")
plot_top_unique_words(top_unique_baseline_medical, "Top 10 Unique Medical Words in Baseline Model", "top_unique_baseline_medical")

# Top unique top words
top_unique_finetuned_top = top_unique_words(finetuned_top_words, finetuned_top_activations, top_only_finetuned)
top_unique_baseline_top = top_unique_words(baseline_top_words, baseline_top_activations, top_only_baseline)

plot_top_unique_words(top_unique_finetuned_top, "Top 10 Unique Top Words in Finetuned Model", "top_unique_finetuned_top")
plot_top_unique_words(top_unique_baseline_top, "Top 10 Unique Top Words in Baseline Model", "top_unique_baseline_top")

print("\nAnalysis complete. Check the 'visualizations' folder for the generated PNG files.")


Comparison of Unique Medical Words:
Number of common words: 101
Words only in finetuned: 9
Words only in baseline: 53

Comparison of Unique Top Words:
Number of common words: 199
Words only in finetuned: 400
Words only in baseline: 394

Analysis complete. Check the 'visualizations' folder for the generated PNG files.


In [5]:
print(top_unique_finetuned_top)

          word  activation
1151     plate    1.194993
220      Ġmell    1.168636
650     Ġcytok    1.168211
1152   Ġbreast    1.156664
300     Ġgland    1.139597
200     ĠLeone    1.137223
850        oma    1.130685
301    Ġglands    1.092097
770   Ġbilling    1.068011
420     Ġgamma    1.066886


In [5]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import os
from collections import Counter

# Create a 'visualizations' folder if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

# Load the JSON files
with open('medical_features_finetuned.json', 'r') as f:
    finetuned_data = json.load(f)

with open('medical_features_baseline.json', 'r') as f:
    baseline_data = json.load(f)

# Function to extract words and activations
def extract_words_activations(data, word_type):
    words = []
    activations = []
    for key in data:
        for item in data[key][word_type]:
            words.append(item['word'])
            activations.append(item['activation'])
    return words, activations

# Extract data
finetuned_medical_words, finetuned_medical_activations = extract_words_activations(finetuned_data, 'medical_words')
finetuned_top_words, finetuned_top_activations = extract_words_activations(finetuned_data, 'top_words')
baseline_medical_words, baseline_medical_activations = extract_words_activations(baseline_data, 'medical_words')
baseline_top_words, baseline_top_activations = extract_words_activations(baseline_data, 'top_words')

# Create DataFrames
df_finetuned_medical = pd.DataFrame({'word': finetuned_medical_words, 'activation': finetuned_medical_activations, 'type': 'finetuned'})
df_baseline_medical = pd.DataFrame({'word': baseline_medical_words, 'activation': baseline_medical_activations, 'type': 'baseline'})
df_medical = pd.concat([df_finetuned_medical, df_baseline_medical])

# ... (previous code remains the same)

# New function for analyzing and visualizing word frequencies
def analyze_word_frequencies(words, title, filename, top_n=20):
    word_counts = Counter(words)
    top_words = word_counts.most_common(top_n)

    df = pd.DataFrame(top_words, columns=['word', 'count'])

    plt.figure(figsize=(12, 6))
    sns.barplot(data=df, x='word', y='count')
    plt.title(title)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

    return df

# Analyze word frequencies
finetuned_medical_freq = analyze_word_frequencies(finetuned_medical_words,
                                                  "Top 20 Most Frequent Medical Words (Finetuned)",
                                                  "finetuned_medical_freq")
baseline_medical_freq = analyze_word_frequencies(baseline_medical_words,
                                                 "Top 20 Most Frequent Medical Words (Baseline)",
                                                 "baseline_medical_freq")
finetuned_top_freq = analyze_word_frequencies(finetuned_top_words,
                                              "Top 20 Most Frequent Top Words (Finetuned)",
                                              "finetuned_top_freq")
baseline_top_freq = analyze_word_frequencies(baseline_top_words,
                                             "Top 20 Most Frequent Top Words (Baseline)",
                                             "baseline_top_freq")

# Function for analyzing and visualizing highest activating words
def analyze_highest_activations(words, activations, title, filename, top_n=20):
    df = pd.DataFrame({'word': words, 'activation': activations})
    df = df.groupby('word')['activation'].mean().nlargest(top_n).reset_index()

    plt.figure(figsize=(12, 6))
    sns.barplot(data=df, x='word', y='activation')
    plt.title(title)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

    return df

# Analyze highest activating words
finetuned_medical_high_act = analyze_highest_activations(finetuned_medical_words, finetuned_medical_activations,
                                                         "Top 20 Highest Activating Medical Words (Finetuned)",
                                                         "finetuned_medical_high_act")
baseline_medical_high_act = analyze_highest_activations(baseline_medical_words, baseline_medical_activations,
                                                        "Top 20 Highest Activating Medical Words (Baseline)",
                                                        "baseline_medical_high_act")
finetuned_top_high_act = analyze_highest_activations(finetuned_top_words, finetuned_top_activations,
                                                     "Top 20 Highest Activating Top Words (Finetuned)",
                                                     "finetuned_top_high_act")
baseline_top_high_act = analyze_highest_activations(baseline_top_words, baseline_top_activations,
                                                    "Top 20 Highest Activating Top Words (Baseline)",
                                                    "baseline_top_high_act")

# Print summary statistics
print("\nMost Frequent Words Summary:")
print("\nFinetuned Medical Words:")
print(finetuned_medical_freq.head().to_string(index=False))
print("\nBaseline Medical Words:")
print(baseline_medical_freq.head().to_string(index=False))
print("\nFinetuned Top Words:")
print(finetuned_top_freq.head().to_string(index=False))
print("\nBaseline Top Words:")
print(baseline_top_freq.head().to_string(index=False))

print("\nHighest Activating Words Summary:")
print("\nFinetuned Medical Words:")
print(finetuned_medical_high_act.head().to_string(index=False))
print("\nBaseline Medical Words:")
print(baseline_medical_high_act.head().to_string(index=False))
print("\nFinetuned Top Words:")
print(finetuned_top_high_act.head().to_string(index=False))
print("\nBaseline Top Words:")
print(baseline_top_high_act.head().to_string(index=False))

print("\nAnalysis complete. Check the 'visualizations' folder for the generated PNG files.")


Most Frequent Words Summary:

Finetuned Medical Words:
     word  count
ĠPatients     18
  Ġtumors     15
Ġpatients     14
  Ġcancer     13
 ĠSurgery     11

Baseline Medical Words:
      word  count
Ġillnesses      7
   Doctors      7
Ġphysician      7
  Ġcancers      7
  Ġsurgeon      7

Finetuned Top Words:
     word  count
ĠPatients     18
  Ġtumors     15
Ġpatients     14
  Ġcancer     13
 ĠSurgery     11

Baseline Top Words:
      word  count
Ġillnesses      7
   Doctors      7
Ġphysician      7
  Ġcancers      7
  Ġsurgeon      7

Highest Activating Words Summary:

Finetuned Medical Words:
          word  activation
       Ġstroke    1.205280
       Ġbreast    1.119046
      ĠChronic    1.089770
      Ġchronic    1.064143
Ġpsychiatrists    1.008942

Baseline Medical Words:
       word  activation
  Ġsteroids       4.343
     Ġfever       4.140
    Ġbreast       3.887
Ġdepression       3.761
      Ġlung       3.756

Finetuned Top Words:
     word  activation
   Ġcytok    1.21607

In [7]:
unique_baseline

{'Blood',
 'Bone',
 'Brain',
 'Doctor',
 'Doctors',
 'Heart',
 'MRI',
 'Medical',
 'blood',
 'blooded',
 'bone',
 'bones',
 'brain',
 'cancer',
 'clinical',
 'doctor',
 'heart',
 'hearted',
 'heartedly',
 'hospital',
 'immune',
 'medical',
 'otherapy',
 'patient',
 'stroke',
 'treatment',
 'ĠAddiction',
 'ĠAlzheimer',
 'ĠAnxiety',
 'ĠBlood',
 'ĠBloody',
 'ĠBone',
 'ĠBones',
 'ĠBrain',
 'ĠCancer',
 'ĠChronic',
 'ĠChronicle',
 'ĠChronicles',
 'ĠClinic',
 'ĠClinical',
 'ĠDepression',
 'ĠDiabetes',
 'ĠDisease',
 'ĠDiseases',
 'ĠDoctor',
 'ĠDoctors',
 'ĠFever',
 'ĠGenetic',
 'ĠGenetics',
 'ĠHeart',
 'ĠHearth',
 'ĠHearts',
 'ĠHospital',
 'ĠLung',
 'ĠMRI',
 'ĠMedical',
 'ĠMuscle',
 'ĠNurse',
 'ĠPTSD',
 'ĠPatient',
 'ĠPatients',
 'ĠPediatrics',
 'ĠPhysicians',
 'ĠPsychiatry',
 'ĠSurgery',
 'ĠSymptoms',
 'ĠTherapy',
 'ĠTreatment',
 'Ġacute',
 'Ġaddiction',
 'Ġantibiotic',
 'Ġantibiotics',
 'Ġantidepressant',
 'Ġantidepressants',
 'Ġanxiety',
 'Ġartery',
 'Ġarthritis',
 'Ġasteroid',
 'Ġasteroids

In [8]:
unique_finetuned

{'Bone',
 'Doctors',
 'MRI',
 'blooded',
 'bone',
 'bones',
 'brain',
 'cancer',
 'doctor',
 'heart',
 'hospital',
 'immune',
 'otherapy',
 'patient',
 'stroke',
 'treatment',
 'ĠBloody',
 'ĠBone',
 'ĠBones',
 'ĠCancer',
 'ĠChronic',
 'ĠClinic',
 'ĠDepression',
 'ĠDiabetes',
 'ĠDisease',
 'ĠDiseases',
 'ĠDoctor',
 'ĠDoctors',
 'ĠFever',
 'ĠGenetics',
 'ĠLung',
 'ĠMedical',
 'ĠPatient',
 'ĠPatients',
 'ĠPsychiatry',
 'ĠRadiation',
 'ĠSurgery',
 'ĠSymptoms',
 'ĠTherapy',
 'ĠTreatment',
 'Ġantibiotic',
 'Ġanxiety',
 'Ġartery',
 'Ġarthritis',
 'Ġaspirin',
 'Ġasthma',
 'Ġbone',
 'Ġbones',
 'Ġbrain',
 'Ġbrains',
 'Ġcancer',
 'Ġcancers',
 'Ġcardiovascular',
 'Ġchemotherapy',
 'Ġchronic',
 'Ġclinic',
 'Ġclinically',
 'Ġclinics',
 'Ġdepression',
 'Ġdiabetes',
 'Ġdiagnosis',
 'Ġdisease',
 'Ġdiseases',
 'Ġdoctor',
 'Ġdoctoral',
 'Ġdoctors',
 'Ġfever',
 'Ġfracture',
 'Ġfractures',
 'Ġgenetic',
 'Ġgenetically',
 'Ġgenetics',
 'Ġheart',
 'Ġhypertension',
 'Ġillness',
 'Ġimmune',
 'Ġinfection',
 'Ġin

In [9]:
# prompt: give me code to compare the output of gpt2 from hf and Sharathhebbar24/Med_GPT2 with medical prompts

# !pip install transformers

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load the Hugging Face GPT-2 model
hf_model_name = "gpt2"
hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
hf_model = AutoModelForCausalLM.from_pretrained(hf_model_name)
hf_generator = pipeline('text-generation', model=hf_model, tokenizer=hf_tokenizer)

# Load the Med_GPT2 model
med_model_name = "Sharathhebbar24/Med_GPT2"
med_tokenizer = AutoTokenizer.from_pretrained(med_model_name)
med_model = AutoModelForCausalLM.from_pretrained(med_model_name)
med_generator = pipeline('text-generation', model=med_model, tokenizer=med_tokenizer)

# Define medical prompts
medical_prompts = [
    "What are the symptoms of a heart attack?",
    "Describe the treatment options for diabetes.",
    "How does the human immune system work?",
    "Explain the process of blood clotting."
]

# Generate responses from both models
for prompt in medical_prompts:
    print(f"\nPrompt: {prompt}")

    hf_response = hf_generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
    print(f"Hugging Face GPT-2 Response: {hf_response}")

    med_response = med_generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
    print(f"Med_GPT2 Response: {med_response}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/249M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: What are the symptoms of a heart attack?


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Hugging Face GPT-2 Response: What are the symptoms of a heart attack?

A heart attack can only happen if the heart stops pumping and there is no pumping at all. If you cannot get your heart pump going quickly, try this:

If you are in a


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Med_GPT2 Response: What are the symptoms of a heart attack?
  A heart attack of the chest or a heart valve, which leads to pressure build-ups.

Episodes of sudden heart failure - which is usually when a high blood pressure is present.

Prompt: Describe the treatment options for diabetes.
Hugging Face GPT-2 Response: Describe the treatment options for diabetes. (a) Treatment Options for Diabetes.—There are 14 treatment options available to a person beginning to exercise before he or she is 65 years of age: DRS, DASH, or DDS.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Med_GPT2 Response: Describe the treatment options for diabetes.
assistant There are many treatment options for diabetes that can be prescribed by the dietician to aid in overall weight loss and reduce side effects. These include the use of insulin and ketone bodies as they prevent

Prompt: How does the human immune system work?
Hugging Face GPT-2 Response: How does the human immune system work?

Human immune systems are complex things, which means that you can use the term "brain immune system" to describe them in very general terms. (Don't like that?) These are different things altogether,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Med_GPT2 Response: How does the human immune system work?

The human immune system is composed of the organs of the immune system such as the blood, cells, and blood vessels that make up the nucleus. The immunoprotective cells are located in the tissues

Prompt: Explain the process of blood clotting.
Hugging Face GPT-2 Response: Explain the process of blood clotting. I had a problem with it, but it was caused by my own metabolism. I did the right thing. My thyroid glands came up and activated. It was a very exciting time."

Diet
Med_GPT2 Response: Explain the process of blood clotting.

Transformation.
Clotting which seems to be formed on blood is caused by a clotting event which has been caused by a normal clotting event. Blood clotting occurs in an absence


In [6]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import os
from collections import Counter
import numpy as np
from wordcloud import WordCloud

# ... (previous code remains the same)

# New visualizations

# 1. Word Cloud of Top Words
def create_word_cloud(words, activations, title, filename):
    word_freq = dict(zip(words, activations))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

create_word_cloud(finetuned_top_words, finetuned_top_activations, "Word Cloud of Top Words (Finetuned)", "finetuned_wordcloud")
create_word_cloud(baseline_top_words, baseline_top_activations, "Word Cloud of Top Words (Baseline)", "baseline_wordcloud")

# 2. Activation Distribution Comparison
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df_medical, x='activation', hue='type', shade=True)
plt.title('Activation Distribution Comparison')
plt.xlabel('Activation')
plt.ylabel('Density')
plt.savefig('visualizations/activation_distribution_comparison.png')
plt.close()

# 3. Scatter plot of word frequency vs activation
def plot_freq_vs_activation(words, activations, title, filename):
    word_freq = Counter(words)
    df = pd.DataFrame({'word': words, 'activation': activations, 'frequency': [word_freq[w] for w in words]})
    df = df.groupby('word').agg({'activation': 'mean', 'frequency': 'first'}).reset_index()

    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='frequency', y='activation', alpha=0.6)
    plt.title(title)
    plt.xlabel('Word Frequency')
    plt.ylabel('Average Activation')
    for i, row in df.nlargest(10, 'activation').iterrows():
        plt.annotate(row['word'], (row['frequency'], row['activation']))
    plt.tight_layout()
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

plot_freq_vs_activation(finetuned_medical_words, finetuned_medical_activations,
                        "Word Frequency vs Activation (Finetuned Medical)", "finetuned_medical_freq_vs_act")
plot_freq_vs_activation(baseline_medical_words, baseline_medical_activations,
                        "Word Frequency vs Activation (Baseline Medical)", "baseline_medical_freq_vs_act")

# 4. Activation change for common words
def plot_activation_change(finetuned_words, finetuned_activations, baseline_words, baseline_activations, title, filename):
    finetuned_df = pd.DataFrame({'word': finetuned_words, 'activation': finetuned_activations}).groupby('word').mean()
    baseline_df = pd.DataFrame({'word': baseline_words, 'activation': baseline_activations}).groupby('word').mean()

    common_words = set(finetuned_df.index) & set(baseline_df.index)
    df = pd.DataFrame({
        'word': list(common_words),
        'finetuned': [finetuned_df.loc[w, 'activation'] for w in common_words],
        'baseline': [baseline_df.loc[w, 'activation'] for w in common_words]
    })
    df['change'] = df['finetuned'] - df['baseline']
    df = df.sort_values('change')

    plt.figure(figsize=(12, 8))
    plt.bar(range(len(df)), df['change'], align='center')
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Activation Change (Finetuned - Baseline)')
    plt.xticks(range(len(df)), df['word'], rotation=90)
    plt.tight_layout()
    plt.savefig(f'visualizations/{filename}.png')
    plt.close()

    return df

activation_change_df = plot_activation_change(finetuned_medical_words, finetuned_medical_activations,
                                              baseline_medical_words, baseline_medical_activations,
                                              "Activation Change for Common Medical Words", "medical_activation_change")

# 5. Top 10 words with largest activation increase and decrease
print("\nTop 10 medical words with largest activation increase:")
print(activation_change_df.nlargest(10, 'change')[['word', 'change']].to_string(index=False))

print("\nTop 10 medical words with largest activation decrease:")
print(activation_change_df.nsmallest(10, 'change')[['word', 'change']].to_string(index=False))
print("\nExtended analysis complete. Check the 'visualizations' folder for the new PNG files.")


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=df_medical, x='activation', hue='type', shade=True)



Top 10 medical words with largest activation increase:
           word    change
     Ġremission  0.008371
        Ġartery -0.363113
    Ġoutpatient -0.402808
        ĠBloody -0.554491
       Ġnervous -0.705824
        blooded -0.770744
Ġcardiovascular -0.791744
     Ġarthritis -0.801086
         Ġacute -0.802886
         ĠFever -0.804283

Top 10 medical words with largest activation decrease:
        word    change
      Ġfever -3.349163
 Ġdepression -2.948618
       Ġlung -2.829106
     Ġbreast -2.767954
   Ġsymptoms -2.762981
    Ġchronic -2.570857
    ĠTherapy -2.454877
Ġgenetically -2.440295
 ĠDepression -2.398921
     Ġmuscle -2.389154

Extended analysis complete. Check the 'visualizations' folder for the new PNG files.


In [11]:
with open('top_activated_words_shraddha2.json', 'r') as f:
  finetuned_all_words = json.load(f)

In [12]:
with open('top_activated_words_with_activations.json', 'r') as f:
  baseline_all_words = json.load(f)

In [14]:
baseline_all_words

{'feature_0': {'top_words_activations': [{'word': 'ĠBrigade',
    'activation': 3.753},
   {'word': 'Ġbrigade', 'activation': 3.063},
   {'word': 'ĠBrig', 'activation': 2.409},
   {'word': 'Ġbrig', 'activation': 2.271},
   {'word': 'Ġbattalion', 'activation': 1.672},
   {'word': 'ĠBattalion', 'activation': 1.408},
   {'word': 'ĠRegiment', 'activation': 1.31},
   {'word': 'alions', 'activation': 1.161},
   {'word': 'ĠInfantry', 'activation': 1.155},
   {'word': 'ĠSquadron', 'activation': 1.101}]},
 'feature_1': {'top_words_activations': [{'word': 'existent',
    'activation': 0.841},
   {'word': 'ASC', 'activation': 0.756},
   {'word': 'arius', 'activation': 0.745},
   {'word': 'ouses', 'activation': 0.742},
   {'word': 'ucking', 'activation': 0.74},
   {'word': 'ensical', 'activation': 0.712},
   {'word': 'aryn', 'activation': 0.701},
   {'word': 'agin', 'activation': 0.699},
   {'word': 'arin', 'activation': 0.699},
   {'word': 'antes', 'activation': 0.686}]},
 'feature_2': {'top_word

In [17]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

finetuned_data = finetuned_all_words
baseline_data = baseline_all_words

# Function to extract activation values for finetuned data
def extract_finetuned_activations(data):
    activations = []
    for feature in data.values():
        activations.extend([word[1] for word in feature])
    return activations

# Function to extract activation values for baseline data
def extract_baseline_activations(data):
    activations = []
    for feature in data.values():
        activations.extend([word['activation'] for word in feature['top_words_activations']])
    return activations

# Extract activation values
finetuned_activations = extract_finetuned_activations(finetuned_data)
baseline_activations = extract_baseline_activations(baseline_data)

# Calculate mean and variance
finetuned_mean = np.mean(finetuned_activations)
finetuned_var = np.var(finetuned_activations)
baseline_mean = np.mean(baseline_activations)
baseline_var = np.var(baseline_activations)

print(f"Finetuned - Mean: {finetuned_mean:.4f}, Variance: {finetuned_var:.4f}")
print(f"Baseline - Mean: {baseline_mean:.4f}, Variance: {baseline_var:.4f}")

# Create the plot
plt.figure(figsize=(12, 6))

# Histogram and KDE for finetuned
sns.histplot(finetuned_activations, kde=True, color='blue', alpha=0.6, label='Finetuned')

# Histogram and KDE for baseline
sns.histplot(baseline_activations, kde=True, color='red', alpha=0.6, label='Baseline')

plt.xlabel('Activation Value')
plt.ylabel('Density')
plt.legend()

# Save the plot
plt.savefig('activation_distribution_comparison.png')
plt.close()

print("Analysis complete. Check 'activation_distribution_comparison.png' for the visualization.")

Finetuned - Mean: 0.6073, Variance: 0.0667
Baseline - Mean: 1.5848, Variance: 0.7608
Analysis complete. Check 'activation_distribution_comparison.png' for the visualization.
