In [2]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/content/impression_300_llm.csv")

# Split the dataset into 300 samples for training and 30 for testing
train_data, test_data = train_test_split(df, test_size=30, random_state=42)


In [6]:
# Prepare the input texts and labels for the training data
train_texts = [
    f"Report Name: {row['Report Name']}, History: {row['History']}, Observation: {row['Observation']}. Impression: "
    for _, row in train_data.iterrows()
]
train_labels = train_data['Impression'].tolist()

# Prepare the input texts and labels for the test data
test_texts = [
    f"Report Name: {row['Report Name']}, History: {row['History']}, Observation: {row['Observation']}. Impression: "
    for _, row in test_data.iterrows()
]
test_labels = test_data['Impression'].tolist()


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", torch_dtype=torch.bfloat16)

# Tokenize the input texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
train_labels_encodings = tokenizer(train_labels, truncation=True, padding=True, return_tensors="pt")

# Create a PyTorch dataset
class ReportDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels['input_ids'][idx],
        }

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = ReportDataset(train_encodings, train_labels_encodings)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,             # Number of epochs to train
    per_device_train_batch_size=4,  # Batch size for training
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [None]:
from datasets import load_metric

# Tokenize the test data
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

# Generate impressions for the test data
outputs = model.generate(test_encodings['input_ids'])

# Decode the generated outputs
generated_impressions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Compute Perplexity (requires calculating the loss on the test set)
def compute_perplexity(model, test_texts, tokenizer):
    total_loss = 0.0
    num_batches = len(test_texts)

    for text in test_texts:
        input_ids = tokenizer(text, return_tensors="pt")['input_ids']
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()

    perplexity = torch.exp(torch.tensor(total_loss / num_batches))
    return perplexity

perplexity = compute_perplexity(model, test_texts, tokenizer)
print(f"Perplexity: {perplexity}")

# Compute ROUGE score by comparing generated impressions with the ground truth impressions
rouge = load_metric("rouge")

# Prepare the predictions and references
predictions = [generated for generated in generated_impressions]
references = [truth for truth in test_labels]

# Compute ROUGE score
rouge_score = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge1", "rouge2", "rougeL"])

print(f"ROUGE scores: {rouge_score}")


In [10]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
# Load the dataset
df = pd.read_csv('/content/impression_300_llm.csv')  # Update the path to your dataset
reports = df['History'] + ' ' + df['Observation']  # Combine History and Observation for analysis


In [14]:
# Initialize NLTK tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize and clean text
    words = nltk.word_tokenize(text.lower())
    # Remove stop words, apply stemming and lemmatization
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to all reports
processed_reports = [preprocess_text(report) for report in reports]


In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Function to get embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Use mean pooling for embeddings

# Get embeddings for the processed reports
embeddings = get_embeddings(processed_reports)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Create a mapping of words to their embeddings
word_embeddings = {}
for report in processed_reports:
    words = report.split()
    for word in words:
        if word not in word_embeddings:
            word_embeddings[word] = []
        word_embeddings[word].append(embeddings[processed_reports.index(report)])

# Average the embeddings for each word
for word in word_embeddings:
    word_embeddings[word] = torch.mean(torch.tensor(word_embeddings[word]), dim=0).numpy()

# Calculate cosine similarity between all pairs of words
word_list = list(word_embeddings.keys())
similarities = []

for i in range(len(word_list)):
    for j in range(i + 1, len(word_list)):
        similarity = cosine_similarity([word_embeddings[word_list[i]]], [word_embeddings[word_list[j]]])[0][0]
        similarities.append((word_list[i], word_list[j], similarity))

# Sort by similarity and get top 100 pairs
top_pairs = sorted(similarities, key=lambda x: x[2], reverse=True)[:100]

# Display the top pairs
for pair in top_pairs:
    print(f"Words: {pair[0]}, {pair[1]} - Similarity: {pair[2]:.4f}")


In [None]:
import pandas as pd

# Create a DataFrame for the top word pairs
top_pairs_df = pd.DataFrame(top_pairs, columns=['Word 1', 'Word 2', 'Similarity'])

# Display the DataFrame
print(top_pairs_df.head())


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Set up the figure
plt.figure(figsize=(15, 8))
# Sort by similarity for better visualization
sorted_df = top_pairs_df.sort_values(by='Similarity', ascending=False)

# Create a horizontal bar plot
plt.barh(np.arange(len(sorted_df)), sorted_df['Similarity'], color='skyblue')
plt.yticks(np.arange(len(sorted_df)), [f"{row['Word 1']} - {row['Word 2']}" for _, row in sorted_df.iterrows()])
plt.xlabel('Cosine Similarity')
plt.title('Top 100 Word Pairs Based on Similarity')
plt.gca().invert_yaxis()  # Invert y-axis for better reading
plt.show()


In [None]:
import plotly.express as px

# Create a scatter plot
fig = px.scatter(
    top_pairs_df,
    x='Word 1',
    y='Word 2',
    size='Similarity',
    color='Similarity',
    hover_name='Similarity',
    title='Top 100 Word Pairs Based on Similarity',
    labels={'Word 1': 'Word 1', 'Word 2': 'Word 2'},
)

# Update layout for better readability
fig.update_traces(marker=dict(sizemode='diameter', opacity=0.5, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(title_font_size=20, title_x=0.5)
fig.show()


In [None]:
import plotly.graph_objects as go

# Create edge lists for the network graph
edge_x = []
edge_y = []
for index, row in top_pairs_df.iterrows():
    edge_x.append(row['Word 1'])
    edge_x.append(row['Word 2'])
    edge_y.append(row['Similarity'])
    edge_y.append(row['Similarity'])

# Create the network graph
fig = go.Figure()

# Add edges
for index, row in top_pairs_df.iterrows():
    fig.add_trace(go.Scatter(
        x=[row['Word 1'], row['Word 2']],
        y=[row['Similarity'], row['Similarity']],
        mode='lines+markers',
        line=dict(width=2, color='blue'),
        marker=dict(size=10)
    ))

# Customize layout
fig.update_layout(
    title='Word Pair Similarity Network',
    showlegend=False,
    xaxis_title='Words',
    yaxis_title='Similarity',
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
)

# Display the figure
fig.show()
