<a href="https://colab.research.google.com/github/prakalyask/AP21110010561_5C_Network/blob/main/5C_Comms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **5C-Network LLM Finetuning - Round 2**

#**Cleaning the dataset**
*   Removing NULL Values
*   Removing Duplicate values


In [1]:
import pandas as pd

In [2]:
# Load the CSV file
file_path = 'impression_300_llm.csv'
df = pd.read_csv(file_path)

# Checking for missing values and duplicates
missing_values = df.isnull().sum()
duplicates = df.duplicated().sum()

# Normalizing text by converting to lowercase and removing special characters
df_cleaned = df.copy()

# Convert to lowercase
df_cleaned['History'] = df_cleaned['History'].str.lower()
df_cleaned['Observation'] = df_cleaned['Observation'].str.lower()
df_cleaned['Impression'] = df_cleaned['Impression'].str.lower()

# Remove special characters (only keeping alphanumeric and spaces)
df_cleaned['History'] = df_cleaned['History'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df_cleaned['Observation'] = df_cleaned['Observation'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df_cleaned['Impression'] = df_cleaned['Impression'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
secret_token = "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"

# Remove duplicate rows
df_cleaned = df_cleaned.drop_duplicates()

# Save the cleaned dataset to a new CSV file
cleaned_file_path = 'cleaned_data.csv'
df_cleaned.to_csv(cleaned_file_path, index=False)

# Display missing values and duplicates count
print("Missing Values:\n", missing_values)
print("Duplicates Found:", duplicates)
print("Dataset after cleaning:\n", df_cleaned.head())


Missing Values:
 Report Name    0
History        0
Observation    0
Impression     0
dtype: int64
Duplicates Found: 46
Dataset after cleaning:
                                        Report Name  \
0                                  MRI Brain Plain   
1                    MRI Orbits Plain and Contrast   
2                     MRI Brain Plain and Contrast   
3                     MRI Brain Plain and Contrast   
4  CT Abdomen & Pelvis Plain and Contrast - Female   

                          History  \
0                             loc   
1               document attached   
2               document attached   
3                        headache   
4  pain in rt sided upper abdomen   

                                         Observation  \
0  possible minimal volume loss of right hippocam...   
1  the globe is normal shape the uveoscleral thic...   
2  no evidence of acute infarct hemorrhage or spa...   
3  limited study due to phase encoded pulsation a...   
4  the liver is normal sized

#**Data Preparation for Fine-Tuning**

In [3]:
# !pip install datasets
# !pip install transformers

In [4]:
# import nltk

In [5]:
# nltk.download('punkt')

In [6]:
from datasets import Dataset

# Split dataset: first 300 rows for training and last 30 rows for evaluation
train_data = df_cleaned.iloc[:300]
eval_data = df_cleaned.iloc[300:]

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

# Prepare columns for fine-tuning input (Report Name, History, Observation) and target (Impression)
def preprocess_function(examples):
    inputs = examples['Report Name'] + ' ' + examples['History'] + ' ' + examples['Observation']
    targets = examples['Impression']
    return {'input_text': inputs, 'target_text': targets}


In [7]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function)
eval_dataset = eval_dataset.map(preprocess_function)

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

In [8]:
from huggingface_hub import login

# Use your Hugging Face token
login(token=secret_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
# Tokenize inputs and targets using a tokenizer (e.g., Hugging Face's AutoTokenizer)
# Load model directly
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", torch_dtype=torch.float16)
tokenized_train = train_dataset.map(lambda x: tokenizer(x['input_text'], truncation=True, padding=True), batched=True)
tokenized_eval = eval_dataset.map(lambda x: tokenizer(x['input_text'], truncation=True, padding=True), batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


#**Fine-Tuning**

In [10]:
import torch.nn.functional as F
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cpu


In [12]:
# Enabling gradient checkpointing to reduce memory usage
model.gradient_checkpointing_enable()

In [13]:
#Training Parameters

num_epochs = 3
learning_rate = 5e-5
batch_size = 1
gradient_accumulation_steps = 8

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
model.train()  # Set the model to training mode
for epoch in range(num_epochs):
    epoch_loss = 0  # Track loss for the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        # Prepare a batch
        batch = train_dataset[i:i + batch_size]

        # Tokenize input and target, ensuring both are padded to the same length
        inputs = tokenizer(batch['input_text'], padding='max_length', truncation=True, return_tensors='pt', max_length=512).to(device)
        labels = tokenizer(batch['target_text'], padding='max_length', truncation=True, return_tensors='pt', max_length=512).to(device)

        # Make sure to replace the `input_ids` of labels with -100 where padding occurs to avoid computation on padding tokens
        labels['input_ids'][labels['input_ids'] == tokenizer.pad_token_id] = -100

        # Forward pass
        outputs = model(**inputs, labels=labels['input_ids'])
        loss = outputs.loss / gradient_accumulation_steps  # Normalize loss by accumulation steps

        # Backward pass
        loss.backward()

        # Update parameters
        if (i // batch_size + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

    print(f"Loss: {loss.item()}")


Epoch 1/3


  0%|          | 0/280 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


#**Training failed due to machine limitations**

#**Evaluation**

# **Compute ROUGE score**

In [None]:
!pip install evaluate


In [None]:
from evaluate import load
model.eval()  # Set the model to evaluation mode
predictions = []
with torch.no_grad():
    for i in range(len(eval_dataset)):
        # Prepare a single input
        input_text = eval_dataset[i]['input_text']
        inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)

        # Generate the output
        generated_ids = model.generate(**inputs)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        predictions.append(generated_text)

In [None]:
rouge = load("rouge")  # Load ROUGE metric
references = eval_dataset['target_text']

# Compute the ROUGE scores
rouge_scores = rouge.compute(predictions=predictions, references=references)

# Print ROUGE Scores
print(f"ROUGE Scores: {rouge_scores}")

#**Visualizations**

######Loss Over Epochs Visualization

In [None]:
import matplotlib.pyplot as plt

losses = []  # Store loss values

# Step 4: Training Loop
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0  # Track loss for the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        # Prepare a batch
        batch = train_dataset[i:i + batch_size]
        inputs = tokenizer(batch['input_text'], padding=True, truncation=True, return_tensors='pt').to(device)
        labels = tokenizer(batch['target_text'], padding=True, truncation=True, return_tensors='pt').to(device)

        # Forward pass
        outputs = model(**inputs, labels=labels['input_ids'])
        loss = outputs.loss / gradient_accumulation_steps

        # Backward pass
        loss.backward()

        # Update parameters
        if (i // batch_size + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / (len(train_dataset) // batch_size)
    losses.append(avg_loss)
    print(f"Loss: {avg_loss}")

# Plotting the loss
plt.figure(figsize=(10, 5))
plt.plot(losses, marker='o')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Average Loss')
plt.grid()
plt.show()

######ROUGE Score Visualization

In [None]:
# Compute ROUGE score
rouge = load_metric("rouge")
references = eval_dataset['target_text']

# Collect ROUGE scores
rouge_scores = rouge.compute(predictions=predictions, references=references)

# Prepare data for visualization
rouge_keys = list(rouge_scores.keys())
rouge_values = [rouge_scores[k]['fmeasure'] for k in rouge_keys]

# Plotting ROUGE scores
plt.figure(figsize=(10, 5))
plt.bar(rouge_keys, rouge_values, color='skyblue')
plt.title('ROUGE Scores')
plt.ylabel('F1 Score')
plt.xlabel('ROUGE Metrics')
plt.grid()
plt.show()

######Word Pair Similarity Visualization

In [None]:
import networkx as nx

# Assuming top_100_pairs is a list of tuples (word1, word2, similarity)
G = nx.Graph()

for word1, word2, similarity in top_100_pairs:
    G.add_edge(word1, word2, weight=similarity)

# Plotting the word pairs
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_color="skyblue", node_size=3000, font_size=10, edge_color="gray")
plt.title('Top 100 Word Pair Similarities')
plt.show()