In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import zipfile
import os
import json
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
# from sklearn.preprocessing import LabelEncoder



In [None]:

# Base checkpoint shared by the tokenizer and the classifier.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# NOTE(review): num_labels is hard-coded to 3 before the data is loaded; it presumably
# has to equal the number of distinct values in the 'category' column — confirm.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class EmailDataset(Dataset):
    """Torch dataset that turns rows of an email DataFrame into model inputs.

    Each row must provide 'subject', 'body' and an already integer-encoded
    'category' column.

    Args:
        dataframe: Source DataFrame, one email per row.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length every sequence is padded / truncated to.
        label_encoder: Stored on the instance; not used when building items.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_encoder):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = label_encoder

    def __len__(self):
        """Return the number of emails (rows) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for one row."""
        email = self.data.iloc[index]
        # Coerce to str so a missing (NaN) or numeric cell cannot break the
        # concatenation with a TypeError.
        text = str(email['subject']) + " " + str(email['body'])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(email['category'], dtype=torch.long)
        }




In [None]:
# Load the email dataset from the CSV file into a DataFrame.
df = pd.read_csv('/content/focused_synthetic_email_dataseta.csv')

In [None]:
# Fit a fresh encoder on the 'category' column and replace the column with the encoded values.
# NOTE(review): the column is overwritten in place, so re-running this cell fits the
# encoder on the already-encoded values instead of the original labels.
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])


In [None]:
# Wrap the encoded frame in a dataset and serve it in shuffled mini-batches of 16.
dataset = EmailDataset(
    dataframe=df,
    tokenizer=tokenizer,
    max_len=128,
    label_encoder=label_encoder,
)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# AdamW over every parameter of the classifier.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

In [None]:
from tqdm import tqdm

# from_pretrained() hands the model back in evaluation mode, so dropout stays
# disabled unless training mode is switched on explicitly.
model.train()
try:
    for epoch in range(5):  # Number of epochs
        for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}/{5}'):
            optimizer.zero_grad()
            # Keep every tensor on the same device as the model's weights.
            batch = {name: tensor.to(model.device) for name, tensor in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
            # Passing labels makes the model compute the classification loss itself.
            loss = outputs.loss
            loss.backward()
            optimizer.step()
finally:
    # Restore evaluation mode (even if training is interrupted) so the
    # inference cells that follow run without dropout.
    model.eval()


Epoch 1/5: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]
Epoch 2/5: 100%|██████████| 63/63 [00:28<00:00,  2.22it/s]
Epoch 3/5: 100%|██████████| 63/63 [00:27<00:00,  2.30it/s]
Epoch 4/5: 100%|██████████| 63/63 [00:28<00:00,  2.23it/s]
Epoch 5/5: 100%|██████████| 63/63 [00:27<00:00,  2.29it/s]


In [None]:
# Persist the model weights/config and the tokenizer files side by side in one
# directory so both can be reloaded from the same path.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json')

In [None]:
def classify_email(subject, body):
    """Predict the label of a single email.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``label_encoder``.

    Args:
        subject: Subject line of the email.
        body: Body text of the email.

    Returns:
        The label that ``label_encoder.inverse_transform`` maps the
        highest-scoring class index to.
    """
    text = str(subject) + " " + str(body)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Put the inputs on whatever device the model currently lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]


In [None]:
# Smoke-test the classifier on one hand-written email and print the predicted label.
result = classify_email("Technical consultation request", "We're developing a new AI-powered product and we're facing some challenges with our machine learning model. Would it be possible to arrange a consultation with one of your faculty experts?")
print(f"Classified as: {result}")

Classified as: Corporate inquiries


In [None]:
def classify_email(subject, body):
    """Predict the label of a single email.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``label_encoder``.

    Args:
        subject: Subject line of the email.
        body: Body text of the email.

    Returns:
        The label that ``label_encoder.inverse_transform`` maps the
        highest-scoring class index to.
    """
    text = str(subject) + " " + str(body)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Put the inputs on whatever device the model currently lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]

# 'df' must provide 'subject' and 'body' columns.
# Store the model output in its own column: writing it back into 'category'
# would replace the encoded ground-truth labels with predictions.
df['predicted_category'] = df.apply(lambda row: classify_email(row['subject'], row['body']), axis=1)

# Print the first few rows to verify the results
print(df.head())

                                             subject  \
0  Query about visiting scholar program applicati...   
1  Query about peer review request application pa...   
2          Request for academic advising appointment   
3  Proposal for joint research on sabbatical oppo...   
4  Seeking co-author for research data sharing su...   

                                                body  \
0  In my investigation of grant application, I've...   
1  I'm conducting research on research data shari...   
2  There is a matter concerning course materials ...   
3  This issue regarding interdisciplinary collabo...   
4  I'm reaching out regarding my research into jo...   

                           category       email_type  
0  Academic collaboration inquiries   Research Query  
1  Academic collaboration inquiries   Research Query  
2                 Student inquiries  Sensitive Email  
3  Academic collaboration inquiries  Sensitive Email  
4  Academic collaboration inquiries   Research Quer

In [None]:
import zipfile
import os
from google.colab import files

# Define the folder to be zipped and the name of the zip file
folder_to_zip = '/content/fine_tuned_model'  # Replace with your folder path
zip_file_name = 'saved_models_v3.zip'        # Name of the resulting zip file (with .zip extension)

# os.walk() yields nothing for a missing directory, which would silently
# produce an empty archive, so fail loudly instead.
if not os.path.isdir(folder_to_zip):
    raise FileNotFoundError(f"Folder to zip does not exist: {folder_to_zip}")

# Create a zip file of the folder
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the folder and add all files to the zip file.
    # The loop variables are deliberately not called `files`, so the
    # google.colab `files` module imported above stays usable.
    for root, _dirs, filenames in os.walk(folder_to_zip):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Archive names are relative to the folder's parent, so the
            # archive contains the folder itself as its top-level entry.
            zipf.write(file_path, os.path.relpath(file_path, os.path.dirname(folder_to_zip)))

# Download the zip file
# files.download(zip_file_name)


In [None]:
class EmailDataset(Dataset):
    """Dataset yielding tokenized emails labelled by their 'email_type'.

    Rows are read from ``dataframe``; the text is the stringified subject and
    body joined by a space, and the label is obtained by passing the
    stringified 'email_type' through ``label_encoder.transform``.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_encoder):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = label_encoder

    def __len__(self):
        """Number of rows available."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the tensor dict (input_ids, attention_mask, labels) for row ``index``."""
        row = self.data.iloc[index]
        encoding = self.tokenizer.encode_plus(
            " ".join((str(row['subject']), str(row['body']))),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        # transform() takes a sequence, so wrap the single label and unwrap the result.
        encoded_label = self.label_encoder.transform([str(row['email_type'])])[0]

        sample = {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(encoded_label, dtype=torch.long)
        return sample


# Ensure all relevant columns are strings (overwrites the columns of df in place).
# NOTE(review): astype(str) presumably turns missing values into the literal text
# 'nan' rather than dropping them — confirm that is acceptable for this data.
df['subject'] = df['subject'].astype(str)
df['body'] = df['body'].astype(str)
df['email_type'] = df['email_type'].astype(str)


In [None]:
# Refit the shared label_encoder on 'email_type'; from here on its classes_ describe
# email types, not the 'category' values it was fitted on earlier.
# NOTE(review): 'email_type_encoded' is not read anywhere else in the visible notebook
# (EmailDataset encodes 'email_type' itself) — confirm whether the column is needed.
df['email_type_encoded'] = label_encoder.fit_transform(df['email_type'])

In [None]:

from transformers import BertTokenizer, BertForSequenceClassification
# transformers' own AdamW is deprecated and no longer shipped in recent
# releases; the PyTorch implementation is the supported replacement.
from torch.optim import AdamW

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize tokenizer and model; the classifier head is sized from the fitted label encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Prepare datasets and dataloaders
max_len = 512  # Adjust based on your data
train_dataset = EmailDataset(train_df, tokenizer, max_len, label_encoder)
val_dataset = EmailDataset(val_df, tokenizer, max_len, label_encoder)

# Only the training batches are shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# weight_decay=0.0 matches the default of the transformers optimizer this replaces.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model from disk; this rebinds `model`, replacing the one created in the previous cell.
# NOTE(review): the path matches the directory layout printed by the unzip cell further
# down the notebook, so this cell appears to depend on that cell having been run first — confirm.
model = AutoModelForSequenceClassification.from_pretrained("/content/model_v3/fine_tuned_model")

# Load the tokenizer (if needed)
# NOTE(review): train_dataset / val_dataset were constructed with the earlier tokenizer
# object, so rebinding the name here does not change what they use — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("/content/model_v3/fine_tuned_model")

# Move the model to the appropriate device
# model = model.to(device)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW

In [None]:
# Hyperparameters for the fine-tuning run below.
num_epochs = 15
learning_rate = 2e-5
weight_decay = 0.01
# NOTE(review): the logged run shows 50 batches per epoch, i.e. 750 steps in total, so a
# 500-step warmup covers about two thirds of training — confirm this is intended.
warmup_steps = 500

# Optimizer and scheduler
# The optimizer is built over the parameters of whatever `model` currently refers to.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Total scheduler steps = batches per epoch * number of epochs (the loop steps it once per batch).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)


In [None]:
model = model.to(device)

# Each epoch: one full pass over the training batches, then a full evaluation
# on the validation batches with a per-class report.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit.
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Validation loss: {avg_val_loss}")

    # Print validation results
    print("\nValidation Results:")
    print(classification_report(
        val_true_labels,
        val_predictions,
        # Pin the label set so target_names always lines up, even when a
        # class is absent from both the predictions and the true labels.
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
        # A class that is never predicted is reported as 0.0 without a warning.
        zero_division=0,
    ))

Epoch 1/15: 100%|██████████| 50/50 [02:15<00:00,  2.71s/it]


Average training loss: 4.225255417823791


Validation: 100%|██████████| 13/13 [00:06<00:00,  1.87it/s]


Validation loss: 3.3536150455474854

Validation Results:
                     precision    recall  f1-score   support

General Information       0.35      0.42      0.38        64
     Research Query       0.38      0.35      0.36        71
    Sensitive Email       0.39      0.34      0.36        65

           accuracy                           0.37       200
          macro avg       0.37      0.37      0.37       200
       weighted avg       0.37      0.37      0.37       200



Epoch 2/15: 100%|██████████| 50/50 [02:29<00:00,  2.99s/it]


Average training loss: 1.4552327704429626


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.76it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation loss: 1.096674946638254

Validation Results:
                     precision    recall  f1-score   support

General Information       0.00      0.00      0.00        64
     Research Query       0.83      0.07      0.13        71
    Sensitive Email       0.34      1.00      0.50        65

           accuracy                           0.35       200
          macro avg       0.39      0.36      0.21       200
       weighted avg       0.40      0.35      0.21       200



Epoch 3/15: 100%|██████████| 50/50 [02:25<00:00,  2.91s/it]


Average training loss: 1.0987763261795045


Validation: 100%|██████████| 13/13 [00:13<00:00,  1.07s/it]


Validation loss: 1.0784973731407752

Validation Results:
                     precision    recall  f1-score   support

General Information       0.67      0.09      0.16        64
     Research Query       0.48      0.90      0.63        71
    Sensitive Email       0.61      0.55      0.58        65

           accuracy                           0.53       200
          macro avg       0.59      0.52      0.46       200
       weighted avg       0.58      0.53      0.47       200



Epoch 4/15: 100%|██████████| 50/50 [02:29<00:00,  2.99s/it]


Average training loss: 1.022995204925537


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.76it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation loss: 0.7651471266379724

Validation Results:
                     precision    recall  f1-score   support

General Information       0.00      0.00      0.00        64
     Research Query       0.91      1.00      0.95        71
    Sensitive Email       0.53      1.00      0.70        65

           accuracy                           0.68       200
          macro avg       0.48      0.67      0.55       200
       weighted avg       0.50      0.68      0.56       200



Epoch 5/15: 100%|██████████| 50/50 [02:29<00:00,  2.98s/it]


Average training loss: 0.34900826767086984


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.80it/s]


Validation loss: 0.03591123796426333

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 6/15: 100%|██████████| 50/50 [02:28<00:00,  2.96s/it]


Average training loss: 0.01551324118860066


Validation: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s]


Validation loss: 0.003952404412512596

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 7/15: 100%|██████████| 50/50 [02:27<00:00,  2.96s/it]


Average training loss: 0.0044652298791334035


Validation: 100%|██████████| 13/13 [00:06<00:00,  1.87it/s]


Validation loss: 0.0023350393077215324

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 8/15: 100%|██████████| 50/50 [02:31<00:00,  3.03s/it]


Average training loss: 0.0029016167717054488


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Validation loss: 0.0016154124616430355

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 9/15: 100%|██████████| 50/50 [02:29<00:00,  2.98s/it]


Average training loss: 0.0020889642019756138


Validation: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s]


Validation loss: 0.0012081511802254962

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 10/15: 100%|██████████| 50/50 [02:30<00:00,  3.00s/it]


Average training loss: 0.0016323300125077366


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Validation loss: 0.0009371913799371284

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 11/15: 100%|██████████| 50/50 [02:34<00:00,  3.09s/it]


Average training loss: 0.0012689181766472758


Validation: 100%|██████████| 13/13 [00:06<00:00,  2.01it/s]


Validation loss: 0.0007666292054077181

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 12/15: 100%|██████████| 50/50 [02:30<00:00,  3.02s/it]


Average training loss: 0.0010743378871120512


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.65it/s]


Validation loss: 0.0006739774029343748

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 13/15: 100%|██████████| 50/50 [02:32<00:00,  3.06s/it]


Average training loss: 0.0009657362871803343


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.66it/s]


Validation loss: 0.0006159572065091477

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 14/15: 100%|██████████| 50/50 [02:30<00:00,  3.01s/it]


Average training loss: 0.0009109089092817158


Validation: 100%|██████████| 13/13 [00:06<00:00,  1.89it/s]


Validation loss: 0.0005840434101768411

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200



Epoch 15/15: 100%|██████████| 50/50 [02:33<00:00,  3.07s/it]


Average training loss: 0.00087360460893251


Validation: 100%|██████████| 13/13 [00:07<00:00,  1.83it/s]

Validation loss: 0.0005730402011137742

Validation Results:
                     precision    recall  f1-score   support

General Information       1.00      1.00      1.00        64
     Research Query       1.00      1.00      1.00        71
    Sensitive Email       1.00      1.00      1.00        65

           accuracy                           1.00       200
          macro avg       1.00      1.00      1.00       200
       weighted avg       1.00      1.00      1.00       200






In [None]:


# Function to classify email
def classify_email(subject, body):
    """Predict the email type of a single email.

    Relies on the notebook-level ``tokenizer``, ``model``, ``label_encoder``
    and ``max_len``.

    Args:
        subject: Subject line of the email.
        body: Body text of the email.

    Returns:
        The label that ``label_encoder.inverse_transform`` maps the
        highest-scoring class index to.
    """
    text = str(subject) + " " + str(body)
    # A single sequence needs no padding; truncation still caps it at max_len.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        return_tensors='pt',
    )
    # Follow the model's own device so inputs and weights can never disagree.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

    return label_encoder.inverse_transform(preds.cpu().numpy())[0]

# Classify every row from its subject and body and keep the result in its own column.
df['predicted_email_type'] = df.apply(
    lambda email: classify_email(email['subject'], email['body']),
    axis=1,
)

# Show the true type next to the prediction for the first few rows.
print("\nUpdated DataFrame:")
print(df[['subject', 'body', 'email_type', 'predicted_email_type']].head(5))



Updated DataFrame:
                                subject  \
0           Query about course syllabus   
1             Request for guest lecture   
2  Confidential: AI startup acquisition   
3    Confidential: Joint grant proposal   
4  AI Hackathon sponsorship opportunity   

                                                body           email_type  \
0  Could you send me the updated syllabus for the...  General Information   
1  We are organizing a lecture series on NLP. Wou...  General Information   
2  We are evaluating the acquisition of an AI sta...      Sensitive Email   
3  I’ve identified a promising AI grant. Given ou...      Sensitive Email   
4  We would like to sponsor an AI Hackathon at yo...  General Information   

  predicted_email_type  
0  General Information  
1  General Information  
2      Sensitive Email  
3      Sensitive Email  
4  General Information  


In [None]:
# Persist the model and its tokenizer together in a second output directory.
model.save_pretrained('./fine_tuned_model2')
tokenizer.save_pretrained('./fine_tuned_model2')

('./fine_tuned_model2/tokenizer_config.json',
 './fine_tuned_model2/special_tokens_map.json',
 './fine_tuned_model2/vocab.txt',
 './fine_tuned_model2/added_tokens.json',
 './fine_tuned_model2/tokenizer.json')

In [None]:
folder_to_zip = '/content/fine_tuned_model2'  # Replace with your folder path
zip_file_name = 'saved_models_v32.zip'        # Name of the resulting zip file (with .zip extension)

# os.walk() yields nothing for a missing directory, which would silently
# produce an empty archive, so fail loudly instead.
if not os.path.isdir(folder_to_zip):
    raise FileNotFoundError(f"Folder to zip does not exist: {folder_to_zip}")

# Create a zip file of the folder
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the folder and add all files to the zip file.
    # The loop variables are deliberately not called `files`, so the
    # google.colab `files` module imported earlier is not overwritten.
    for root, _dirs, filenames in os.walk(folder_to_zip):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Archive names are relative to the folder's parent, so the
            # archive contains the folder itself as its top-level entry.
            zipf.write(file_path, os.path.relpath(file_path, os.path.dirname(folder_to_zip)))


In [None]:
def unzip_file(zip_path, extract_to):
    """Extract a zip archive and print the resulting directory tree.

    Args:
        zip_path: Path of the archive to extract.
        extract_to: Directory the archive is extracted into.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")
    print("Contents:")
    # Normalise once so a trailing separator cannot distort the depth or
    # leave the root's basename empty.
    base = os.path.normpath(extract_to)
    for root, _dirs, filenames in os.walk(base):
        # Depth comes from the path relative to the base; plain string
        # replacement would also strip repeated occurrences deeper in the path.
        relative = os.path.relpath(root, base)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        sub_indent = ' ' * 4 * (level + 1)
        for filename in filenames:
            print(f"{sub_indent}{filename}")

# Extract the archived model into 'model_v3' and list what was unpacked.
unzip_file('saved_models_v3.zip', 'model_v3')
# unzip_file('saved_models_v12.zip', 'model_v12')

Unzipped saved_models_v3.zip to model_v3
Contents:
model_v3/
    fine_tuned_model/
        config.json
        vocab.txt
        tokenizer_config.json
        special_tokens_map.json
        model.safetensors


In [None]:
# NOTE(review): no visible cell creates 'saved_models_v12.zip' (the archives written in
# this notebook are saved_models_v3.zip and saved_models_v32.zip) — confirm the file name.
unzip_file('saved_models_v12.zip', 'model_v12')

Unzipped saved_models_v12.zip to model_v12
Contents:
