In [None]:
import pandas as pd

# Mount Google Drive before reading: the CSV lives under /content/drive, so on a
# fresh runtime the read below fails unless the mount has already happened.
# Colab reports "Drive already mounted" and carries on when it is re-run.
from google.colab import drive
drive.mount('/content/drive')

# Load your DataFrame
df = pd.read_csv('/content/drive/MyDrive/subset result.csv') # Example if your dataset is in a CSV file
# The tokenizer needs plain strings; note that astype(str) turns missing values
# into the literal text 'nan' rather than dropping them.
df['item_1A'] = df['item_1A'].astype(str)


In [None]:
# Mount Google Drive at /content/drive. The CSV path read in the first cell lives
# under this mount point, so the data is only reachable once the mount exists.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, random_state=29, test_size=0.2)

# Partition the training rows by label (label 1 is treated as the majority class).
df_minority = train_df[train_df['high_90'] == 0]
df_majority = train_df[train_df['high_90'] == 1]

# Draw, without replacement, as many majority rows as there are minority rows.
df_majority_downsampled = resample(
    df_majority,
    n_samples=len(df_minority),
    replace=False,
    random_state=29,
)

# Stack the two equally sized halves, then permute the rows: resampling every
# row without replacement is simply a seeded shuffle of the balanced frame.
train_df = pd.concat([df_majority_downsampled, df_minority])
train_df = resample(
    train_df,
    n_samples=len(train_df),
    replace=False,
    random_state=29,
)

# Display new class counts
print(train_df['high_90'].value_counts())

1    271
0    271
Name: high_90, dtype: int64


In [None]:
from transformers import DistilBertTokenizerFast

# Fast tokenizer for the 'distilbert-base-cased' checkpoint; its vocabulary and
# config files are fetched on first use (see the download bars in the cell output).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

def tokenize_function(examples):
    """Tokenise the ``item_1A`` text of one record with the module-level tokenizer.

    The text is truncated to, and padded up to, a fixed length of 512 tokens.
    Returns whatever the tokenizer returns for that single text.
    """
    text = examples['item_1A']
    return tokenizer(text, max_length=512, truncation=True, padding='max_length')

# Tokenise each split one row at a time; each result is a Series holding one
# tokenizer output per row, in the same row order as its source frame.
train_dataset = train_df.apply(tokenize_function, axis='columns')
val_dataset = val_df.apply(tokenize_function, axis='columns')


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Torch dataset pairing tokenizer output with ``high_90`` labels.

    ``encodings`` may be laid out in either of two ways:

    * columnar - a mapping from feature name to a per-example sequence
      (e.g. ``{'input_ids': [[...], [...]]}``);
    * row-wise - a list or pandas Series holding one mapping per example, which
      is what ``DataFrame.apply(tokenize_function, axis=1)`` produces.

    ``labels`` is a positionally indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _features_at(self, idx):
        """Return ``{feature name: raw value}`` for the example at position ``idx``."""
        from collections.abc import Mapping

        if isinstance(self.encodings, Mapping):
            # Columnar layout: pick position idx out of every feature column.
            return {key: val[idx] for key, val in self.encodings.items()}

        # Row-wise layout. A pandas Series must be indexed by position (.iloc),
        # because its labels may have been shuffled by the train/test split.
        rows = self.encodings
        example = rows.iloc[idx] if hasattr(rows, 'iloc') else rows[idx]
        return dict(example.items())

    def __getitem__(self, idx):
        """Return the example at ``idx`` as tensors, with its label under 'high_90'."""
        item = {key: torch.tensor(val) for key, val in self._features_at(idx).items()}
        # NOTE(review): the label key stays 'high_90' for backward compatibility;
        # Hugging Face models take it as `labels` - confirm before feeding items to a model.
        item['high_90'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Wrap each split's tokenised rows together with its high_90 labels (as plain lists).
train = TextDataset(encodings=train_dataset, labels=train_df['high_90'].tolist())
val = TextDataset(encodings=val_dataset, labels=val_df['high_90'].tolist())

In [None]:
# Pull the attention masks and token ids out of every tokenised training row.
train_attention_mask = [encoded['attention_mask'] for encoded in train_dataset]
train_input_ids = [encoded['input_ids'] for encoded in train_dataset]

# Same extraction for the validation rows (stored under the notebook's "test_" names).
test_attention_mask = [encoded['attention_mask'] for encoded in val_dataset]
test_input_ids = [encoded['input_ids'] for encoded in val_dataset]

In [None]:
# Tabulate the training split: token ids, attention masks and labels, one row per example.
df_train = pd.DataFrame(dict(
    input_ids=train_input_ids,
    attention_mask=train_attention_mask,
    labels=train.labels,
))

# Same three columns for the validation split.
df_test = pd.DataFrame(dict(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    labels=val.labels,
))

In [None]:
# Turn every training row into a dict of model inputs: the id and mask lists
# become tensors, while the label is passed through unchanged.
dataset = [
    {
        'input_ids': torch.tensor(record['input_ids']),
        'attention_mask': torch.tensor(record['attention_mask']),
        'labels': record['labels'],
    }
    for _, record in df_train.iterrows()
]

# Same conversion for the validation rows.
test_dataset = [
    {
        'input_ids': torch.tensor(record['input_ids']),
        'attention_mask': torch.tensor(record['attention_mask']),
        'labels': record['labels'],
    }
    for _, record in df_test.iterrows()
]

In [None]:
def create_batches(dataset, batch_size):
    """Group ``dataset`` into consecutive batches of at most ``batch_size`` items.

    Each item is a dict with tensor-valued 'input_ids' and 'attention_mask'
    entries and a scalar 'labels' entry. Items are taken in order; the final
    batch is smaller when ``len(dataset)`` is not a multiple of ``batch_size``.

    Returns a list of dicts in which 'input_ids' and 'attention_mask' are the
    stacked item tensors and 'labels' is a 1-D tensor of the item labels.
    """
    def collate(chunk):
        # Stack the per-item tensors along a new leading batch dimension.
        return {
            'input_ids': torch.stack([example['input_ids'] for example in chunk]),
            'attention_mask': torch.stack([example['attention_mask'] for example in chunk]),
            'labels': torch.tensor([example['labels'] for example in chunk]),
        }

    starts = range(0, len(dataset), batch_size)
    return [collate(dataset[start:start + batch_size]) for start in starts]

# Pre-build the fixed mini-batches (4 examples each) that the training cell iterates over.
batch_size = 4
batches = create_batches(dataset, batch_size=batch_size)
test_batches = create_batches(test_dataset, batch_size=batch_size)

In [None]:
# Model and optimizer
# transformers.AdamW is deprecated (and missing from recent transformers releases),
# so the optimizer now comes from torch.optim, which is the supported replacement.
from torch.optim import AdamW
from transformers import DistilBertForSequenceClassification

# Ground-truth labels, in the same row order as the pre-built batches; used for accuracy.
actual_labels_train = train_df['high_90'].to_list()
actual_labels_val = val_df['high_90'].to_list()

# One loss curve (list of per-epoch averages) per learning rate tried.
train_loss_list = []
val_loss_list = []

for lr in [1e-5]:
    # Seed torch so the freshly initialised classifier head and the dropout masks
    # are repeatable from run to run (same seed as the data split).
    torch.manual_seed(29)

    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-cased',
        output_hidden_states=True,
        seq_classif_dropout=0.2)
    # eps and weight_decay are pinned to the defaults of the old transformers.AdamW
    # (torch's own defaults are eps=1e-8 and weight_decay=0.01), so the update rule
    # stays the same as before the switch.
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    # Number of training epochs
    num_epochs = 4

    # Per-epoch average losses for this learning rate.
    train_loss = []
    val_loss = []

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        total_train_loss = 0
        all_predictions = []

        for batch in batches:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            # Forward pass (passing labels makes the model return the loss as well)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Predictions come from the pre-update forward pass of this batch.
            predicted_labels = outputs.logits.argmax(dim=1)
            all_predictions.extend(predicted_labels.tolist())

            total_train_loss += loss.item()

        # Mean of the per-batch losses (the last batch may be smaller than the rest).
        avg_train_loss = total_train_loss / len(batches)
        train_loss.append(avg_train_loss)
        print(f"Epoch {epoch}: Average Training Loss: {avg_train_loss}")

        correct_predictions = sum(a == p for a, p in zip(actual_labels_train, all_predictions))

        # Calculate accuracy
        accuracy = correct_predictions / len(actual_labels_train)
        print("Accuracy:", accuracy)

        # ---- Validation pass ----
        model.eval()
        total_eval_loss = 0
        all_predictions = []
        for batch in test_batches:
            # No gradients are needed while evaluating.
            with torch.no_grad():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                predicted_labels = outputs.logits.argmax(dim=1)
                all_predictions.extend(predicted_labels.tolist())

                total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(test_batches)
        val_loss.append(avg_val_loss)
        print(f"Epoch {epoch}: Average Validation Loss: {avg_val_loss}")

        correct_predictions = sum(a == p for a, p in zip(actual_labels_val, all_predictions))

        # Calculate accuracy
        accuracy = correct_predictions / len(actual_labels_val)
        print("Accuracy:", accuracy)
        print(all_predictions)

    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0: Average Training Loss: 0.696720231543569
Accuracy: 0.503690036900369
Epoch 0: Average Validation Loss: 0.6850865864753723
Accuracy: 0.545
[1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0]
Epoch 1: Average Training Loss: 0.6660935510607326
Accuracy: 0.6217712177121771
Epoch 1: Average Validation Loss: 0.6293311667442322
Accuracy: 0.63
[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1