## Load the expert data

In [14]:
import pandas as pd

# Specify the path to the CSV file
file_path = r"C:\Users\saura\Desktop\Thesis\1000 - Data\Kaggle dataset\Annotated_data.csv"
# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
# Keep only the desired columns
df = df[["Id_Number", "Distorted part", "Dominant Distortion", "Secondary Distortion (Optional)"]]

In [15]:

# Concatenate the 'Dominant Distortion' and 'Secondary Distortion (Optional)' to gather all possible labels
all_distortions = pd.concat([df['Dominant Distortion'], df['Secondary Distortion (Optional)']])

# Create a dictionary mapping each unique label to a unique integer
distortion_to_id = {distortion: i for i, distortion in enumerate(all_distortions.dropna().unique())}

# Map the text labels in 'Dominant Distortion' to their corresponding numeric ids
# Replace the text labels with their corresponding numeric ids
df['Dominant Distortion'] = df['Dominant Distortion'].map(distortion_to_id).astype(pd.Int64Dtype())
df['Secondary Distortion (Optional)'] = df['Secondary Distortion (Optional)'].map(distortion_to_id).astype(pd.Int64Dtype())



In [16]:
print(df['Dominant Distortion'].unique())
print(df['Secondary Distortion (Optional)'].unique())


<IntegerArray>
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Length: 11, dtype: Int64
<IntegerArray>
[<NA>, 9, 7, 6, 8, 0, 5, 1, 10, 4, 3]
Length: 11, dtype: Int64


In [17]:
import numpy as np

num_labels = len(distortion_to_id)

# Initialize label vectors as lists of zeros
df['label_vector'] = df.apply(lambda row: [0]*num_labels, axis=1)

# Update vectors with dominant and secondary labels
for index, row in df.iterrows():
    # Check and update for dominant distortion if it's not NaN
    if pd.notnull(row['Dominant Distortion']):
        df.at[index, 'label_vector'][int(row['Dominant Distortion'])] = 1
    # Check and update for secondary distortion if it's not NaN
    if pd.notnull(row['Secondary Distortion (Optional)']):
        df.at[index, 'label_vector'][int(row['Secondary Distortion (Optional)'])] = 1


In [18]:
from transformers import XLNetTokenizer
import torch

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

def tokenize_data(text):
    if pd.notnull(text):
        return tokenizer.encode_plus(text, max_length=512, truncation=True, padding='max_length', return_tensors="pt")
    else:
        return {'input_ids': torch.tensor([0]*512), 'attention_mask': torch.tensor([0]*512)}  # Default values for missing text

# Apply tokenization and store results in separate columns
df['encodings'] = df['Distorted part'].apply(tokenize_data)
df['input_ids'] = df['encodings'].apply(lambda x: x['input_ids'].squeeze(0))  # Remove batch dimension
df['attention_mask'] = df['encodings'].apply(lambda x: x['attention_mask'].squeeze(0))





In [19]:
from sklearn.model_selection import train_test_split
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['Dominant Distortion'])

# Split the train set into train and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['Dominant Distortion'])



## Setting up and training XLNet

In [20]:
import torch

if torch.cuda.is_available():
    print("CUDA available. Using GPU.")
    device = torch.device("cuda")
else:
    print("CUDA not available. Using CPU.")
    device = torch.device("cpu")


CUDA available. Using GPU.


In [21]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # encodings is expected to be a dictionary
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

    def __len__(self):
        return len(self.labels)

# Prepare datasets by combining inputs into a dictionary
train_encodings = {'input_ids': torch.stack(train_df['input_ids'].tolist()), 
                   'attention_mask': torch.stack(train_df['attention_mask'].tolist())}
train_labels = train_df['label_vector'].tolist()

val_encodings = {'input_ids': torch.stack(val_df['input_ids'].tolist()), 
                 'attention_mask': torch.stack(val_df['attention_mask'].tolist())}
val_labels = val_df['label_vector'].tolist()

test_encodings = {'input_ids': torch.stack(test_df['input_ids'].tolist()), 
                  'attention_mask': torch.stack(test_df['attention_mask'].tolist())}
test_labels = test_df['label_vector'].tolist()

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)


In [22]:
from transformers import XLNetForSequenceClassification, XLNetTokenizer, AdamW

model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=len(distortion_to_id))
model = model.to(device)



Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


In [24]:
from transformers import Trainer, TrainingArguments
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate and save checkpoint every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8, # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
    num_train_epochs=5,              # total number of training epochs
    gradient_accumulation_steps=4,
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,              # log after every n steps
    save_total_limit=5,              # number of total models to save
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, None) 
)




In [25]:
from transformers import set_seed, Trainer, TrainingArguments

set_seed(42)  # Set seed for reproducibility
training_args.fp16 = True  # Enable mixed precision


In [26]:

# torch.cuda.empty_cache()


In [27]:
trainer.train()


  0%|          | 0/320 [00:00<?, ?it/s]

{'loss': 0.5729, 'grad_norm': 2.8812711238861084, 'learning_rate': 1.9375e-05, 'epoch': 0.16}


KeyboardInterrupt: 

In [None]:
watch -n1 nvidia-smi


SyntaxError: invalid syntax (2126546691.py, line 1)