# Phase 1: Data loading, label encoding and splitting

In [13]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from utils import CircuitDataset

# Settings for the quick-test run, gathered here so they are easy to tune.
DATA_PATH = "../../data/combined_data.csv"
SAMPLE_SIZE = 3000  # number of rows kept for the small test run
SEED = 42           # fixed seed so the same subsample is drawn on every run

# The CSV is read with header=None, so the column names are supplied explicitly.
column_names = ["Cause", "Action"]
df = pd.read_csv(DATA_PATH, header=None, names=column_names)
print(f"loaded {len(df)} records")

# Missing Action values become empty strings.
df['Action'] = df['Action'].fillna('')

# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap the request at what is available.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED).reset_index(drop=True)
print(f"running on a small sample of {len(df)} records for testing")

loaded 28613 records
running on a small sample of 3000 records for testing


In [14]:
# Distinct values found in the Cause column.
unique_cases = df["Cause"].unique().tolist()

# Integer id -> cause name, and the inverse lookup derived from it.
id2label = dict(enumerate(unique_cases))
label2id = {cause: idx for idx, cause in id2label.items()}

# Attach the integer class id of every row as a new "Labels" column.
df["Labels"] = df["Cause"].map(label2id)

print("labels encoded", f"Mapping: {label2id}", sep="\n")

labels encoded
Mapping: {'Isolation irregularity': 0, 'No fault found': 1, 'Suspect bird strike': 2, 'Overload (class 90)': 3, 'Ole equipment failure': 4, 'Animals incursion': 5, 'Bird strike': 6, 'Operational error': 7, 'Overload': 8, 'Historic fault closed': 9, 'Other discipline/ third party': 10, 'Suspect vegetation': 11, 'Suspect faulty train unit': 12, 'Foreign object': 13, 'Distribution': 14, 'Vegetation': 15, 'Weather': 16, 'Faulty train unit': 17, 'Bird nest': 18, 'Insulator': 19, 'Water ingress': 20, 'Miscellaneous': 21, 'Icicles': 22}


In [15]:


# Step 1: hold out 20% of the rows, stratified on the Labels column.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Labels"])

# Step 2: split the held-out rows in half into validation and test sets,
# again stratified on the Labels column.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["Labels"])

# Report the number of rows in each split.
for split_label, split_frame in (
    ("training set size", train_df),
    ("Validation set size", val_df),
    ("Test set size", test_df),
):
    print(f"{split_label}: {len(split_frame)}")

training set size: 2400
Validation set size: 300
Test set size: 300


# Phase 2: Tokenisation and dataset construction


# Tokenisation

In [16]:

# Load the tokeniser that belongs to the chosen pretrained checkpoint.
MODEL_NAME = "distilbert-base-uncased"
tokeniser = AutoTokenizer.from_pretrained(MODEL_NAME)

# Keyword arguments shared by every tokeniser call so all three splits are
# encoded with the same truncation, padding and maximum-length settings.
tokeniser_options = dict(truncation=True, padding=True, max_length=128)

# Tokenise the Action text of each split.
train_encodings = tokeniser(train_df['Action'].tolist(), **tokeniser_options)
val_encodings = tokeniser(val_df['Action'].tolist(), **tokeniser_options)
test_encodings = tokeniser(test_df['Action'].tolist(), **tokeniser_options)

print("Tokenisation complete")

Tokenisation complete


In [None]:
# Pair each split's encodings with its integer labels and wrap them in a
# CircuitDataset (imported from utils); train, validation and test are built
# in that order.
train_dataset, val_dataset, test_dataset = (
    CircuitDataset(split_encodings, split_frame["Labels"].tolist())
    for split_encodings, split_frame in (
        (train_encodings, train_df),
        (val_encodings, val_df),
        (test_encodings, test_df),
    )
)
print("Datasets created")

datasets created
