In [1]:
import os
import joblib

project_dir = os.getcwd()
data_dir = os.path.join(project_dir, "data")
model_dir = os.path.join(project_dir, "model")

In [2]:
import pandas as pd
from tqdm import tqdm

pd.options.display.max_colwidth = 255
tqdm.pandas()

In [3]:
df = pd.read_pickle(f"{data_dir}/preprocessed.pkl")

In [4]:
df.head()

Unnamed: 0,id,title_tokenized,body_tokenized,tags
0,80,"[multiple, queries, one, statement]","[written, database, generation, script, SQL, and, want, execute, Adobe, AIR, application, Create, Table, tRole, roleID, integer, Primary, Key, roleName, varchar, Create, Table, tFile, fileID, integer, Primary, Key, fileName, varchar, fileDescription, ...","[flex, actionscript-3, air]"
1,90,"[Good, branching, and, merging, tutorials, for, TortoiseSVN]","[Are, there, any, really, good, tutorials, explaining, branching, and, merging, with, Apache, Subversion, All, the, better, specific, TortoiseSVN, client]","[svn, tortoisesvn, branch, branching-and-merging]"
2,120,"[Site, Maps]","[Has, anyone, got, experience, creating, providers, got, the, default, XML, file, working, properly, with, Menu, and, SiteMapPath, controls, but, need, way, for, the, users, site, create, and, modify, pages, dynamically, need, tie, page, viewing, perm...","[sql, asp.net, sitemap]"
3,180,"[Function, for, creating, color, wheels]","[This, something, many, times, and, never, quite, found, solution, That, stuck, with, The, problem, come, with, way, generate, colors, that, are, distinguishable, possible, where, parameter]","[algorithm, language-agnostic, colors, color-space]"
4,260,"[Adding, scripting, functionality, applications]","[have, little, game, written, C, #, uses, database, trading, card, game, and, wanted, implement, the, function, the, cards, script, What, mean, that, essentially, have, interface, ICard, which, card, class, implements, public, class, ICard, and, which...","[c#, .net, scripting, compiler-construction]"


### Number of tags (i.e. classes)

In [5]:
from collections import Counter

tag_count = Counter()

def count_tag(tags):
    for tag in tags:
        tag_count[tag] += 1

df["tags"].apply(count_tag)

len(tag_count.values())

38146

In [6]:
most_common_tags = [count[0] for count in tag_count.most_common(20)]
df["tags"] = df["tags"].progress_apply(lambda tags: [tag for tag in tags if tag in most_common_tags])

100%|████████████████████████████████████████████████████████████████████| 1264216/1264216 [00:04<00:00, 285746.00it/s]


In [7]:
df[df["tags"].map(lambda tags: len(tags) > 0)].shape

(850988, 4)

In [8]:
print(f"Only {1264216 - 850988:,} rows of data will be dropped while number of classes is reduced from {len(tag_count.values()):,} to 20!")

Only 413,228 rows of data will be dropped while number of classes is reduced from 38,146 to 20!


In [9]:
df = df[df["tags"].map(lambda tags: len(tags) > 0)]

### Untokenize text in order to use LLM-specific tokenizer

In [10]:
 def untokenize(text):
    untokenized_text = ' '.join([word for word in text])
    return untokenized_text

In [11]:
df['body'] = df['body_tokenized'].apply(untokenize)
df['title'] = df['title_tokenized'].apply(untokenize)

In [12]:
df.head()

Unnamed: 0,id,title_tokenized,body_tokenized,tags,body,title
2,120,"[Site, Maps]","[Has, anyone, got, experience, creating, providers, got, the, default, XML, file, working, properly, with, Menu, and, SiteMapPath, controls, but, need, way, for, the, users, site, create, and, modify, pages, dynamically, need, tie, page, viewing, perm...","[sql, asp.net]",Has anyone got experience creating providers got the default XML file working properly with Menu and SiteMapPath controls but need way for the users site create and modify pages dynamically need tie page viewing permissions into the standard membershi...,Site Maps
4,260,"[Adding, scripting, functionality, applications]","[have, little, game, written, C, #, uses, database, trading, card, game, and, wanted, implement, the, function, the, cards, script, What, mean, that, essentially, have, interface, ICard, which, card, class, implements, public, class, ICard, and, which...","[c#, .net]",have little game written C # uses database trading card game and wanted implement the function the cards script What mean that essentially have interface ICard which card class implements public class ICard and which contains function that are called ...,Adding scripting functionality applications
5,330,"[Should, use, nested, classes, this, case]","[working, collection, classes, used, for, video, playback, and, recording, have, one, main, class, which, acts, like, the, public, interface, with, methods, like, play, stop, pause, record, etc, Then, have, workhorse, classes, which, the, video, decod...",[c++],working collection classes used for video playback and recording have one main class which acts like the public interface with methods like play stop pause record etc Then have workhorse classes which the video decoding and video encoding just learned...,Should use nested classes this case
6,470,"[Homegrown, consumption, web, services]","[been, writing, few, web, services, for, .net, app, now, ready, consume, them, seen, numerous, examples, where, there, homegrown, code, for, consuming, the, service, opposed, using, the, auto, generated, methods, Visual, Studio, creates, when, adding,...",[.net],been writing few web services for .net app now ready consume them seen numerous examples where there homegrown code for consuming the service opposed using the auto generated methods Visual Studio creates when adding the web reference there some advan...,Homegrown consumption web services
8,650,"[Automatically, update, version, number]","[would, like, the, version, property, application, incremented, for, each, build, but, not, sure, how, enable, this, functionality, Visual, Studio, have, tried, specify, the, AssemblyVersion, but, does, get, exactly, what, want, also, using, settings,...",[c#],would like the version property application incremented for each build but not sure how enable this functionality Visual Studio have tried specify the AssemblyVersion but does get exactly what want also using settings file and earlier attempts when th...,Automatically update version number


### Concatenate 'title' and 'body' columns

In [13]:
df['text'] = df['title'] + ' ' + df['body']

#### Drop unwanted columns

In [14]:
del df['title']
del df['title_tokenized']
del df['body']
del df['body_tokenized']

In [15]:
X = df[["text"]] 
y = df[["tags"]]

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

multi_label_binarizer = MultiLabelBinarizer()
y = multi_label_binarizer.fit_transform(y["tags"])

### Train, val, test split

In [17]:
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, random_state=0)

In [19]:
import numpy as np 

# Step 1: Sum up the one-hot encoded vectors to get the count for each class
train_class_counts = np.sum(y_train, axis=0)
test_class_counts = np.sum(y_test, axis=0)

# Step 2: Calculate the percentage distribution for each class
total_train_instances = y_train.shape[0]
total_test_instances = y_test.shape[0]

train_class_distribution = train_class_counts / total_train_instances * 100
test_class_distribution = test_class_counts / total_test_instances * 100

# Print the distributions
print("Train Set Class Distribution (%):")
for class_idx, percentage in enumerate(train_class_distribution):
    print(f"Class {class_idx}: {percentage:.2f}%")

print("\nTest Set Class Distribution (%):")
for class_idx, percentage in enumerate(test_class_distribution):
    print(f"Class {class_idx}: {percentage:.2f}%")

Train Set Class Distribution (%):
Class 0: 2.83%
Class 1: 10.67%
Class 2: 2.39%
Class 3: 3.52%
Class 4: 2.73%
Class 5: 11.89%
Class 6: 5.59%
Class 7: 4.97%
Class 8: 6.94%
Class 9: 5.52%
Class 10: 2.53%
Class 11: 13.52%
Class 12: 14.60%
Class 13: 9.21%
Class 14: 4.99%
Class 15: 3.17%
Class 16: 11.61%
Class 17: 7.59%
Class 18: 3.03%
Class 19: 4.18%

Test Set Class Distribution (%):
Class 0: 2.83%
Class 1: 10.60%
Class 2: 2.41%
Class 3: 3.53%
Class 4: 2.72%
Class 5: 11.90%
Class 6: 5.59%
Class 7: 4.98%
Class 8: 6.90%
Class 9: 5.56%
Class 10: 2.53%
Class 11: 13.62%
Class 12: 14.54%
Class 13: 9.30%
Class 14: 5.00%
Class 15: 3.13%
Class 16: 11.62%
Class 17: 7.58%
Class 18: 3.02%
Class 19: 4.30%


In [20]:
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=test_ratio/(test_ratio + val_ratio), random_state=0)

In [21]:
import numpy as np 

# Step 1: Sum up the one-hot encoded vectors to get the count for each class
val_class_counts = np.sum(y_val, axis=0)
test_class_counts = np.sum(y_test, axis=0)

# Step 2: Calculate the percentage distribution for each class
total_val_instances = y_val.shape[0]
total_test_instances = y_test.shape[0]

val_class_distribution = val_class_counts / total_val_instances * 100
test_class_distribution = test_class_counts / total_test_instances * 100

# Print the distributions
print("Train Set Class Distribution (%):")
for class_idx, percentage in enumerate(val_class_distribution):
    print(f"Class {class_idx}: {percentage:.2f}%")

print("\nTest Set Class Distribution (%):")
for class_idx, percentage in enumerate(test_class_distribution):
    print(f"Class {class_idx}: {percentage:.2f}%")

Train Set Class Distribution (%):
Class 0: 2.82%
Class 1: 10.69%
Class 2: 2.39%
Class 3: 3.49%
Class 4: 2.72%
Class 5: 11.87%
Class 6: 5.56%
Class 7: 4.95%
Class 8: 6.92%
Class 9: 5.51%
Class 10: 2.54%
Class 11: 13.49%
Class 12: 14.57%
Class 13: 9.23%
Class 14: 4.98%
Class 15: 3.18%
Class 16: 11.62%
Class 17: 7.66%
Class 18: 3.04%
Class 19: 4.18%

Test Set Class Distribution (%):
Class 0: 2.84%
Class 1: 10.61%
Class 2: 2.39%
Class 3: 3.56%
Class 4: 2.74%
Class 5: 11.91%
Class 6: 5.62%
Class 7: 4.99%
Class 8: 6.94%
Class 9: 5.54%
Class 10: 2.53%
Class 11: 13.59%
Class 12: 14.60%
Class 13: 9.23%
Class 14: 5.00%
Class 15: 3.15%
Class 16: 11.60%
Class 17: 7.52%
Class 18: 3.02%
Class 19: 4.23%


### Preprocess the data for LLM

In [22]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

#### Revisit our data

In [24]:
y_train = y_train.tolist()
y_train = pd.DataFrame({"tags": y_train})

y_val = y_val.tolist()
y_val = pd.DataFrame({"tags": y_val})

y_test = y_test.tolist()
y_test = pd.DataFrame({"tags": y_test})

X_train = X_train.reset_index()
X_train = X_train.drop('index', axis=1)

X_val = X_val.reset_index()
X_val = X_val.drop('index', axis=1)

X_test = X_test.reset_index()
X_test = X_test.drop('index', axis=1)

train_data = pd.concat([(X_train.join(y_train))])
val_data = pd.concat([(X_val.join(y_val))])
test_data = pd.concat([(X_test.join(y_test))])

#### Tokenizing process uses ALL available RAM resources. I'm trying to do “all data at once”. Instead, need to chunk the data into batches and process them one by one.

In [25]:
def batched_encoding(text, tags, batch_size):
    num_text = len(text)
    batched_encodings = []
    batched_tags = [] 
    for start in range(0, num_text, batch_size):
        batch_texts = text[start:start + batch_size]
        batch_tags = tags[start:start + batch_size]

        batch_encodings = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
        batched_encodings.append(batch_encodings)
        
        batched_tags.append(batch_tags)
    return batch_encodings, batch_tags

In [26]:
X_train.rename(columns={'text': 'texts'}, inplace=True)
X_val.rename(columns={'text': 'texts'}, inplace=True)
X_test.rename(columns={'text': 'texts'}, inplace=True)

In [28]:
import torch

train_encodings, train_tags = batched_encoding(X_train['texts'].tolist(), y_train['tags'].tolist(), batch_size=16)

In [None]:
input_train_encodings = {
    'input_ids': torch.cat([batch['input_ids'] for batch in train_encodings], dim=0),
    'attention_mask': torch.cat([batch['attention_mask'] for batch in train_encodings], dim=0),
}
tags_train_tensor = torch.tensor([tag for batch_tags in batched_tags for tag in train_tags])

In [None]:
val_encodings, val_tags = batched_encoding(X_val['texts'].tolist(), y_val['tags'].tolist(), batch_size=16)

input_val_encodings = {
    'input_ids': torch.cat([batch['input_ids'] for batch in val_encodings], dim=0),
    'attention_mask': torch.cat([batch['attention_mask'] for batch in val_encodings], dim=0),
}
tags_val_tensor = torch.tensor([tag for batch_tags in batched_tags for tag in val_tags])

In [None]:
test_encodings, test_tags = batched_encoding(X_test['texts'].tolist(), y_test['tags'].tolist(), batch_size=16)

input_test_encodings = {
    'input_ids': torch.cat([batch['input_ids'] for batch in test_encodings], dim=0),
    'attention_mask': torch.cat([batch['attention_mask'] for batch in test_encodings], dim=0),
}
tags_test_tensor = torch.tensor([tag for batch_tags in batched_tags for tag in test_tags])

In [None]:
from torch.utils.data import Dataset

class StackSampleDataset(Dataset):
    def __init__(self, encodings, tags):
        self.encodings = encodings
        self.tags = tags

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.tags[idx]
        return item

In [None]:
train_batch_size = 8
eval_batch_size = 4

In [None]:
training_data = StackSampleDataset(train_encodings, train_tags)
evaluation_data = StackSampleDataset(input_val_encodings, tags_val_tensor)
testing_data = StackSampleDataset(input_test_encodings, tags_test_tensor)

In [None]:
from torch.utils.data import DataLoader

traindata_loader = DataLoader(training_data, batch_size=train_batch_size, shuffle=True)
evaldata_loader = DataLoader(evaluation_data, batch_size=eval_batch_size, shuffle=True)
testdata_loader = DataLoader(test_data, batch_size=eval_batch_size, shuffle=True)

### Defining model and train

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=20)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(traindata_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in traindata_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

### Evaluate the model

In [None]:
import evaluate

precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1-score = evaluate.load("f1-score")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

precision.compute()
recall.compute()
f1-score.compute()