In [None]:
# Draw a reproducible random sample of up to 100,000 rows from `df`.
# - random_state pins the sample so re-running the cell rewrites the same CSV.
# - n is capped at len(df) because DataFrame.sample (without replacement)
#   raises ValueError when asked for more rows than the frame contains.
sample_df = df.sample(n=min(100000, len(df)), random_state=42)

# write the sample DataFrame to a CSV file
sample_df.to_csv('100000twtunlabeled.csv', index=False)

# confirm that the new file was created and contains the expected data
new_df = pd.read_csv('100000twtunlabeled.csv')
print(new_df.head())

In [None]:
# Read the comma-delimited file '100000twtfinance.csv' into `test_data`.
test_data = pd.read_csv(filepath_or_buffer='100000twtfinance.csv', delimiter=',')

In [None]:
# Read the comma-delimited file '100000twtnotfinance.csv' into `test_data`
# (rebinds the name, replacing whatever it held before).
test_data = pd.read_csv(filepath_or_buffer='100000twtnotfinance.csv', delimiter=',')

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Fine-tune a 2-label BERT classifier ('dbmdz/bert-base-turkish-cased') on two
# labeled CSV files, then predict labels for '139twtunlabeled.csv'.
#
# Load the labeled training sets and create input tensors
# Assume that the training data is in CSV format with three columns: message_detail, username, and label
train_data_1 = pd.read_csv('139twtfinance.csv', sep=',')
train_data_2 = pd.read_csv('139twtnotfinance.csv', sep=',')

# Rows without a label cannot be used for supervised training.
train_data_1 = train_data_1.dropna(subset=['label'])
train_data_2 = train_data_2.dropna(subset=['label'])

print(train_data_1.isnull().sum())  # Check for NaN values
print(train_data_1.dtypes)  # Check the data types of each column

print(train_data_2.isnull().sum())  # Check for NaN values
print(train_data_2.dtypes)  # Check the data types of each column

# Drop rows with missing text at the DataFrame level (not on the extracted
# lists) so texts and labels stay aligned row-for-row. A NaN text would
# otherwise reach the tokenizer as a float and make it raise.
train_data_1 = train_data_1.dropna(subset=['message_detail'])
train_data_2 = train_data_2.dropna(subset=['message_detail'])

train_texts_1 = train_data_1["message_detail"].astype(str).tolist()
train_texts_2 = train_data_2["message_detail"].astype(str).tolist()

# Convert labels to integers (if necessary)
train_labels_1 = [int(label) for label in train_data_1["label"]]
train_labels_2 = [int(label) for label in train_data_2["label"]]

tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')

# padding=True pads every sequence to the longest one within its own call, so
# the two encodings may have different sequence lengths.
train_encodings_1 = tokenizer(train_texts_1, truncation=True, padding=True, return_tensors="pt")
train_encodings_2 = tokenizer(train_texts_2, truncation=True, padding=True, return_tensors="pt")

# Check the size of the input tensors
assert train_encodings_1["input_ids"].size(0) == len(train_labels_1), "Size mismatch between input_ids and labels"
assert train_encodings_1["attention_mask"].size(0) == len(train_labels_1), "Size mismatch between attention_mask and labels"
assert train_encodings_2["input_ids"].size(0) == len(train_labels_2), "Size mismatch between input_ids and labels"
assert train_encodings_2["attention_mask"].size(0) == len(train_labels_2), "Size mismatch between attention_mask and labels"

train_dataset_1 = torch.utils.data.TensorDataset(
    train_encodings_1["input_ids"],
    train_encodings_1["attention_mask"],
    torch.tensor(train_labels_1),
)

train_dataset_2 = torch.utils.data.TensorDataset(
    train_encodings_2["input_ids"],
    train_encodings_2["attention_mask"],
    torch.tensor(train_labels_2),
)

# Fine-tune the BERT model on the training sets
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

train_loader_1 = torch.utils.data.DataLoader(train_dataset_1, batch_size=8, shuffle=True)
train_loader_2 = torch.utils.data.DataLoader(train_dataset_2, batch_size=8, shuffle=True)

model.train()
for epoch in range(3):
    # Each epoch is one full pass over set 1 followed by one full pass over set 2.
    # NOTE(review): if each file holds a single class, this ordering feeds the
    # model long single-class runs; consider mixing both sets into one shuffled
    # loader - confirm against the data before changing.
    for loader in (train_loader_1, train_loader_2):
        for batch in loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            # Passing labels makes the model compute the classification loss itself.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

# Load the test set and create input tensors
# Assume that the test data is in CSV format with two columns: message_detail and username
test_data = pd.read_csv('139twtunlabeled.csv', sep=',')
# Missing texts become empty strings (rather than being dropped) so that
# predictions stay index-aligned with the rows of test_data.
test_texts = test_data["message_detail"].fillna("").astype(str).tolist()

test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
test_dataset = torch.utils.data.TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
)

# Use the trained model to make predictions on the test set
model.eval()
predictions = []
with torch.no_grad():  # inference only: no gradients needed
    for batch in torch.utils.data.DataLoader(test_dataset, batch_size=8):
        input_ids, attention_mask = (t.to(device) for t in batch)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())

# Print the accuracy of the model
# NOTE(review): the reference labels here are a hard-coded all-zero vector, so
# this number is the share of samples predicted as class 0, not a true accuracy.
# Replace the placeholder with real labels before reporting it.
# accuracy_score expects (y_true, y_pred) in that order.
print("Accuracy:", accuracy_score([0] * len(predictions), predictions))

In [None]:
# Use the fine-tuned BERT model to predict the labels of the test set.
# Relies on `model`, `device`, `test_dataset` and `test_texts` already existing
# in the kernel.
model.eval()
test_set_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8)
test_set_predictions = []

with torch.no_grad():  # inference only: no gradients needed
    for batch in test_set_dataloader:
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)

        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)

        # Highest-scoring class index per example.
        batch_predictions = outputs.logits.argmax(dim=-1).tolist()
        test_set_predictions.extend(batch_predictions)

# Print the accuracy of the model
# Score the predictions produced in THIS cell (`test_set_predictions`), not a
# `predictions` list left over from an earlier cell.
# NOTE(review): the reference labels are a hard-coded all-zero placeholder, so
# this is the share of samples predicted as class 0, not a true accuracy.
print("Accuracy:", accuracy_score([0] * len(test_set_predictions), test_set_predictions))

# Print the Text and the predicted label of each example in the test set
for text, label in zip(test_texts, test_set_predictions):
    print(f"{text}\n{label}\n")

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup

# Fine-tune a 3-label BERT classifier ('dbmdz/bert-base-turkish-cased') on
# '5000_tlg_labeled.csv' (columns used: `content`, `label`) and save it to the
# 'tlg_labeled' directory.

# Load the labeled training set and create input tensors
# NOTE(review): Python only recognises 'ANSI' as a codec alias on Windows; on
# other platforms this call raises LookupError. If the file must load elsewhere,
# name the actual code page explicitly - confirm which one the file uses.
train_data = pd.read_csv('5000_tlg_labeled.csv', sep=';', encoding='ANSI')

# Filter the training data to include only 500 rows for each label
train_data = train_data.groupby('label').head(500)

# Convert labels to integers
label2int = {"J": 0, "M": 1, "T": 2}
# .assign builds a new frame instead of writing into the groupby/head result.
train_data = train_data.assign(label=train_data['label'].map(label2int))

# .map() turns any label outside label2int into NaN, which would silently give
# a float label tensor and break CrossEntropyLoss. Drop those rows (and rows
# with missing text, which the tokenizer cannot handle), then restore int labels.
train_data = train_data.dropna(subset=['label', 'content']).astype({'label': int})

# Get the training texts and labels as lists
train_texts = train_data["content"].astype(str).tolist()
train_labels = train_data["label"].tolist()

# Load the BERT tokenizer and tokenize the input texts
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")

# Create a PyTorch dataset from the input encodings and labels
train_dataset = TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(train_labels),
)

# Load the BERT model and set the device
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=3)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Define the training hyperparameters
batch_size = 4
learning_rate = 1e-5
num_epochs = 2

# Create a PyTorch DataLoader for the training set
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Derive the schedule from the real number of optimizer steps. len(dataloader)
# counts the final partial batch, which `len(dataset) // batch_size` would miss.
steps_per_epoch = len(train_dataloader)
warmup_steps = 0.1 * steps_per_epoch  # warm up over 10% of the first epoch
total_steps = steps_per_epoch * num_epochs
num_warmup_steps = int(warmup_steps)
# num_training_steps is the TOTAL step count, warmup included. Subtracting the
# warmup steps would drive the learning rate to zero before training finishes.
num_training_steps = int(total_steps)

# Create the optimizer and the scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Define the loss function and the evaluation metric
loss_fn = torch.nn.CrossEntropyLoss()
metric_fn = torch.nn.functional.softmax  # not used below; kept for later cells

# Training loop
model.train()

for epoch in range(num_epochs):
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass (labels are not passed; the loss is computed explicitly below)
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits

        # Calculate loss
        loss = loss_fn(logits, batch_labels)

        # Backward pass
        loss.backward()

        # Clip the gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update the parameters
        optimizer.step()

        # Update the learning rate scheduler (once per optimizer step)
        scheduler.step()

        # Print the loss for every 10th batch
        if step % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {step+1}/{len(train_dataloader)}, Loss {loss.item():.4f}")

# Save the fine-tuned model
model.save_pretrained('tlg_labeled')

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Epoch 1/2, Step 1/225, Loss 0.9259


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Continue training a saved 2-label BERT checkpoint on `train_dataset_2`, save
# it back to the same file, then predict on '139twtunlabeled.csv' and print
# metrics.
# Relies on `train_dataset_2` and `tokenizer` already existing in the kernel.

# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the saved model
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# NOTE(review): the checkpoint must come from a 2-label model, otherwise the
# classifier head shapes will not match - confirm how this file was produced.
model.load_state_dict(torch.load("tlg_label_trained_model.pth", map_location=device))
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

train_loader_2 = torch.utils.data.DataLoader(train_dataset_2, batch_size=8, shuffle=True)

model.train()
for epoch in range(3):
    for batch in train_loader_2:
        optimizer.zero_grad()
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the retrained model (overwrites the checkpoint that was loaded above)
torch.save(model.state_dict(), "tlg_label_trained_model.pth")

# Load the test set and create input tensors
test_data = pd.read_csv('139twtunlabeled.csv', sep=',')
# Missing texts become empty strings (rather than being dropped) so that
# predictions stay index-aligned with the rows of test_data.
test_texts = test_data["message_detail"].fillna("").astype(str).tolist()

test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
test_dataset = torch.utils.data.TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
)

# Use the retrained model to make predictions on the test set
model.eval()
predictions = []
with torch.no_grad():
    for batch in torch.utils.data.DataLoader(test_dataset, batch_size=8):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1)
        predictions.extend(batch_predictions.tolist())

# Calculate evaluation metrics
# NOTE(review): placeholder reference labels - every metric below is only
# meaningful once these are replaced with the real labels.
true_labels = [0] * len(predictions)  # Replace with the correct true labels

accuracy = accuracy_score(true_labels, predictions)
# With no positive reference labels, recall (and possibly precision) is 0/0.
# zero_division=0 returns the same 0 scikit-learn would, without the warning.
precision = precision_score(true_labels, predictions, zero_division=0)
recall = recall_score(true_labels, predictions, zero_division=0)
f1 = f1_score(true_labels, predictions, zero_division=0)
# Fix the label set so the matrix is always 2x2, even if only one class occurs.
confusion_mat = confusion_matrix(true_labels, predictions, labels=[0, 1])

# Print the evaluation metrics
print("Accuracy on unlabeled test dataset:", accuracy)
print("Precision on unlabeled test dataset:", precision)
print("Recall on unlabeled test dataset:", recall)
print("F1 Score on unlabeled test dataset:", f1)
print("Confusion Matrix:")
print(confusion_mat)

In [None]:
# Load a saved 2-label BERT checkpoint and print a predicted label for every
# row of '139twtunlabeled.csv'.
# Relies on `tokenizer` already existing in the kernel.

# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the retrained model
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# NOTE(review): verify that "retrained_model.pth" is the intended checkpoint
# file and that it was saved from a 2-label model.
model.load_state_dict(torch.load("retrained_model.pth", map_location=device))
model.to(device)

# Load the test set and create input tensors
# Assume that the test data is in CSV format with two columns: message_detail and username
test_data = pd.read_csv('139twtunlabeled.csv', sep=',')
# Missing texts become empty strings (rather than being dropped) so that
# predictions stay index-aligned with the rows of test_data.
test_texts = test_data["message_detail"].fillna("").astype(str).tolist()

test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
test_dataset = torch.utils.data.TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
)

# Use the trained model to make predictions on the test set
model.eval()
predictions = []
with torch.no_grad():  # inference only: no gradients needed
    for batch in torch.utils.data.DataLoader(test_dataset, batch_size=8):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1)
        predictions.extend(batch_predictions.tolist())

# Print the Text and the predicted label of each example in the test set
for text, label in zip(test_texts, predictions):
    print(f"{text}\n{label}\n")


In [None]:
# List the name of every parameter in `model`, one per line, each followed by
# a blank line. The parameter tensors themselves are not inspected.
for param_name, _ in model.named_parameters():
    print(f"Parameter name: {param_name}", end="\n\n")
