# Objectives

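# 1. Train a Logistic Regression baseline on the pre-vectorized features.
# 2. Fine-tune BERT (`bert-base-uncased`) as a two-class sequence classifier.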

In [11]:
import numpy as np
from datasets import Dataset
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding


In [13]:
# Using Logistic Regression

# Read the processed feature matrices and label arrays, grouped by split.
X_train_vect = load_npz('../data/processed/X_train_vect.npz')
X_test_vect = load_npz('../data/processed/X_test_vect.npz')
y_train = np.load('../data/processed/y_train.npy')
y_test = np.load('../data/processed/y_test.npy')

# Build and fit in one expression; `fit` returns the estimator itself.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
).fit(X_train_vect, y_train)

# Score the held-out split with the fitted model.
y_pred_log_reg = log_reg.predict(X_test_vect)

# Report accuracy, per-class metrics and the confusion matrix.
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))


Logistic Regression Performance:
Accuracy: 0.779303125
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.76      0.77    160000
           1       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

Confusion Matrix:
[[121188  38812]
 [ 31811 128189]]


In [14]:
# Training split: raw inputs and labels.
# NOTE(review): allow_pickle=True is presumably needed because the text arrays
# are stored with object dtype - confirm. Unpickling can execute arbitrary
# code, so only load files produced by this project's own preprocessing.
X_train = np.load('../data/processed/X_train.npy', allow_pickle=True)
y_train = np.load('../data/processed/y_train.npy')

# Test split: raw inputs and labels.
X_test = np.load('../data/processed/X_test.npy', allow_pickle=True)
y_test = np.load('../data/processed/y_test.npy')

# Sanity check: report how many samples each split contains.
print(f"Loaded {len(X_train)} training samples and {len(X_test)} test samples")

Loaded 1280000 training samples and 320000 test samples


In [22]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples, max_length=128):
    """Tokenize the 'text' field of a batch for the BERT model.

    Args:
        examples: Batch mapping that contains a 'text' entry, as passed by
            `Dataset.map(..., batched=True)`.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128.

    Returns:
        The tokenizer output for the batch; when used with `Dataset.map`,
        its fields become new columns of the dataset.
    """
    # Deliberately no `return_tensors="pt"`: `Dataset.map` writes results to
    # Arrow storage, so framework tensors created here would only be converted
    # straight back. Batches are turned into tensors later by the data collator.
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Wrap each split in a Hugging Face Dataset and tokenize its 'text' column
# in batches, as a single chained expression per split.
train_data = Dataset.from_dict(
    {'text': X_train, 'label': y_train}
).map(tokenize_function, batched=True)
test_data = Dataset.from_dict(
    {'text': X_test, 'label': y_test}
).map(tokenize_function, batched=True)

# Persist the tokenized splits so later cells can reload them from disk.
train_data.save_to_disk('../data/processed/train_data_bert')
test_data.save_to_disk('../data/processed/test_data_bert')


Map: 100%|██████████| 1280000/1280000 [24:54<00:00, 856.44 examples/s] 
Map: 100%|██████████| 320000/320000 [08:35<00:00, 620.20 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 1280000/1280000 [00:15<00:00, 84847.91 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 320000/320000 [00:04<00:00, 71524.30 examples/s] 


In [15]:
# Reload the tokenized splits written to disk by the tokenization cell.
train_data = Dataset.load_from_disk('../data/processed/train_data_bert')
test_data = Dataset.load_from_disk('../data/processed/test_data_bert')

# Collator that pads each batch using the bert-base-uncased tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
)

# Load BERT with a two-label classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Set up training arguments
def compute_accuracy(eval_pred):
    """Return {'accuracy': ...} computed from the argmax of the predictions."""
    predicted_labels = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': accuracy_score(predicted_labels, eval_pred.label_ids)}


training_args = TrainingArguments(
    # Output and logging locations
    output_dir='../results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Optimisation settings
    warmup_steps=500,
    weight_decay=0.01,
    # Run on CPU
    use_cpu=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
)



# Train the model
trainer.train()

# Evaluate the model on the test split
predictions = trainer.predict(test_data)
y_true = predictions.label_ids
# Predicted class = index of the largest logit along the last axis.
y_pred = predictions.predictions.argmax(axis=-1)

# Fraction of predictions that match the reference labels.
accuracy = np.mean(y_pred == y_true)

conf_matrix = confusion_matrix(y_true, y_pred)

# NOTE(review): target_names assumes label 0 is 'Negative' and 1 is 'Positive' - confirm.
report = classification_report(
    y_true,
    y_pred,
    target_names=['Negative', 'Positive'],
)

print("BERT Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(conf_matrix)