In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch

# Dataset location and column roles, declared once so the notebook can be
# pointed at another directory or schema by editing a single line each.
DATA_DIR = "C:/cse_584/midterm/data"
FEATURE_COLUMNS = ['input_text', 'generated_text']
TARGET_COLUMN = 'model_name'

# Load the pre-split train and test CSV files
train_df = pd.read_csv(f"{DATA_DIR}/train_df.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_df.csv")

# Split each frame into features (X: the two text columns) and target (y: the label column)
X_train = train_df[FEATURE_COLUMNS]
y_train = train_df[TARGET_COLUMN]
X_test = test_df[FEATURE_COLUMNS]
y_test = test_df[TARGET_COLUMN]



In [4]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Fetch the pre-trained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
# Place the encoder on the GPU when CUDA is available, otherwise keep it on the CPU.
bert_model = bert_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Clean both splits so every value handed to the tokenizer is a real string.
# Rows missing either text column are dropped, and the label Series are
# re-aligned to the surviving row index so X and y always have equal length
# (filtering X alone would make the later clf.fit / scoring calls fail).
# dropna + astype each return a new frame, so no column is assigned on a
# filtered slice (avoids pandas' SettingWithCopyWarning) and re-running this
# cell is a no-op.
text_columns = ['input_text', 'generated_text']

X_train = X_train.dropna(subset=text_columns).astype({col: str for col in text_columns})
y_train = y_train.loc[X_train.index]

X_test = X_test.dropna(subset=text_columns).astype({col: str for col in text_columns})
y_test = y_test.loc[X_test.index]

# Function to extract BERT embeddings in batches
def get_bert_embeddings_in_batches(text_list, batch_size=64):
    """Embed texts with BERT, returning the first-token vector for each text.

    The texts are tokenized with the module-level ``tokenizer`` and passed
    through the module-level ``bert_model`` in chunks of ``batch_size`` with
    gradient tracking disabled.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A 2-D NumPy array with one row per input text, taken from
        ``last_hidden_state`` at token position 0. An empty ``text_list``
        yields an array of shape ``(0, hidden_size)`` instead of raising.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # np.vstack([]) raises, so short-circuit with a correctly shaped empty
    # result that can still be hstack-ed with other embedding matrices.
    if len(text_list) == 0:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # Loop-invariant: send inputs to wherever the model's weights actually
    # live rather than re-deriving the device from CUDA availability per batch.
    device = bert_model.device

    all_embeddings = []
    # Process data in batches to avoid memory issues
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i+batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = bert_model(**tokens)
        # Use CLS token embedding (outputs of the first token)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

# Embed each text column of each split; the generators are consumed in order,
# so the calls run as: train input, train generated, test input, test generated.
X_train_input_bert, X_train_generated_bert = (
    get_bert_embeddings_in_batches(X_train[column].tolist(), batch_size=64)
    for column in ('input_text', 'generated_text')
)
X_test_input_bert, X_test_generated_bert = (
    get_bert_embeddings_in_batches(X_test[column].tolist(), batch_size=64)
    for column in ('input_text', 'generated_text')
)

# Place the input-text and generated-text embeddings side by side, one row per sample
X_train_combined_bert = np.concatenate((X_train_input_bert, X_train_generated_bert), axis=1)
X_test_combined_bert = np.concatenate((X_test_input_bert, X_test_generated_bert), axis=1)


KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Train a classifier (Logistic Regression in this case) on the combined embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_combined_bert, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined_bert)

# Evaluate with the support-weighted F1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1}")

# Generate the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix. The label order is pinned to clf.classes_ so the matrix
# rows/columns line up with the tick labels used below; without `labels=` the
# order and size are inferred from y_test/y_pred and can differ from the
# classes seen during training.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=clf.classes_,
    yticklabels=clf.classes_,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()
