In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load the paired dataset. Two columns are read from it below:
# "Generated Response" and "Original Tweet".
data = pd.read_csv("llama3csv.csv")

# Reshape into one long frame with columns 'text' and 'label':
# generated responses get label 1, original tweets get label 0.
gr = data["Generated Response"].copy()
ot = data["Original Tweet"].copy()
gr_df = pd.DataFrame({'text': gr, 'label': 1})
ot_df = pd.DataFrame({'text': ot, 'label': 0})
data = pd.concat([gr_df, ot_df], ignore_index=True, sort=False)

# Shuffle with a fixed seed: without random_state the row order changes on
# every run, so the notebook would not reproduce under Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print(data.head())


                                                text  label
0  ahead cop key paragraph g text set stage actio...      1
1  shameful display pettiness demand collective e...      1
2  whats tip point transform global sustainabilit...      1
3  climate crisis smoke screen real issue uk shoc...      1
4  bad outcome person die million thats question ...      0


In [2]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; random_state=42 fixes the
# split for a given row order.
texts, labels = data["text"], data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


In [7]:
# TF-IDF over character 4-grams, with the vocabulary capped via max_features=11000.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 4), max_features=11000)

# Stratified 80/20 split of the raw text and labels (seeded with 42).
texts, labels = data["text"], data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Discard rows whose text is missing, then realign the labels to the
# surviving row index so features and targets stay paired.
X_train = X_train[X_train.notna()]
X_test = X_test[X_test.notna()]
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Learn the vocabulary and IDF weights on the training text only, then apply
# that fitted transform to the test text. From here on X_train / X_test hold
# the vectorizer output rather than the raw text.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [8]:
# Hyperparameter grids for the three classifiers tuned by the grid searches.
# The linear SVM and logistic regression share a single grid over C.
param_grid_svm_lr = {
    'C': [6.0, 2.0, 1.0, 0.95, 0.9, 0.8]
}

# Candidate per-split feature counts for the forest: sqrt(n) plus 2%, 4% and
# 6% of n, where n is the number of columns in the vectorized training matrix.
n_features = X_train.shape[1]
rf_max_features = [
    int(np.sqrt(n_features)),
    int(0.02 * n_features),
    int(0.04 * n_features),
    int(0.06 * n_features),
]
# int() truncates small fractions to 0, which is not a valid integer
# max_features, so clamp to at least 1. Dropping duplicates (order preserved)
# avoids fitting the same configuration more than once.
rf_max_features = list(dict.fromkeys(max(1, k) for k in rf_max_features))

param_grid_rf = {
    'min_samples_split': [8, 32, 128],
    'max_features': rf_max_features,
}

# Initialize models.
# random_state is pinned on the estimators that use randomness (liblinear's
# data shuffling, the forest's bootstrap/feature sampling) so repeated runs
# give the same scores.
svm = SVC(kernel='linear')
lr = LogisticRegression(solver='liblinear', penalty='l2', random_state=42)
rf = RandomForestClassifier(criterion='gini', oob_score=True, random_state=42)


In [9]:
# Tune each classifier with an exhaustive grid search, scored by accuracy
# under 5-fold cross-validation. fit() returns the search object itself, so
# each search is built and fitted in one expression.
cv_options = dict(cv=5, scoring='accuracy')

# Linear SVM over the shared C grid
grid_svm = GridSearchCV(svm, param_grid_svm_lr, **cv_options).fit(X_train, y_train)

# Logistic regression over the same C grid
grid_lr = GridSearchCV(lr, param_grid_svm_lr, **cv_options).fit(X_train, y_train)

# Random forest over its own grid
grid_rf = GridSearchCV(rf, param_grid_rf, **cv_options).fit(X_train, y_train)

# Keep the fitted random-forest search as the cell's displayed result.
grid_rf


In [None]:
# Pull the best estimator out of each grid search.
svm_best = grid_svm.best_estimator_
lr_best = grid_lr.best_estimator_
rf_best = grid_rf.best_estimator_

# Predict on the held-out test features with each tuned model.
y_pred_svm = svm_best.predict(X_test)
y_pred_lr = lr_best.predict(X_test)
y_pred_rf = rf_best.predict(X_test)

# Print the classification report followed by the accuracy for each model,
# in the order SVM, logistic regression, random forest.
for report_header, accuracy_header, predictions in (
    ("SVM Classification Report:\n", "SVM Accuracy:", y_pred_svm),
    ("Logistic Regression Classification Report:\n", "Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Classification Report:\n", "Random Forest Accuracy:", y_pred_rf),
):
    print(report_header, classification_report(y_test, predictions))
    print(accuracy_header, accuracy_score(y_test, predictions))


In [None]:
# Unanimity vote: a record is labelled artificial (1) only when the SVM,
# logistic regression and random forest ALL predict 1; any dissenting model
# makes the combined label 0. (Unanimous 0 predictions therefore also give 0.)
#
# Each prediction is compared against the positive label explicitly instead of
# bitwise-ANDing the raw label values, which only has this meaning when the
# predictions happen to be exact 0/1 integers.
ARTIFICIAL_LABEL = 1
all_predict_artificial = (
    (y_pred_svm == ARTIFICIAL_LABEL)
    & (y_pred_lr == ARTIFICIAL_LABEL)
    & (y_pred_rf == ARTIFICIAL_LABEL)
)
y_pred_combined = all_predict_artificial.astype(int)

print("Combined Model Classification Report:\n", classification_report(y_test, y_pred_combined))
print("Combined Model Accuracy:", accuracy_score(y_test, y_pred_combined))
