In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the preprocessed dataset; the relative path is resolved against the
# current working directory.
data_path = '../../preprocessing/StorePreprocessed/GPT35csv.csv'
data = pd.read_csv(data_path)

In [None]:
# Hold out 20% of the rows for testing. The split is seeded (random_state=42)
# and stratified on the "label" column.
texts = data["text"]
labels = data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# TF-IDF over character 4-grams, with the vocabulary capped at 11000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(4, 4),
    max_features=11000,
)

In [None]:
# Drop rows whose text is missing, then realign the labels to the surviving
# rows by index label.
train_text = X_train.dropna()
test_text = X_test.dropna()
y_train = y_train.loc[train_text.index]
y_test = y_test.loc[test_text.index]

# Fit the TF-IDF vocabulary on the training texts only and reuse it for the
# test texts.
# NOTE: X_train / X_test are rebound from text Series to the vectorizer's
# output here, so this cell cannot be re-run without re-running the split cell.
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [None]:
# Candidate values for the `C` hyperparameter; the logistic-regression and SVM
# grids use the same list (each grid gets its own copy).
c_candidates = [6.0, 2.0, 1.0, 0.95, 0.9, 0.8]
param_grid_lr = {'C': list(c_candidates)}
param_grid_svm = {'C': list(c_candidates)}

# Random-forest grid. The `max_features` candidates are derived from the number
# of columns in the vectorized training matrix: its square root, then 2%, 4%
# and 6% of it (each truncated to an int).
n_features = X_train.shape[1]
param_grid_rf = {
    'min_samples_split': [8, 32, 128],
    'max_features': [int(np.sqrt(n_features))]
    + [int(fraction * n_features) for fraction in (0.02, 0.04, 0.06)],
}

In [None]:
# Base estimators handed to the grid searches.
#
# The logistic regression (saga solver, elastic-net penalty with an even L1/L2
# mix) and the random forest are both stochastic, so they get a fixed
# random_state. 42 is the same seed the train/test split uses. Without it the
# tuned hyperparameters and the reported scores can change from run to run.
lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, random_state=42)
rf = RandomForestClassifier(criterion="gini", oob_score=True, random_state=42)

# The linear-kernel SVC is left unseeded: per the scikit-learn docs its
# random_state is only used when probability=True, which is not set here.
svm = SVC(kernel="linear")

In [None]:
# All three searches share the same protocol: 5-fold cross-validation, with
# candidates ranked by accuracy.
search_settings = {'cv': 5, 'scoring': 'accuracy'}

grid_svm = GridSearchCV(svm, param_grid_svm, **search_settings)
grid_lr = GridSearchCV(lr, param_grid_lr, **search_settings)
grid_rf = GridSearchCV(rf, param_grid_rf, **search_settings)

# Run the searches on the vectorized training data, in the order
# SVM -> logistic regression -> random forest.
grid_svm.fit(X_train, y_train)
grid_lr.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)

In [None]:
# Pull the best estimator out of each finished search ...
svm_best, lr_best, rf_best = (
    search.best_estimator_ for search in (grid_svm, grid_lr, grid_rf)
)

# ... and predict labels for the held-out test matrix with each of them.
y_pred_svm = svm_best.predict(X_test)
y_pred_lr = lr_best.predict(X_test)
y_pred_rf = rf_best.predict(X_test)

In [None]:
# Combine the three models with an element-wise bitwise AND of their predictions.
# NOTE(review): this behaves as "predict 1 only when all three models predict 1"
# only if the labels are encoded as 0/1 (or booleans) -- confirm the label
# encoding in the CSV.
y_pred_combined = np.bitwise_and.reduce((y_pred_svm, y_pred_lr, y_pred_rf))

# Score the combined predictions against the held-out labels.
combined_report = classification_report(y_test, y_pred_combined)
print("Combined Model Classification Report:\n", combined_report)
combined_accuracy = accuracy_score(y_test, y_pred_combined)
print("Combined Model Accuracy:", combined_accuracy)