In [2]:
import pandas as pd
import numpy as np

In [5]:
# Read the CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='roberta_embeddings_english_abstractive.csv')

In [6]:
# Show the loaded frame; the output below lists an "Unnamed: 0" column,
# numbered columns 0-767 and a final "Judgement Status" column.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,Judgement Status
0,-0.045318,0.106291,-0.003215,-0.067849,0.027385,-0.127473,-0.029526,0.032465,0.032632,-0.086068,...,-0.050516,-0.105249,-0.060616,0.014517,0.111777,0.058287,-0.074195,0.009236,0.020355,1
1,-0.051724,0.116939,0.019695,-0.105494,0.033264,-0.070311,-0.006279,0.061977,0.007398,-0.091103,...,-0.015868,-0.088793,-0.080782,0.007715,0.105455,0.058583,-0.040270,0.008042,-0.013936,2
2,-0.010546,0.108884,0.001479,-0.138257,0.051355,-0.135540,0.008098,0.051294,0.010994,-0.042763,...,0.014642,-0.096850,-0.071336,-0.050518,0.096773,0.024355,-0.091703,-0.011808,0.005190,0
3,-0.054942,0.087816,0.019472,-0.118021,0.059199,-0.082324,-0.024870,0.060929,0.012410,-0.098716,...,-0.011953,-0.098715,-0.092882,-0.004579,0.122452,-0.007312,-0.053264,-0.000205,0.005530,0
4,-0.026756,0.077807,-0.000185,-0.094653,0.057685,-0.071129,-0.000685,0.097987,0.053599,-0.065829,...,-0.013227,-0.050602,-0.054803,-0.005757,0.072578,0.028736,-0.040133,-0.029626,0.037816,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,-0.052280,0.063353,-0.029092,-0.112168,0.026026,-0.086139,-0.025001,0.079336,0.038220,-0.089051,...,0.003848,-0.049991,-0.083095,0.011732,0.136884,0.000406,-0.103072,-0.027759,-0.016507,3
596,-0.040454,0.080203,-0.015394,-0.101792,0.036181,-0.041597,0.007757,0.047865,0.008642,-0.091039,...,0.021505,-0.032031,-0.064167,0.021279,0.122567,0.013715,-0.076298,-0.009738,-0.027242,2
597,-0.067318,0.111115,-0.009719,-0.082606,0.054640,-0.103232,-0.002115,0.021089,0.034756,-0.104129,...,0.019006,-0.062294,-0.049686,0.042695,0.105910,0.013915,-0.079819,-0.008745,0.007531,2
598,-0.064715,0.088847,-0.013383,-0.087596,0.054902,-0.089861,-0.038170,0.046950,-0.015348,-0.094655,...,0.003160,-0.061830,-0.096422,-0.021497,0.139260,0.059158,-0.117424,-0.034884,-0.042829,3


In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# Features are every column except the last one, which holds the target.
feature_frame = data.iloc[:, :-1]

# The CSV carries an "Unnamed: 0" column (visible in the frame displayed above).
# It is a running row counter, presumably the index written by an earlier
# to_csv call, not an embedding dimension. Leaving it in X would feed the row
# position to every model, so drop any such column before modelling.
index_like_columns = [col for col in feature_frame.columns if str(col).startswith("Unnamed")]
X = feature_frame.drop(columns=index_like_columns)  # Features
y = data.iloc[:, -1]                                # Target (last column)

# Split the data; stratify=y keeps the class proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: statistics are learned on the training split only and
# then re-used to transform the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models to test
# Maps a display name to an unfitted estimator; random_state is pinned wherever
# the estimator accepts one so repeated runs give the same comparison.
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    # NOTE(review): probability=True is not needed for the accuracy scoring used
    # in this cell; kept in case later cells call predict_proba.
    "Support Vector Machine": SVC(kernel='rbf', probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
    # The target shown in the frame above takes the values 0-3, i.e. it is
    # multiclass, so use the multiclass log-loss metric instead of the binary
    # 'logloss'.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
}

# Cross-validate every candidate on the training split only
results = []
for name, model in models.items():
    # Scaling is a pipeline step, so it is handled inside cross-validation
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])
    scores = cross_val_score(
        estimator=pipeline,
        X=X_train,
        y=y_train,
        scoring='accuracy',
        cv=5,
    )
    # One row per model: (name, mean fold accuracy, std of fold accuracy)
    results.append((name, scores.mean(), scores.std()))

# Summarise as a table and print it with the best mean accuracy first
results_df = pd.DataFrame(results, columns=["Model", "Mean Accuracy", "Std Deviation"])
ranked_results = results_df.sort_values("Mean Accuracy", ascending=False)
print(ranked_results)

# Optional: Train and test the best model
# `results` already holds one (name, mean accuracy, std) tuple per model from
# the cross-validation loop, so the winner is looked up there instead of
# re-running 5-fold CV for every candidate a second time.
# max() returns the first of any tied entries and `results` follows the
# iteration order of `models`, so ties resolve to the earliest model in the dict.
best_result = max(results, key=lambda row: row[1])
best_model_name = best_result[0]
best_model = models[best_model_name]

# Fit on the scaled training split, then score once on the held-out test split
best_model.fit(X_train_scaled, y_train)
y_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model: {best_model_name} with Test Accuracy: {test_accuracy:.4f}")
