In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [15]:
data = pd.read_csv('scaled_PCA_roberta.csv') 
X = data.iloc[:, :-1].values  
Y = data.iloc[:, -1].values   

In [16]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

In [17]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

In [21]:
model = RandomForestClassifier(random_state=42)

In [22]:
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid_rf,  # Use param_grid_svm for SVM
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1
)

In [24]:
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END m

In [25]:
best_model = grid_search.best_estimator_

In [26]:
y_pred_grid = best_model.predict(X_test)

In [27]:
y_pred_grid

array([1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 3, 1, 1, 3, 1,
       2, 3, 2, 1, 1, 1, 1, 2, 3, 1, 1, 2, 1, 1, 0, 2, 1, 1, 2, 1, 2, 1,
       1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 0, 1, 1, 1, 3, 1, 3, 2, 1, 2, 2, 3, 2, 2, 2, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 3, 2,
       1, 2, 2, 1, 1, 2, 2, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3, 1, 1, 3, 1, 2, 2,
       2, 1, 1, 1, 2, 0, 1, 2, 1, 0, 1, 1, 3, 1, 1, 2, 2, 1, 1, 2, 1, 1,
       2, 1, 2, 1])

In [28]:
print(accuracy_score(y_pred_grid,Y_test)*100,"%")

33.33333333333333 %


In [30]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Define the model
model = SVC(kernel='rbf',C=1.0,gamma='scale',random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'C': np.logspace(-3, 3, 7),  # Log-scale range for C
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Values for gamma
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']  # Different kernel types
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                   n_iter=100, cv=5, verbose=1, random_state=42, n_jobs=-1)

# Fit the model with random search
random_search.fit(X_train, Y_train)

# Print the best parameters and score
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation score: {random_search.best_score_:.4f}")

# Evaluate the model on the test set
y_pre_random_cv = random_search.predict(X_test)
accuracy = accuracy_score(Y_test, y_pre_random_cv)
print(f"Test set accuracy: {accuracy:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'kernel': 'rbf', 'gamma': 'scale', 'C': 1.0}
Best cross-validation score: 0.3333
Test set accuracy: 0.3833
