k-Fold Cross-Validation (another way to evaluate the model, beyond the confusion matrix)

In [3]:
# We will use the SVM model we created in another file

from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Load the Social Network Ads dataset.
# NOTE(review): this is a machine-specific absolute path; the notebook will only run
# where this exact file exists. Consider a path relative to the notebook instead.
df = pd.read_csv(r'C:\Users\Stelios_Ntanavaras\Documents\Python Scripts\Machine-Learning-A-Z-AI-Python-R-ChatGPT-Prize-2024-\Machine Learning\Classification\Social_Network_Ads.csv')
df.head()  # not rendered here: only the last expression of a cell is displayed

# Split the dataset into X (every column except the last) and y (the last column)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Splitting the dataset into Training and Testing set (25% held out, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# '.fit' learns the mean and std of each training column; '.transform' applies the standardisation
X_train = sc.fit_transform(X_train)
# The test set must be scaled with the statistics learned from the TRAINING set only.
# Calling fit_transform here would re-fit the scaler on the test data, so the test
# features would be standardised differently from the data the model was trained on
# and test-set statistics would leak into the preprocessing.
X_test = sc.transform(X_test)

# Train the model (linear-kernel SVM on the scaled training data)
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

In [5]:
# Evaluate the classifier with 10-fold cross-validation on the training set.
# The result is an array holding one score per fold.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
)
print(accuracies)

# Tip: for a big dataset, pass the extra argument 'n_jobs' set to -1
# so that the folds are evaluated using all available CPUs.

[0.76666667 0.8        0.73333333 0.83333333 0.73333333 0.66666667
 0.83333333 0.93333333 0.96666667 0.86666667]


In [7]:
# Summarise the cross-validation run with the mean of the per-fold accuracies
avg = np.mean(accuracies)

print('The average of all accuracies is:', avg)

The average of all accuracies is: 0.8133333333333335


Hyperparameter Tuning

In [8]:
# Grid Search: try every hyperparameter combination and keep the best cross-validated one

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters, one dictionary per kernel family.
# The linear kernel only tunes C; the RBF kernel tunes C together with gamma.
linear_grid = {
    'C': [0.25, 0.5, 0.75, 1],
    'kernel': ['linear'],
}
rbf_grid = {
    'C': [0.25, 0.5, 0.75, 1],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
parameters = [linear_grid, rbf_grid]

# The Grid Search object: accuracy scoring, 10-fold CV, all CPUs (n_jobs = -1)
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# Fit once per combination in the grid, on the training set
grid_search.fit(X_train, y_train)

# Report the best mean cross-validated score and the settings that produced it
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 90.67 %
Best Parameters: {'C': 0.5, 'gamma': 0.6, 'kernel': 'rbf'}
