# Keep-it-dry! RidgeClassifier v2
Dataset: ki_ro_ros.csv.
Introducing the use of PCA with multiple n_components

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifier

from pprint import pprint

## Data Read

In [22]:
df_train = pd.read_csv('./../Preprocessing/kid_train_ki_ro_ros.csv', index_col=0)
df_test = pd.read_csv('./../Preprocessing/kid_test_ki_ro.csv', index_col=0)

In [23]:
df_test_id = df_test[['id']]

In [24]:
df_test.drop(columns=['id', 'product_code', 'product_code_F', 'product_code_G', 'product_code_H', 'product_code_I'], inplace=True)

In [14]:
# Scale
# Scaled already

In [26]:
X_df = df_train[df_train.columns[:-1]]
y_df = df_train[df_train.columns[-1]]

## PCA

In [29]:
n_components_range = range(10, 23)  # Explore components from 10 to 22

best_n_components = None
best_score = -np.inf  # Initialize with a negative infinity

for n_components in n_components_range:
  # Apply PCA with current n_components
  pca = PCA(n_components=n_components)
  pca_data = pca.fit_transform(X_df)

  # Train RidgeClassifier on transformed data
  model = RidgeClassifier()
  model.fit(pca_data, y_df)

  # Evaluate performance on testing set (e.g., using F1 score)
  score = model.score(pca.transform(X_df), y_df)
  print(f"{n_components} components score: {score}")

  # Update best component and score if performance improves
  if score > best_score:
    best_n_components = n_components
    best_score = score

print("Best number of components based on RidgeClassifier performance:", best_n_components)

10 components score: 0.5624253142775202
11 components score: 0.5644328664977774
12 components score: 0.5647913579656804
13 components score: 0.5637158835619712
14 components score: 0.5634529898188423
15 components score: 0.5631900960757134
16 components score: 0.5672052005162277
17 components score: 0.5668467090483247
18 components score: 0.5665121170116151
19 components score: 0.566105826681325
20 components score: 0.5663687204244539
21 components score: 0.566727211892357
22 components score: 0.5650064528464223
Best number of components based on RidgeClassifier performance: 16


In [33]:
pca = PCA(n_components=16)
X_df_pca = pca.fit_transform(X_df)
df_test_pca = pca.transform(df_test)

## Hyperparameter Tuning using GridSearchCV

In [48]:
X_train, X_val, y_train, y_val = train_test_split(X_df_pca, y_df, test_size=.2, random_state=42, stratify=y_df)

In [49]:
# Define model
model = RidgeClassifier() 

# Define hyperparameter grid
param_grid = {
    'alpha': np.logspace(-2, 2, 5),  # Search across different alpha values in log space
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

# Define GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='f1')  # Use F1 score for imbalanced data


In [50]:
# GridSearchCV fitting
grid_search.fit(X_train, y_train)

50 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py", line 1424, in fit
    super().fit(X, Y, sample_weight=sample_weight)
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py", line 825, in fit
    raise ValueError(
ValueError: 'lbfgs' solver can be used only when positive=True. Please use another solver.

 0.53885313        nan 0.5389

In [51]:
rc_best = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best f1-score: {grid_search.best_score_}")

Best parameters: {'alpha': 1.0, 'solver': 'lsqr'}
Best f1-score: 0.539191372954025


## Prediction

In [39]:
y_pred = rc_best.predict(df_test_pca)
y_pred_df = pd.DataFrame(y_pred, columns=['failure'])

In [41]:
y_submission = pd.concat([df_test_id, y_pred_df], axis=1)

In [42]:
y_submission.head()

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,1.0


In [43]:
y_submission.to_csv('./../Submission/kid_submission_rc_pca.csv', index=False)