# Keep-it-dry! GaussianNB
Dataset: ki_ro_ros.csv.
Introducing the use of PCA with multiple n_components

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB

from pprint import pprint

## Data Read

In [2]:
df_train = pd.read_csv('./../Preprocessing/kid_train_ki_ro_ros.csv', index_col=0)
df_test = pd.read_csv('./../Preprocessing/kid_test_ki_ro.csv', index_col=0)

In [3]:
df_test_id = df_test[['id']]

In [4]:
df_test.drop(columns=['id', 'product_code', 'product_code_F', 'product_code_G', 'product_code_H', 'product_code_I'], inplace=True)

In [5]:
# Scale
# Scaled already

In [6]:
X_df = df_train[df_train.columns[:-1]]
y_df = df_train[df_train.columns[-1]]

## PCA

In [17]:
n_components_range = range(10, 23)  # Explore components from 10 to 22

best_n_components = None
best_score = -np.inf  # Initialize with a negative infinity

for n_components in n_components_range:
  # Apply PCA with current n_components
  pca = PCA(n_components=n_components)
  pca_data = pca.fit_transform(X_df)

  # Train RidgeClassifier on transformed data
  model = GaussianNB(var_smoothing=0.001)
  model.fit(pca_data, y_df)

  # Evaluate performance on testing set (e.g., using F1 score)
  score = model.score(pca.transform(X_df), y_df)
  print(f"{n_components} components score: {score}")

  # Update best component and score if performance improves
  if score > best_score:
    best_n_components = n_components
    best_score = score

print("Best number of components based on GaussianNB performance:", best_n_components)

10 components score: 0.5602265666077147
11 components score: 0.5581951149562641
12 components score: 0.5588881984608767
13 components score: 0.560107069451747
14 components score: 0.5597724774150376
15 components score: 0.560107069451747
16 components score: 0.5605611586444242
17 components score: 0.5626165097270684
18 components score: 0.5642655704794226
19 components score: 0.5631422972133263
20 components score: 0.564504564791358
21 components score: 0.5635963864060035
22 components score: 0.5605850580756178
Best number of components based on GaussianNB performance: 20


In [18]:
pca = PCA(n_components=20)
X_df_pca = pca.fit_transform(X_df)
df_test_pca = pca.transform(df_test)

## Hyperparameter Tuning using GridSearchCV

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_df_pca, y_df, test_size=.2, random_state=42, stratify=y_df)

In [20]:
# Define model
model = GaussianNB()

# Define hyperparameter grid
param_grid = {
    'var_smoothing': np.logspace(-9, -3, 5)  # Explore different smoothing factors
}

# Define GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')  # Use F1 score

In [21]:
# GridSearchCV fitting
grid_search.fit(X_train, y_train)

In [22]:
rc_best = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best f1-score: {grid_search.best_score_}")

Best parameters: {'var_smoothing': 0.001}
Best f1-score: 0.5047705553049663


## Prediction

In [23]:
y_pred = rc_best.predict(df_test_pca)
y_pred_df = pd.DataFrame(y_pred, columns=['failure'])

In [24]:
y_submission = pd.concat([df_test_id, y_pred_df], axis=1)

In [25]:
y_submission.head()

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,1.0


In [26]:
y_submission.to_csv('./../Submission/kid_submission_gaussiannb_pca_v2.csv', index=False)