# Keep-it-dry! LinearSVC v2 Optimization with Optuna
Dataset: ki_ro_ros.csv

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
# from sklearnex import patch_sklearn

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

from pprint import pprint

## Data Read

In [2]:
df_train = pd.read_csv('./../Preprocessing/kid_train_ki_ro_ros.csv', index_col=0)
df_test = pd.read_csv('./../Preprocessing/kid_test_ki_ro.csv', index_col=0)

In [3]:
df_train.head()

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,-0.866789,7.0,8.0,9,5,0.0,0.0,-0.5,0.191386,0.587838,...,-0.582308,-0.155224,-0.823018,1.817328,-0.416608,-0.826421,-0.996926,-0.798281,0.405153,0.0
1,-0.768774,7.0,8.0,9,5,1.166667,-0.833333,-0.75,0.320974,-0.146396,...,0.783846,1.031983,-0.658824,0.022965,-0.627395,-0.310259,-0.299693,-0.369968,-0.127292,0.0
2,-0.819112,7.0,8.0,9,5,0.833333,-1.166667,-0.25,0.20412,-0.062312,...,0.989231,-0.298507,-0.298414,1.089248,0.777147,1.355888,-0.453893,0.677069,-0.248529,0.0
3,-0.437692,7.0,8.0,9,5,1.0,-1.0,0.0,-0.366667,-0.410661,...,0.801538,0.121677,-0.422506,-0.882568,-0.259759,-0.23906,0.601434,0.327001,0.808704,0.0
4,1.342337,7.0,8.0,9,5,0.333333,-1.0,0.5,1.169663,0.912162,...,-0.84,0.540156,0.37289,0.374217,0.401703,-1.69525,-0.935963,-0.016735,-0.790371,0.0


In [4]:
df_test_id = df_test[['id']]
df_test.drop(columns=['id', 'product_code', 'product_code_F', 'product_code_G', 'product_code_H', 'product_code_I'], inplace=True)
df_test.head()

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,-0.059136,5.0,6.0,6,4,-0.166667,0.166667,0.0,1.138951,-1.168919,...,-0.274792,-0.482308,-0.126795,-0.57954,1.075678,-1.388928,0.417316,-0.634221,0.57033,-0.435202
1,-0.183139,5.0,6.0,6,4,0.666667,0.0,-1.5,0.073783,0.144144,...,0.265708,0.463846,-1.213362,-0.441432,0.397704,1.314407,0.356512,-0.104508,-1.061511,-1.068447
2,-0.210763,5.0,6.0,6,4,0.166667,0.666667,-0.5,0.517228,-0.941441,...,-0.940954,0.241538,0.519687,-0.571355,-0.419624,-1.599006,-0.148114,1.068135,-0.193578,-0.276961
3,-0.199304,5.0,6.0,6,4,0.166667,0.5,1.0,-0.948689,-0.635886,...,-0.052233,0.277692,1.155935,-1.554987,-0.406054,-0.081618,-0.184492,-1.209016,-0.152872,-0.696814
4,1.750358,5.0,6.0,6,4,1.166667,1.333333,0.5,0.017603,0.719219,...,0.093868,0.723846,-0.883582,-0.692583,0.120042,0.322214,0.083671,-0.848361,0.317051,0.644913


In [5]:
# Scale
# Scaled already

In [6]:
X_df = df_train[df_train.columns[:-1]]
y_df = df_train[df_train.columns[-1]]

## PCA

In [7]:
n_components_range = range(10, 23)  # Explore components from 10 to 22

best_n_components = None
best_score = -np.inf  # Initialize with a negative infinity

for n_components in n_components_range:
  # Apply PCA with current n_components
  pca = PCA(n_components=n_components)
  pca_data = pca.fit_transform(X_df)

  # Train linearsvc on transformed data
  model = LinearSVC()
  model.fit(pca_data, y_df)

  # Evaluate performance on testing set (e.g., using F1 score)
  score = model.score(pca.transform(X_df), y_df)
  print(f"{n_components} components score: {score}")

  # Update best component and score if performance improves
  if score > best_score:
    best_n_components = n_components
    best_score = score

print("Best number of components based on LinearSVC performance:", best_n_components)



10 components score: 0.5628077051766168




11 components score: 0.5647196596720998




12 components score: 0.5647913579656804




13 components score: 0.5638353807179389




14 components score: 0.5634529898188423




15 components score: 0.563213995506907




16 components score: 0.5672052005162277




17 components score: 0.5668228096171312




18 components score: 0.5665360164428087




19 components score: 0.5660819272501314




20 components score: 0.5661775249749056




21 components score: 0.566775010754744
22 components score: 0.5650781511400028
Best number of components based on LinearSVC performance: 16




In [7]:
pca = PCA(n_components=16)
X_df_pca = pca.fit_transform(X_df)
df_test_pca = pca.transform(df_test)

## Hyperparameter Tuning using Optuna

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_df_pca, y_df, test_size=.75, random_state=42, stratify=y_df)

In [18]:
# Define model objective function
def objective(trial):
    penalty = trial.suggest_categorical("penalty", ["l2"])
    loss = trial.suggest_categorical("loss", ["squared_hinge", "hinge"])
    C = trial.suggest_loguniform("C", 1e-3, 1e3)

    model = LinearSVC()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred)
    return score

# Create Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_model = LinearSVC(**best_params)

print("Best parameters: ", best_params)
print("Best model: ", best_model)

[I 2024-05-12 22:35:20,123] A new study created in memory with name: no-name-d9849ce0-9426-4739-8270-acf3f674ccc2
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2024-05-12 22:35:20,564] Trial 0 finished with value: 0.5398517922408879 and parameters: {'penalty': 'l2', 'loss': 'hinge', 'C': 0.0038878058341907922}. Best is trial 0 with value: 0.5398517922408879.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2024-05-12 22:35:21,041] Trial 1 finished with value: 0.5398517922408879 and parameters: {'penalty': 'l2', 'loss': 'hinge', 'C': 0.011893763808013526}. Best is trial 0 with value: 0.5398517922408879.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2024-05-12 22:35:21,609] Trial 2 finished with value: 0.5398517922408879 and parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 22.168525109923703}. Best is trial 0 with value: 0.5398517922408879.
  C = trial.suggest_loguniform("C", 1e-3, 1e3)
[I 2024-05-12 22:35:22,130] Trial 3 finished with value: 0.5398336909871244 and pa

Best parameters:  {'penalty': 'l2', 'loss': 'hinge', 'C': 0.03258763709052359}
Best model:  LinearSVC(C=0.03258763709052359, loss='hinge')


## Prediction

In [19]:
best_model.fit(X_train, y_train)
y_pred = best_model.predict(df_test_pca)
y_pred_df = pd.DataFrame(y_pred, columns=['failure'])



In [20]:
y_submission = pd.concat([df_test_id, y_pred_df], axis=1)

In [21]:
y_submission.head()

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,1.0


In [22]:
y_submission.to_csv('./../Submission/kid_submission_linearsvc_v3.csv', index=False)