# Keep-it-dry! LogisticRegression
Dataset: ki_ro_ros.csv with PCA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from pprint import pprint

## Data Read

In [2]:
df_train = pd.read_csv('./../Preprocessing/kid_train_ki_ro_ros.csv', index_col=0)
df_test = pd.read_csv('./../Preprocessing/kid_test_ki_ro.csv', index_col=0)

In [3]:
df_train.head()

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,-0.866789,7.0,8.0,9,5,0.0,0.0,-0.5,0.191386,0.587838,...,-0.582308,-0.155224,-0.823018,1.817328,-0.416608,-0.826421,-0.996926,-0.798281,0.405153,0.0
1,-0.768774,7.0,8.0,9,5,1.166667,-0.833333,-0.75,0.320974,-0.146396,...,0.783846,1.031983,-0.658824,0.022965,-0.627395,-0.310259,-0.299693,-0.369968,-0.127292,0.0
2,-0.819112,7.0,8.0,9,5,0.833333,-1.166667,-0.25,0.20412,-0.062312,...,0.989231,-0.298507,-0.298414,1.089248,0.777147,1.355888,-0.453893,0.677069,-0.248529,0.0
3,-0.437692,7.0,8.0,9,5,1.0,-1.0,0.0,-0.366667,-0.410661,...,0.801538,0.121677,-0.422506,-0.882568,-0.259759,-0.23906,0.601434,0.327001,0.808704,0.0
4,1.342337,7.0,8.0,9,5,0.333333,-1.0,0.5,1.169663,0.912162,...,-0.84,0.540156,0.37289,0.374217,0.401703,-1.69525,-0.935963,-0.016735,-0.790371,0.0


In [4]:
df_test_id = df_test[['id']]
df_test.drop(columns=['id', 'product_code', 'product_code_F', 'product_code_G', 'product_code_H', 'product_code_I'], inplace=True)
df_test.head()

Unnamed: 0,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,-0.059136,5.0,6.0,6,4,-0.166667,0.166667,0.0,1.138951,-1.168919,...,-0.274792,-0.482308,-0.126795,-0.57954,1.075678,-1.388928,0.417316,-0.634221,0.57033,-0.435202
1,-0.183139,5.0,6.0,6,4,0.666667,0.0,-1.5,0.073783,0.144144,...,0.265708,0.463846,-1.213362,-0.441432,0.397704,1.314407,0.356512,-0.104508,-1.061511,-1.068447
2,-0.210763,5.0,6.0,6,4,0.166667,0.666667,-0.5,0.517228,-0.941441,...,-0.940954,0.241538,0.519687,-0.571355,-0.419624,-1.599006,-0.148114,1.068135,-0.193578,-0.276961
3,-0.199304,5.0,6.0,6,4,0.166667,0.5,1.0,-0.948689,-0.635886,...,-0.052233,0.277692,1.155935,-1.554987,-0.406054,-0.081618,-0.184492,-1.209016,-0.152872,-0.696814
4,1.750358,5.0,6.0,6,4,1.166667,1.333333,0.5,0.017603,0.719219,...,0.093868,0.723846,-0.883582,-0.692583,0.120042,0.322214,0.083671,-0.848361,0.317051,0.644913


In [6]:
X_df = df_train[df_train.columns[:-1]]
y_df = df_train[df_train.columns[-1]]

## PCA

In [7]:
n_components_range = range(10, 23)  # Explore components from 10 to 22

best_n_components = None
best_score = -np.inf  # Initialize with a negative infinity

for n_components in n_components_range:
  # Apply PCA with current n_components
  pca = PCA(n_components=n_components)
  pca_data = pca.fit_transform(X_df)

  # Train LogisticRegression on transformed data
  model = LogisticRegression()
  model.fit(pca_data, y_df)

  # Evaluate performance on testing set (e.g., using F1 score)
  score = model.score(pca.transform(X_df), y_df)
  print(f"{n_components} components score: {score}")

  # Update best component and score if performance improves
  if score > best_score:
    best_n_components = n_components
    best_score = score

print("Best number of components based on LogisticRegression performance:", best_n_components)

10 components score: 0.5630705989197458
11 components score: 0.5646001625161321
12 components score: 0.5645523636537451
13 components score: 0.563620285837197
14 components score: 0.5632378949381005
15 components score: 0.5640743750298743
16 components score: 0.5672290999474212
17 components score: 0.567037904497873
18 components score: 0.567037904497873
19 components score: 0.5656039386262607
20 components score: 0.5663687204244539
21 components score: 0.5667033124611635
22 components score: 0.565795134075809
Best number of components based on LogisticRegression performance: 16


In [8]:
pca = PCA(n_components=16)
X_df_pca = pca.fit_transform(X_df)
df_test_pca = pca.transform(df_test)

## Hyperparameter Tuning using GridSearchCV

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_df_pca, y_df, test_size=.75, random_state=42, stratify=y_df)

In [15]:
# Define model
model = LogisticRegression(max_iter=1000)

# Define hyperparameter grid
param_grid = {
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'penalty': ['l2', 'elasticnet'],
    'C': np.logspace(-4, 4, 5)  # Search across different C values in log space
}

# Define GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')  # Use F1 score for imbalanced data

In [16]:
# GridSearchCV fitting
grid_search.fit(X_train, y_train)

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 64, in _check_solver
    raise ValueError(
ValueError: Only 'saga' so

In [17]:
logreg_best = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best f1-score: {grid_search.best_score_}")

Best parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best f1-score: 0.5409015493720147


## Prediction

In [18]:
y_pred = logreg_best.predict(df_test_pca)
y_pred_df = pd.DataFrame(y_pred, columns=['failure'])

In [19]:
y_submission = pd.concat([df_test_id, y_pred_df], axis=1)

In [20]:
y_submission.head()

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,1.0


In [21]:
y_submission.to_csv('./../Submission/kid_submission_logreg_pca.csv', index=False)