# Keep-it-dry! LinearSVC
- Training dataset: preprocessedbankchurn_train.csv
- Testing dataset: preprocessedbankchurn_test.csv

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# from sklearnex import patch_sklearn

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

from pprint import pprint

## Data Read

In [30]:
# Load data
X_df = pd.read_csv('preprocessedbankchurn_train.csv')
X_df.drop(["Exited", "id", "CustomerId"], axis=1, inplace=True)
y_df = pd.read_csv('preprocessedbankchurn_train.csv')['Exited']
X_submission = pd.read_csv('preprocessedbankchurn_test.csv')
X_submission_id = X_submission['id'].copy()
X_submission.drop(['id', 'CustomerId'], axis=1, inplace=True)

In [35]:
X_df.shape

(165034, 15)

In [46]:
# Scale
scale = MinMaxScaler()
X_df = scale.fit_transform(X_df)
X_df = pd.DataFrame(X_df, columns=X_submission.columns)

In [47]:
X_df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Spain,Female,Male,ExitedfamilySize,NotExitedfamilySize
0,0.636,0.202703,0.3,0.0,0.333333,1.0,0.0,0.907279,1.0,0.0,0.0,0.0,1.0,0.318008,0.681992
1,0.554,0.202703,0.1,0.0,0.333333,1.0,1.0,0.247483,1.0,0.0,0.0,0.0,1.0,0.326667,0.673333
2,0.656,0.297297,1.0,0.0,0.333333,1.0,0.0,0.924364,1.0,0.0,0.0,0.0,1.0,0.222052,0.777948
3,0.462,0.216216,0.2,0.593398,0.0,1.0,1.0,0.422787,1.0,0.0,0.0,0.0,1.0,0.181991,0.818009
4,0.732,0.202703,0.5,0.0,0.333333,1.0,1.0,0.075293,0.0,0.0,1.0,0.0,1.0,0.146341,0.853659


In [48]:
X_submission.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'France', 'Germany', 'Spain',
       'Female', 'Male', 'ExitedfamilySize', 'NotExitedfamilySize'],
      dtype='object')

## PCA

In [50]:
n_components_range = range(10, 15)  # Explore components from 10 to 15

best_n_components = None
best_score = -np.inf  # Initialize with a negative infinity

for n_components in n_components_range:
  # Apply PCA with current n_components
  pca = PCA(n_components=n_components)
  pca_data = pca.fit_transform(X_df)

  # Train linearsvc on transformed data
  model = LinearSVC()
  model.fit(pca_data, y_df)

  # Evaluate performance on testing set (e.g., using F1 score)
  score = model.score(pca.transform(X_df), y_df)
  print(f"{n_components} components score: {score}")

  # Update best component and score if performance improves
  if score > best_score:
    best_n_components = n_components
    best_score = score

print("Best number of components based on LinearSVC performance:", best_n_components)

10 components score: 0.8107783850600482
11 components score: 0.8211277676115225
12 components score: 0.8370335809590751
13 components score: 0.8370335809590751
14 components score: 0.8370335809590751
Best number of components based on LinearSVC performance: 12


In [52]:
pca = PCA(n_components=13)
X_df_pca = pca.fit_transform(X_df)
X_submission_pca = pca.transform(X_submission)

## Hyperparameter Tuning using GridSearchCV

In [53]:
X_train, X_val, y_train, y_val = train_test_split(X_df_pca, y_df, test_size=.75, random_state=42, stratify=y_df)

In [59]:
# Define model
model = LinearSVC(max_iter=1000) 

# Define hyperparameter grid
param_grid = {
    'penalty':['l1', 'l2'],
    'loss':['squared_hinge', 'hinge'],
    'C': np.logspace(-2, 2, 10),  # Search for different regularization strengths
}

# Define GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='roc_auc') 

In [60]:
# GridSearchCV fitting
grid_search.fit(X_train, y_train)

200 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\RAYHAN EGAR\anaconda3\Lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
 

In [61]:
svc_best = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best roc_auc score: {grid_search.best_score_}")

Best parameters: {'C': 4.6415888336127775, 'loss': 'squared_hinge', 'penalty': 'l2'}
Best roc_auc score: 0.8359092629754322


## Prediction

In [65]:
y_submission = svc_best.predict(X_submission_pca)
print(y_submission)

# Prepare submission dictionary
submission_dict = {'id': X_submission_id, 'Exited': y_submission}
submission_df = pd.DataFrame(submission_dict)
submission_df

[1 1 1 ... 1 1 0]


Unnamed: 0,id,Exited
0,165034,1
1,165035,1
2,165036,1
3,165037,1
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,1
110021,275055,1


In [66]:
submission_df.head()

Unnamed: 0,id,Exited
0,165034,1
1,165035,1
2,165036,1
3,165037,1
4,165038,0


In [67]:
submission_df.to_csv('bankchurn_submission_linearsvc.csv', index=False)