In [241]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings

## Data gathering

In [242]:
# Data gathering
df_label = pd.read_csv('Credit_card_label.csv')
df_features = pd.read_csv('Credit_card.csv')
df = pd.merge(df_label, df_features, on='Ind_ID')

df.shape

(1548, 19)

## Data cleaning

### Drop row with missing data

In [243]:
print('Before')
print(df.isnull().sum())
df = df.dropna()
print('\nAfter')
print(df.isnull().sum())

Before
Ind_ID               0
label                0
GENDER               7
Car_Owner            0
Propert_Owner        0
CHILDREN             0
Annual_income       23
Type_Income          0
EDUCATION            0
Marital_status       0
Housing_type         0
Birthday_count      22
Employed_days        0
Mobile_phone         0
Work_Phone           0
Phone                0
EMAIL_ID             0
Type_Occupation    488
Family_Members       0
dtype: int64

After
Ind_ID             0
label              0
GENDER             0
Car_Owner          0
Propert_Owner      0
CHILDREN           0
Annual_income      0
Type_Income        0
EDUCATION          0
Marital_status     0
Housing_type       0
Birthday_count     0
Employed_days      0
Mobile_phone       0
Work_Phone         0
Phone              0
EMAIL_ID           0
Type_Occupation    0
Family_Members     0
dtype: int64


### Drop unrelated features

In [244]:
df = df.drop(['Ind_ID', 'Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID'], axis=1)
df.columns

Index(['label', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN',
       'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status',
       'Housing_type', 'Birthday_count', 'Employed_days', 'Type_Occupation',
       'Family_Members'],
      dtype='object')

### Drop duplicated feature values

In [245]:
print(f'Duplicates before : {df.duplicated().sum()}')
df = df.drop_duplicates().reset_index().drop('index', axis=1)
print(f'Duplicates after : {df.duplicated().sum()}')
df.shape

Duplicates before : 112
Duplicates after : 0


(913, 14)

## Preprocessing

### Encode non-numeric values

In [246]:
non_numeric_col = list((set(df.columns)-set(df.describe().columns)))
non_numeric_col

['Car_Owner',
 'Type_Income',
 'Housing_type',
 'Propert_Owner',
 'EDUCATION',
 'Type_Occupation',
 'Marital_status',
 'GENDER']

In [247]:
le = LabelEncoder()
for x in df[non_numeric_col]:
    df[x] = le.fit_transform(df[x])
    print(dict(zip(le.classes_, le.transform(le.classes_))))

{'N': 0, 'Y': 1}
{'Commercial associate': 0, 'Pensioner': 1, 'State servant': 2, 'Working': 3}
{'Co-op apartment': 0, 'House / apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'With parents': 5}
{'N': 0, 'Y': 1}
{'Higher education': 0, 'Incomplete higher': 1, 'Lower secondary': 2, 'Secondary / secondary special': 3}
{'Accountants': 0, 'Cleaning staff': 1, 'Cooking staff': 2, 'Core staff': 3, 'Drivers': 4, 'HR staff': 5, 'High skill tech staff': 6, 'IT staff': 7, 'Laborers': 8, 'Low-skill Laborers': 9, 'Managers': 10, 'Medicine staff': 11, 'Private service staff': 12, 'Realty agents': 13, 'Sales staff': 14, 'Secretaries': 15, 'Security staff': 16, 'Waiters/barmen staff': 17}
{'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
{'F': 0, 'M': 1}


### Split the dataset for training and testing

In [248]:
y = df.pop('label')
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Training Model

### Train the SGDClassifier with default params 

In [249]:
# Create and train model
model = SGDClassifier(random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Importance of each feature
coefficients = model.coef_
feature_names = X_train.columns
for feature, importance in zip(feature_names, coefficients[0]):
    print(f"{feature}: {importance:.4f}")

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       245
           1       0.00      0.00      0.00        29

    accuracy                           0.89       274
   macro avg       0.45      0.50      0.47       274
weighted avg       0.80      0.89      0.84       274

GENDER: 55.3221
Car_Owner: 9.6993
Propert_Owner: -6.8254
CHILDREN: -18.6802
Annual_income: -57032.0078
Type_Income: -523.4041
EDUCATION: -54.9628
Marital_status: 134.7128
Housing_type: -149.4414
Birthday_count: -6207.9247
Employed_days: 443165.9302
Type_Occupation: -929.3386
Family_Members: -141.5382


In [250]:
warnings.filterwarnings('ignore')

# Create and train model
model = SGDClassifier(random_state=42)

# Create a GridSearchCV object
params = {
    'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber',
             'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l1', 'elasticnet', None],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'shuffle': [True, False],
}

grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and corresponding accuracy
print('Best parameters: ', grid_search.best_params_)
print('Best accuracy: {:.2f}'.format(grid_search.best_score_))

Best parameters:  {'alpha': 0.001, 'learning_rate': 'optimal', 'loss': 'hinge', 'penalty': 'elasticnet', 'shuffle': True}
Best accuracy: 0.92


In [251]:
# # Evaluate the accuracy with the GridSearchCV best model
model_best = grid_search.best_estimator_
y_pred = model_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Importance of each feature
coefficients = model_best.coef_
feature_names = X_train.columns
for feature, importance in zip(feature_names, coefficients[0]):
    print(f"{feature}: {importance:.4f}")

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       245
           1       0.00      0.00      0.00        29

    accuracy                           0.89       274
   macro avg       0.45      0.50      0.47       274
weighted avg       0.80      0.89      0.84       274

GENDER: 6.1813
Car_Owner: 0.6041
Propert_Owner: 0.0000
CHILDREN: -2.0477
Annual_income: -1508.9270
Type_Income: -59.1983
EDUCATION: -4.5223
Marital_status: 17.3878
Housing_type: -21.5410
Birthday_count: -4074.7790
Employed_days: 46690.6630
Type_Occupation: -113.5839
Family_Members: -16.2141


### Q5 Use GridSearchCV to tune the parameter of each of the above models. Can you obtain better results in this step for any of the models? Discuss your observations.

No. The testing accuracy given by the models trained with default parameters and GridSearchCV best parameters are almost the same in this dataset.

The GridSearchCV is suggesting below parameters:
```python
{
    'alpha': 0.001,
    'learning_rate': 'optimal',
    'loss': 'hinge',
    'penalty': 'elasticnet',
    'shuffle': True
}
```

From the (SGDClassifier document)[https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html], we can see that the default parameters are:

```python
{
    'alpha': 0.0001,
    'learning_rate': 'optimal',
    'loss': 'hinge',
    'penalty': 'l2',
    'shuffle': True
}
```

For SGDClassifier, `learning_rate` and `loss` are 2 important factors, since they are the same in the 2 models, it is not suprise they are providing similar accuracy.



### Q6 Randomly (or based on certain hypothesis) remove some features and re-evaluate the models. Document your observations with respect to models performances.


From the coefficients report, we found that `Employed_days`, `Birthday_count` and `Annual_income` are having the highest values. I am going to drop these features to see what will happen.

In [252]:
drop_features = ['Annual_income', 'Type_Occupation', 'Employed_days', 'Propert_Owner', 'Type_Income', 'GENDER', 'Birthday_count']
X_train_drop = X_train.drop(drop_features, axis=1)
X_test_drop = X_test.drop(drop_features, axis=1)
X_train_drop.columns

Index(['Car_Owner', 'CHILDREN', 'EDUCATION', 'Marital_status', 'Housing_type',
       'Family_Members'],
      dtype='object')

In [253]:
model = SGDClassifier(random_state=42)
grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
grid_search.fit(X_train_drop, y_train)

# Print the best parameters and corresponding accuracy
print('Best parameters: ', grid_search.best_params_)
print('Best accuracy: {:.2f}'.format(grid_search.best_score_))

Best parameters:  {'alpha': 0.0001, 'learning_rate': 'optimal', 'loss': 'modified_huber', 'penalty': 'l1', 'shuffle': True}
Best accuracy: 0.92


In [257]:
# # Evaluate the accuracy with the GridSearchCV best model
model_drop_best = grid_search.best_estimator_
y_pred = model_drop_best.predict(X_test_drop)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Importance of each feature
coefficients = model_drop_best.coef_
feature_names = X_train_drop.columns
for feature, importance in zip(feature_names, coefficients[0]):
    print(f"{feature}: {importance:.4f}")



Accuracy: 0.91

Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       245
           1       1.00      0.10      0.19        29

    accuracy                           0.91       274
   macro avg       0.95      0.55      0.57       274
weighted avg       0.91      0.91      0.87       274

Car_Owner: 0.6935
CHILDREN: -29.8752
EDUCATION: -1.0647
Marital_status: 10.9543
Housing_type: 0.0000
Family_Members: 28.5413


Surprisingly, the test accuracy of the new model improved from 0.89 to 0.91.