In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

In [41]:
features_df = pd.read_csv("https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card.csv")


In [23]:
#Find missing values and impute them
string_imputer = SimpleImputer(strategy='constant', fill_value='missing')
features_df_imputed = pd.DataFrame(string_imputer.fit_transform(features_df), columns=features_df.columns)
string_imputer.fit(features_df)
missing_values_after_imputation = features_df_imputed.isna().sum().sum()
print(features_df_imputed)

       Ind_ID GENDER Car_Owner Propert_Owner CHILDREN Annual_income  \
0     5008827      M         Y             Y        0      180000.0   
1     5009744      F         Y             N        0      315000.0   
2     5009746      F         Y             N        0      315000.0   
3     5009749      F         Y             N        0       missing   
4     5009752      F         Y             N        0      315000.0   
...       ...    ...       ...           ...      ...           ...   
1543  5028645      F         N             Y        0       missing   
1544  5023655      F         N             N        0      225000.0   
1545  5115992      M         Y             Y        2      180000.0   
1546  5118219      M         Y             N        0      270000.0   
1547  5053790      F         Y             Y        0      225000.0   

               Type_Income                      EDUCATION  \
0                Pensioner               Higher education   
1     Commercial associat

In [24]:
#check and print if there are any missing values
if missing_values_after_imputation == 0:
    print("Sanity Check Passed: No missing values after imputation in features_df.")
else:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in features_df after imputation.")


Sanity Check Passed: No missing values after imputation in features_df.


In [40]:
target_df = pd.read_csv("https://raw.githubusercontent.com/pujan08/ML_Logistic_Regression_For_Credit_Card/main/Credit_card_label.csv")

In [31]:
#Find missing values and impute them
numeric_imputer = SimpleImputer(strategy='mean')
target_df_imputed = pd.DataFrame(numeric_imputer.fit_transform(target_df), columns=target_df.columns)
numeric_imputer.fit(target_df)
print(target_df_imputed)

         Ind_ID  label
0     5008827.0    1.0
1     5009744.0    1.0
2     5009746.0    1.0
3     5009749.0    1.0
4     5009752.0    1.0
...         ...    ...
1543  5028645.0    0.0
1544  5023655.0    0.0
1545  5115992.0    0.0
1546  5118219.0    0.0
1547  5053790.0    0.0

[1548 rows x 2 columns]


In [32]:
#check and print if there are any missing values
missing_values_after_imputation = target_df_imputed.isna().sum().sum()
if missing_values_after_imputation == 0:
    print("Sanity Check Passed: No missing values after imputation in target_df.")
else:
    print(f"Sanity Check Failed: There are still {missing_values_after_imputation} missing values in target_df after imputation.")


Sanity Check Passed: No missing values after imputation in target_df.


In [4]:
merged_df = pd.merge(features_df, target_df, on='Ind_ID', how='inner')

In [8]:
y = merged_df['label']
X = pd.get_dummies(X, columns=['Car_Owner','Propert_Owner','Annual_income','Employed_days','Family_Members'])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
model = LogisticRegression()
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['liblinear']
}

In [36]:
# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)


In [37]:
# Display the best parameters and corresponding mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Mean Accuracy:", grid_search.best_score_)


Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Mean Accuracy: 0.8966142092203213


In [38]:
# Make predictions on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)


In [39]:
# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9096774193548387
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       280
           1       0.58      0.23      0.33        30

    accuracy                           0.91       310
   macro avg       0.75      0.61      0.64       310
weighted avg       0.89      0.91      0.89       310

