In [6]:
import os
import pandas as pd
import numpy as np

In [8]:
import os
import pandas as pd

# The CSV is addressed relative to this notebook: one directory up
# (out of 'scripts/'), then into the sibling 'data/' folder.
path_parts = ("..", "data", "AER_credit_card_data.csv")
file_path = os.path.join(*path_parts)

# Load the raw data set into a DataFrame.
df = pd.read_csv(file_path)

### Why logistic regression

Logistic regression is used for binary classification (e.g., default vs. no default).

It estimates the probability of the positive class and assigns a class (0 or 1) by comparing that probability to a threshold (e.g., 0.5).

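Its fitted coefficients can be interpreted as feature importance: the sign and magnitude of each coefficient show how that feature shifts the predicted log-odds.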

In [9]:
## Clean the data

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Clean the raw frame, build the feature matrix / target, split into
# train and test sets, and standardize the continuous features.
#
# NOTE: this cell mutates `df` (yes/no strings are mapped to 1/0 and the
# 'reports' column is dropped), so it is not idempotent. Re-run the cell
# that loads `df` before re-running this one.

# Convert categorical values ('yes' → 1, 'no' → 0)
df['card'] = df['card'].map({'yes': 1, 'no': 0})
df['owner'] = df['owner'].map({'yes': 1, 'no': 0})
df['selfemp'] = df['selfemp'].map({'yes': 1, 'no': 0})

# Create the 'negative_reports' column: 1 if there is at least one report, else 0
df['negative_reports'] = np.where(df['reports'] >= 1, 1, 0)

# Drop the original 'reports' column (replaced by the binary flag above)
df = df.drop(columns=['reports'])

# Separate the target ('card') from the predictors.
# Previously X / y and the train/test split were never defined in the
# notebook, so this cell only ran when they were left over in the kernel.
X = df.drop(columns=['card'])
y = df['card']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): the original split parameters are not recorded in the notebook.
# test_size=0.2 matches the recorded 1055/264 row counts; random_state=42 is an
# assumption, so the exact rows in earlier recorded outputs may differ.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Identify binary categorical variables (left unscaled)
binary_columns = ['owner', 'selfemp', 'majorcards', 'negative_reports']

# Identify continuous variables (all others)
continuous_columns = [col for col in X.columns if col not in binary_columns]

# Initialize the scaler
scaler = StandardScaler()

# Scale only continuous variables; work on copies so the unscaled splits survive
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into training.
X_train_scaled[continuous_columns] = scaler.fit_transform(X_train[continuous_columns])
X_test_scaled[continuous_columns] = scaler.transform(X_test[continuous_columns])

# Convert to DataFrame for easier interpretation
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)

print(X_train_scaled_df.head())



           age    income     share  expenditure  owner  selfemp  dependents  \
598   1.045537  0.607896 -0.260383    -0.093756      1        0    0.777563   
1213 -0.493522  1.273808 -0.417862    -0.197316      1        0    1.569386   
209  -0.237011  0.264575 -0.383169    -0.311005      0        0   -0.806084   
538   0.408400 -1.016606 -0.714854    -0.700058      1        0    1.569386   
140   0.333930 -0.218889  0.456763     0.367486      0        0    0.777563   

        months  majorcards    active  negative_reports  
598   0.825741           1  0.500268                 0  
1213 -0.562893           1 -1.109315                 0  
209  -0.442142           0 -0.948357                 0  
538   0.976679           0  0.178351                 0  
140  -0.593081           1 -0.143566                 0  


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default-configured logistic regression on the scaled training features
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test_scaled)

# Score the predictions: per-class metrics and overall accuracy
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the evaluation
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_rep)


Model Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.89      0.90        62
           1       0.97      0.98      0.97       202

    accuracy                           0.95       264
   macro avg       0.94      0.93      0.94       264
weighted avg       0.95      0.95      0.95       264



In [16]:
# Class balance of the target in the training split, then in the test split
print(y_train.value_counts(), y_test.value_counts(), sep="\n")


card
1    821
0    234
Name: count, dtype: int64
card
1    202
0     62
Name: count, dtype: int64


In [17]:
# Get model coefficients, paired with their feature names and sorted from the
# most positive to the most negative.
# Feature names are read from X_train_scaled (the frame passed to model.fit)
# rather than from `X`, which is not defined anywhere in this notebook; this
# also guarantees the names line up with the columns of model.coef_.
coefficients = pd.DataFrame({
    "Feature": X_train_scaled.columns,
    "Coefficient": model.coef_[0]  # the single row of coef_ for this binary target
}).sort_values(by="Coefficient", ascending=False)

# Display feature importance
print(coefficients)


             Feature  Coefficient
2              share     5.937128
3        expenditure     5.417918
8         majorcards     0.528927
9             active     0.478437
4              owner     0.298819
1             income     0.091953
0                age     0.087303
5            selfemp     0.071127
7             months    -0.039601
6         dependents    -0.170967
10  negative_reports    -2.250995
