## Load Necessary Libraries

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

## Read Data

In [32]:
df = pd.read_stata('lendingclub_train.dta')
# df.head()
# df.describe()
columns_to_drop = [
    "index",
    "depvar",
    "total_acc", 
    "out_prncp", 
    "out_prncp_inv", 
    "total_pymnt", 
    "total_pymnt_inv", 
    "total_rec_prncp", 
    "total_rec_int", 
    "total_rec_late_fee", 
    "recoveries", 
    "collection_recovery_fee",
    "last_pymnt_amnt", 
    # "last_fico_range_high", 
    # "last_fico_range_low", 
    # "tot_coll_amt", 
    # "tot_cur_bal", 
    "initial_list_status1", 
    "initial_list_status2", 
    "elapsed_t",
    "purpose1",
    "addr_state1",
    "elapsed_t",
    "debt_settlement_flag1",
    "term1",
    "mths_since_last_delinq1",
    "mths_since_last_major_derog1",
    "mths_since_last_record1",
    "mths_since_rcnt_il1",
    "mths_since_recent_bc1",
    "mths_since_recent_bc_dlq1",
    "mths_since_recent_inq1",
    "mths_since_recent_revol_delinq1"
]

issue_d_count = 1
while(issue_d_count <= 118):
    word_tmp = "issue_d" + str(issue_d_count)
    columns_to_drop.append(word_tmp)
    issue_d_count += 1


In [33]:
y = df['depvar']
X = df.drop(columns=columns_to_drop)
# X = df.drop(columns=['depvar','index'])

X.info(verbose=True,max_cols=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874335 entries, 0 to 874334
Data columns (total 188 columns):
 #    Column                            Dtype  
---   ------                            -----  
 0    loan_amnt                         int32  
 1    funded_amnt                       int32  
 2    funded_amnt_inv                   float64
 3    int_rate                          float64
 4    installment                       float64
 5    annual_inc                        float64
 6    dti                               float64
 7    delinq_2yrs                       int8   
 8    fico_range_low                    int16  
 9    fico_range_high                   int16  
 10   inq_last_6mths                    int8   
 11   open_acc                          int16  
 12   pub_rec                           int8   
 13   revol_bal                         int32  
 14   revol_util                        float64
 15   last_fico_range_high              int16  
 16   last_fico_range_lo

In [34]:
# Separate majority and minority classes
df_majority = df[df['depvar'] == 0]
df_minority = df[df['depvar'] == 1]

# Count number of instances in the minority class
minority_count = len(df_minority)

# Undersample the majority class
df_majority_undersampled = df_majority.sample(n=minority_count, random_state=42)

# Combine minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset
df_undersampled = df_undersampled.sample(frac=1, random_state=42)

# Now split into features and target, and then into training and testing sets
y = df_undersampled['depvar']
X = df_undersampled.drop(columns=columns_to_drop)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# X.info(verbose=True,max_cols=None)
# y.head()
X.head()


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,...,mths_since_recent_revol_delinq2,mths_since_recent_revol_delinq3,mths_since_recent_revol_delinq4,mths_since_recent_revol_delinq5,mths_since_recent_revol_delinq6,mths_since_recent_revol_delinq7,mths_since_recent_revol_delinq8,mths_since_recent_revol_delinq9,mths_since_recent_revol_delinq10,mths_since_recent_revol_delinq11
865911,21000,21000,21000.0,0.1144,461.22,61000.0,20.11,0,690,694,...,0,0,0,0,0,0,0,0,0,0
630306,1375,1375,1375.0,0.0649,42.14,35000.0,25.48,0,750,754,...,0,0,0,0,0,0,0,0,0,0
346254,9000,9000,9000.0,0.1212,299.45,33280.0,10.06,0,710,714,...,0,0,0,0,0,0,1,0,0,0
385471,2000,2000,2000.0,0.1699,71.3,60000.0,20.32,1,670,674,...,0,0,0,0,1,0,0,0,0,0
78336,11200,11200,11200.0,0.1499,388.2,32000.0,23.74,0,660,664,...,0,0,0,0,0,0,0,0,0,0


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Applying LASSO Regression

In [15]:
# Create a LASSO (L1) logistic regression model
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=100, random_state=42)

# Fit the model on the training data
lasso.fit(X_train, y_train)



In [16]:
# Predict on the test set
y_pred = lasso.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5682323464022303
              precision    recall  f1-score   support

         0.0       0.58      0.55      0.56     28614
         1.0       0.56      0.59      0.57     28060

    accuracy                           0.57     56674
   macro avg       0.57      0.57      0.57     56674
weighted avg       0.57      0.57      0.57     56674



In [26]:
# Get the feature names
feature_names = X.columns

# Extract coefficients
coefficients = lasso.coef_[0]

# Create a DataFrame of features and their coefficients
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Filter out features with zero coefficients
selected_features = feature_importance[abs(feature_importance['Coefficient']) < 0.00001]
print(selected_features)


                              Feature   Coefficient
0                           loan_amnt  9.226259e-06
1                         funded_amnt  9.218330e-06
2                     funded_amnt_inv  9.182219e-06
3                            int_rate  3.287661e-09
4                         installment  4.974394e-07
..                                ...           ...
183   mths_since_recent_revol_delinq7 -1.925346e-11
184   mths_since_recent_revol_delinq8  1.428952e-10
185   mths_since_recent_revol_delinq9  3.279213e-10
186  mths_since_recent_revol_delinq10  3.713171e-10
187  mths_since_recent_revol_delinq11  1.121122e-10

[186 rows x 2 columns]


## AUC Score

In [73]:
probs1 = lasso.predict_proba(X_test)
probs1 = probs1[:,1]
# probs1 = probs1[:,1]
y_test

[0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0

In [75]:
auc_1 = roc_auc_score(y_test, probs1)
print('LOGISTIC: ROC AUC=%.3f' %(auc_1))

fpr1, tpr1, thresholds = roc_curve(y_test,probs1)

LOGISTIC: ROC AUC=0.589


In [None]:
optimal_idx1 = np.argmax(tpr1 - fpr1)
optimal_threshold = thresholds[optimal_idx1]

optimal_threshold