In [64]:
# Importing Required Libraries for Analysis and Modeling
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split,StratifiedKFold,RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter 
from sklearn.feature_selection import RFECV
from scipy.stats import uniform, loguniform
from sklearn.metrics import roc_auc_score, confusion_matrix,fbeta_score,make_scorer, precision_recall_curve,average_precision_score,roc_curve,classification_report
import datetime
from pandas.tseries.offsets import DateOffset
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTENC
from scipy.sparse import hstack  
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [65]:
df = pd.read_csv('loan_tap_dataset.csv')

In [66]:
# Applying train test split before EDA. As its a imbalanced classification with stratification
X_train_validation, X_test, y_train_validation, y_test = train_test_split(df.drop(['loan_status'],axis = 1),df['loan_status'],random_state=40,test_size=0.2,stratify=df['loan_status'])

In [67]:
# Splitting into validation and training sets
X_train,X_validation,y_train,y_validation = train_test_split(X_train_validation,y_train_validation,random_state=40,test_size=0.2,stratify=y_train_validation)


In [None]:
# Label encoding on y_train - loan status
y_encoder = LabelEncoder( )
y_encoder.fit(y_train)
y_encoder.classes_= np.array(['Fully Paid','Charged Off'])
y_train_encoded = y_encoder.transform(y_train[X_train.index])


revol_util_median = X_train['revol_util'].quantile(0.5)
revol_util_imputed = X_train['revol_util'].fillna(revol_util_median)
k = 1000
title_words_freq = pd.Series(Counter(' '.join(list(X_train['title'].dropna().apply(lambda x: x.lower()))).split())).sort_values(ascending=False)
title_word_rank = title_words_freq.iloc[:k].reset_index().reset_index().set_index('index')['level_0']
def title_check(x):
    z = [title_word_rank.get(i,k+1) for i in x.lower().split()]
    return min(z)
k = 1000
emp_title_words_freq = pd.Series(Counter(' '.join(list(X_train['emp_title'].fillna('missing').apply(lambda x: x.lower()))).split())).sort_values(ascending=False)
emp_title_word_rank = emp_title_words_freq.iloc[:k].reset_index().reset_index().set_index('index')['level_0']
def emp_title_check(x):
    z = [emp_title_word_rank.get(i,k+1) for i in x.lower().split()]
    return min(z)

# Update your functions to work with individual values:
def extract_quarter_single(date_str):
    """Extract quarter from a single date string"""
    try:
        period = pd.to_datetime(date_str, format='%b-%Y').to_period('Q')
        return period.ordinal
    except:
        return None

def extract_month_single(date_str):
    """Extract month from a single date string"""
    try:
        return pd.to_datetime(date_str, format='%b-%Y').month
    except:
        return None
def emp_len_bin(x):
    if x < 1.5:
        return '1 yr'
    if 1.5 <= x < 7.5 :
        return '2-7 yrs'
    if 7.5 <= x < 9.5:
        return '8-9 yrs'
    if x >= 9.5 :
        return '10+ yrs'
def empl_length_num_single(emp_length_str):
    """Convert employment length string to numeric for a single value"""
    # You'll need to adapt your original empl_length_num function here
    # This is just an example - replace with your actual logic
    if pd.isna(emp_length_str):
        return None
    elif '10+' in str(emp_length_str):
        return 10
    elif '<' in str(emp_length_str):
        return 0
    else:
        return int(str(emp_length_str).split()[0])
def purpose_bin(val):
    if val in ['credit_card','home_improvement','major_purchase','educational','wedding','car','vacation','house']:
        return 'family'
    else:
        if val != 'small_business' :
            return 'other'
        else :
            return 'small_business'
           
def purpose_apply(x):
    return x.iloc[:, 0].apply(purpose_bin).to_frame()

def home_own_bin(val):
    if val not in ['MORTGAGE','OWN','RENT']:
        return 'RENT' 
    else:
        return val
def zip_extract(x):
    return x.iloc[:, 0].str.split().str[-1].to_frame()
# Updated DataFrame wrapper functions:
def extract_quarter_df(x):
    return x.iloc[:, 0].apply(extract_quarter_single).to_frame()

def extract_month_df(x):
    return x.iloc[:, 0].apply(extract_month_single).to_frame()

def empl_length_num_df(x):
    return x.iloc[:, 0].apply(empl_length_num_single).to_frame()

def home_ownership_cleanup_df(x):
    return x.iloc[:, 0].apply(home_own_bin).to_frame()

def title_check_df(x):
    return x.iloc[:, 0].fillna('debt').apply(title_check).to_frame()

def emp_title_check_df(x):
    return x.iloc[:, 0].fillna('missing').apply(emp_title_check).to_frame()
def emp_bin_ext(x):
    return pd.DataFrame(x).iloc[:, 0].apply(emp_len_bin).to_frame()
def clip_trans(x,upper = None,lower = None):
    return x.iloc[:, 0].clip(upper,lower).to_frame()
def clip_trans_df(x,upper=None,lower = None):
    return pd.DataFrame(x).iloc[:, 0].clip(upper,lower).to_frame()
# Your pipeline remains the same:
final_pipe = Pipeline([
    ('preprocessing', ColumnTransformer(transformers=[
        ('zipcode', Pipeline([
            ('zipcode', FunctionTransformer(zip_extract,feature_names_out='zipcode')),
            ('onehot', OneHotEncoder(sparse_output=False, drop='first'))
        ]), ['address']),
        ('term', OneHotEncoder(sparse_output=False, drop='first'), ['term']),
        ('homeownership', Pipeline([
            ('home_ownership_cleanup', FunctionTransformer(home_ownership_cleanup_df)),
            ('home_encoding', OneHotEncoder(sparse_output=False, drop='first'))
        ]), ['home_ownership']),
        ('verif_st', OneHotEncoder(sparse_output=False, drop='first'), ['verification_status']),
        ('emp_length', Pipeline([
            ('emp_len_num', FunctionTransformer(empl_length_num_df)),
            ('emp_imp', KNNImputer(n_neighbors=1)),
            ('emp_bin',FunctionTransformer(emp_bin_ext)),
            ('emp_len_oh',OneHotEncoder(sparse_output=False,drop='first'))
        ]), ['emp_length']),
        ('loan_amnt', FunctionTransformer(clip_trans , kw_args={"upper" :38000}), ['loan_amnt']),
        ('int_rate', 'passthrough', ['int_rate']),
        ('open_acc', FunctionTransformer(clip_trans , kw_args={"upper" :23}), ['open_acc']),
        ('total_acc', FunctionTransformer(clip_trans , kw_args={"upper" :54}), ['total_acc']),
        ('annual_inc', FunctionTransformer(clip_trans , kw_args={"upper" :150000}), ['annual_inc']),
        ('revol_bal', FunctionTransformer(clip_trans , kw_args={"upper" :40000}), ['revol_bal']),
        ('dti', FunctionTransformer(clip_trans , kw_args={"upper" :40,"lower":0}), ['dti']),
        ('revol_util',Pipeline([
            ('revol_util_imp', SimpleImputer(strategy='constant',fill_value=revol_util_median)),
            ('revol_util_clip', FunctionTransformer(clip_trans_df , kw_args={"upper" :129,"lower":0}))
        ]), ['revol_util']),
        ('purpose', Pipeline([
            ("group_purpose", FunctionTransformer(purpose_apply)),
            ('purpose_encoding', OneHotEncoder(sparse_output=False, drop='first'))
        ]), ['purpose']),
        ('issue_qtr', FunctionTransformer(extract_quarter_df), ['issue_d']),
        ('issue_month', FunctionTransformer(extract_month_df), ['issue_d']),
        ('mort_acc', Pipeline([
            ('mort_acc_imp', KNNImputer(n_neighbors=1)),
            ('mort_acc_clip',FunctionTransformer(clip_trans_df , kw_args={"upper" :7}))
        ]), ['mort_acc']),
        ('pub_rec', FunctionTransformer(clip_trans , kw_args={"upper" :1}), ['pub_rec']),
        ('earliest_cr_qtr', FunctionTransformer(extract_quarter_df), ['earliest_cr_line'])
    ], remainder='drop')),
    ('scaler', StandardScaler()),
    ('logistic_reg', LogisticRegression(
        C=np.float64(0.00025502648504032837),
        class_weight={0: 1, 1: 5}, 
        max_iter=1000, 
        penalty=None
    ))
])

In [69]:
try:
    final_pipe.fit(X_train, y_train_encoded)  # or whatever data you're using
except Exception as e:
    import traceback
    traceback.print_exc()




In [70]:
y_predicted = final_pipe.predict(X_test)

In [71]:
print(classification_report(y_encoder.transform(y_test),y_predicted))

              precision    recall  f1-score   support

           0       0.95      0.75      0.84     63671
           1       0.45      0.85      0.59     15535

    accuracy                           0.77     79206
   macro avg       0.70      0.80      0.71     79206
weighted avg       0.85      0.77      0.79     79206



In [72]:
print(classification_report(y_encoder.transform(y_train),final_pipe.predict(X_train)))

              precision    recall  f1-score   support

           0       0.95      0.75      0.84    203749
           1       0.45      0.85      0.59     49710

    accuracy                           0.77    253459
   macro avg       0.70      0.80      0.71    253459
weighted avg       0.85      0.77      0.79    253459



In [73]:
import joblib
# Save entire pipeline (including preprocessing)
joblib.dump(final_pipe, 'pipeline_1.joblib')

['pipeline_1.joblib']

In [24]:
from flask import Flask
app = Flask(__name__)

@app.route('/')
def hello():
    return "Hello World!"

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1