In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [2]:
PATH = "accept.csv"
chunk_size = 50000 


y_map = {
    "Fully Paid": 1,
    "Current": 1,
    "Charged Off": 0,
    "In Grace Period": 0,
    "Late (31-120 days)": 0
}


In [3]:
incremental_models = {
    "SGD": SGDClassifier(loss="log_loss", random_state=42),
    "Perceptron": Perceptron(),
    "PassiveAggressive": PassiveAggressiveClassifier(),
}

preprocessor = None 


In [4]:
classes = np.array([0, 1]) 
is_first = True  

In [5]:
drop_cols = ["earliest_cr_line", "sec_app_earliest_cr_line","member_id"] 

for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
    if "loan_status" not in chunk.columns:
        continue
    y = chunk["loan_status"].map(y_map).dropna()
    X = chunk.loc[y.index].drop(columns=["loan_status"])
    X = X.dropna(axis=1, how='all')

    X = X.drop(columns=[col for col in drop_cols if col in X.columns])

    if preprocessor is None:
        numeric_features = X.select_dtypes(include=["int64","float64"]).columns
        categorical_features = X.select_dtypes(include=["object"]).columns

        preprocessor = ColumnTransformer(
           transformers=[
            ("num", Pipeline([
                ("imputer", SimpleImputer(strategy="mean")),   # fill NaN with mean
                ("scaler", StandardScaler())
            ]), numeric_features),
            
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),  # fill NaN with mode
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]), categorical_features),
          ]
        )

        X_proc = preprocessor.fit_transform(X)
    else:
        missing_cols = set(preprocessor.feature_names_in_) - set(X.columns)
        for col in missing_cols:
            X[col] = 0  
        X_proc = preprocessor.transform(X[preprocessor.feature_names_in_])
    
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_proc, y)

    for name, model in incremental_models.items():
        if is_first:
            model.partial_fit(X_proc, y, classes=classes)
        else:
            model.partial_fit(X_proc, y)

    is_first = False

print("\n✅ Training finished across all chunks.")

  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(PATH, chunksize=chunk_size)):
  for i, chunk in en


✅ Training finished across all chunks.


In [6]:
test_chunk = next(pd.read_csv(PATH, chunksize=chunk_size))
y_test = test_chunk["loan_status"].map(y_map).dropna()
X_test = test_chunk.loc[y_test.index].drop(columns=["loan_status"])
X_test_proc = preprocessor.transform(X_test)

for name, model in incremental_models.items():
    y_pred = model.predict(X_test_proc)
    print(f"\n{name} Test Report:")
    print(classification_report(y_test, y_pred))

  test_chunk = next(pd.read_csv(PATH, chunksize=chunk_size))



SGD Test Report:
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.91      9373
         1.0       0.99      0.96      0.98     40588

    accuracy                           0.97     49961
   macro avg       0.93      0.97      0.95     49961
weighted avg       0.97      0.97      0.97     49961


Perceptron Test Report:
              precision    recall  f1-score   support

         0.0       0.75      0.98      0.85      9373
         1.0       0.99      0.92      0.96     40588

    accuracy                           0.93     49961
   macro avg       0.87      0.95      0.90     49961
weighted avg       0.95      0.93      0.94     49961


PassiveAggressive Test Report:
              precision    recall  f1-score   support

         0.0       0.78      0.98      0.87      9373
         1.0       1.00      0.93      0.96     40588

    accuracy                           0.94     49961
   macro avg       0.89      0.96      0.92     49961

In [7]:
import joblib
joblib.dump(incremental_models['SGD'], "loan.pkl")
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']