In [51]:
# Kedro IPython line magic. `catalog` is never assigned in the visible cells, so it is
# presumably (re)created by this reload — confirm against the Kedro IPython extension docs.
# NOTE(review): the recorded execution count (51) is higher than that of the cells that
# follow (2, 3, 6, ...), i.e. this cell was re-run late; on a fresh kernel it must run first.
%reload_kedro

In [2]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import missingno as msno


In [3]:
from src.ocp7_scoring_model_cloud.pipelines.preprocessing.nodes import preprocess_merge_datasets

# Pipeline 1: preprocessing

In [None]:
# Load the application train/test datasets registered in the catalog.
df_train = catalog.load("application_train")
df_test = catalog.load("application_test")

In [None]:
# Load the six auxiliary tables that are passed, together with df_train/df_test,
# to preprocess_merge_datasets in the following cells.
bureau_df = catalog.load("bureau")
bureau_balance_df = catalog.load("bureau_balance")
previous_application_df = catalog.load("previous_application")
pos_cash_df = catalog.load("pos_cash_balance")
installments_payments_df = catalog.load("installments_payments")
credit_card_balance_df = catalog.load("credit_card_balance")

In [None]:
# Run the project's preprocessing node on the application tables plus the auxiliary tables,
# then persist the result in the catalog as "preprocessed_train_df".
preprocessed_train_df = preprocess_merge_datasets(df_train, df_test, bureau_df, bureau_balance_df, previous_application_df, pos_cash_df, installments_payments_df, credit_card_balance_df)
catalog.save("preprocessed_train_df", preprocessed_train_df)

In [None]:
# Persist the preprocessing output in the catalog as "preprocessed_test_df".
# NOTE(review): this call passes exactly the same arguments as the call that builds
# preprocessed_train_df, yet its result is saved as the *test* frame. Unless
# preprocess_merge_datasets is meant to be used this way, both catalog entries would
# receive the same data — confirm against the node's implementation.
preprocessed_test_df = preprocess_merge_datasets(df_train, df_test, bureau_df, bureau_balance_df, previous_application_df, pos_cash_df, installments_payments_df, credit_card_balance_df)
catalog.save("preprocessed_test_df", preprocessed_test_df)

# Pipeline 1.2: feature selection and processing

In [6]:
# Reload both preprocessed frames from the catalog (same dataset names as the ones
# saved by the preprocessing cells above), so this section can start from disk.
preprocessed_train_df = catalog.load("preprocessed_train_df")
preprocessed_test_df = catalog.load("preprocessed_test_df")

In [7]:
# Sanity check: print the (rows, columns) of the train and test frames side by side.
print(preprocessed_train_df.shape, preprocessed_test_df.shape)

(307507, 797) (48744, 794)


In [8]:
from src.ocp7_scoring_model_cloud.pipelines.feature_processing.nodes import get_clean_features, process_features_for_ml

In [9]:
# Apply the project's get_clean_features node function to the preprocessed training frame.
cleaned_features_train_df = get_clean_features(preprocessed_train_df)

In [10]:
# Apply the same get_clean_features node function to the preprocessed test frame.
cleaned_features_test_df = get_clean_features(preprocessed_test_df)

In [35]:
# Keep only the columns present in BOTH cleaned frames, in the training frame's own
# column order. A bare `list(set_a & set_b)` has no stable order across kernel restarts
# (string hashing is randomised per process), which would make the saved column layout
# differ from run to run.
_test_columns = set(cleaned_features_test_df.columns)
feature_pool_test = [
    col
    for col in cleaned_features_train_df.columns
    # TARGET is appended explicitly below; excluding it here avoids a duplicated
    # TARGET column should the test frame ever carry one.
    if col in _test_columns and col != "TARGET"
]
# The training pool is the shared features followed by the label column.
feature_pool_train = feature_pool_test + ["TARGET"]


In [34]:
# Restrict the cleaned training frame to the shared feature pool plus TARGET.
# NOTE(review): this cell's recorded execution count (34) is lower than that of the cell
# defining feature_pool_train (35), so the saved run was executed out of order.
selected_features_train_df = cleaned_features_train_df[feature_pool_train]

In [36]:
# Restrict the cleaned test frame to the shared feature pool (no TARGET column here).
selected_features_test_df = cleaned_features_test_df[feature_pool_test]

In [37]:
# Turn both selected frames into model-ready frames with the project's
# process_features_for_ml node function.
# NOTE(review): train and test are processed by two independent calls. If
# process_features_for_ml fits anything on its input (scaler, imputer, encoder), it would
# be fitted separately on each frame and the two outputs would not share one
# transformation — confirm against the node's implementation.
ml_features_train = process_features_for_ml(selected_features_train_df)
ml_features_test = process_features_for_ml(selected_features_test_df)

Training Features shape:  (307507, 470)
Training Features shape:  (48744, 469)


In [38]:
# Report the final shapes, then persist both frames in the catalog
# ("full_df_test" and "full_df_train"); later sections reload them by these names.
print(ml_features_train.shape, ml_features_test.shape)
catalog.save("full_df_test", ml_features_test)
catalog.save("full_df_train", ml_features_train)

(307507, 471) (48744, 470)


# Pipeline 1.3: treat imbalanced classes

In [89]:
# Rebuild one frame holding SK_ID_CURR, TARGET and the processed features.
# NOTE(review): "processed_features_df" and "preprocessed_df" are not saved anywhere in the
# visible cells (which save "preprocessed_train_df", "full_df_train", ...); this cell
# presumably belongs to an earlier version of the pipeline — confirm the names still exist.
features = catalog.load("processed_features_df")
preprocessed_df = catalog.load("preprocessed_df")
id_target = preprocessed_df[["SK_ID_CURR", "TARGET"]]
# Column-wise concat: rows are matched on the index of the two frames, so both are
# assumed to share the same index — TODO confirm.
full_df = pd.concat([id_target, features], axis=1)

In [92]:
from imblearn.over_sampling import SMOTE

In [93]:
def treat_imbalanced_classes(df, random_state=42):
    """Oversample the minority class of ``df`` with SMOTE.

    Args:
        df: DataFrame with a "TARGET" column (the label) and a "SK_ID_CURR"
            column; every other column is handed to SMOTE as a feature.
        random_state: Seed forwarded to SMOTE so the synthetic rows are
            reproducible from run to run. Pass ``None`` for unseeded behaviour.

    Returns:
        The ``(X_sm, y_sm)`` pair produced by ``SMOTE.fit_resample``: the
        resampled features and the resampled target.

    Note:
        SK_ID_CURR is dropped and not returned, so the resampled rows carry
        no identifier.
    """
    # Seeded: without random_state every run would generate different synthetic samples.
    smote = SMOTE(sampling_strategy='minority', random_state=random_state)
    X = df.drop(columns=["TARGET", "SK_ID_CURR"])  # features only
    y = df["TARGET"]  # label to balance
    X_sm, y_sm = smote.fit_resample(X, y)
    return X_sm, y_sm

In [95]:
# Resample full_df; the features and the target come back as two separate objects.
balanced_X, balanced_y = treat_imbalanced_classes(full_df)

In [96]:
# Reassemble a single frame with the resampled target as the first column.
balanced_df = pd.concat([balanced_y, balanced_X], axis=1)

In [99]:
# Persist the balanced frame in the catalog, then display its (rows, columns).
catalog.save("balanced_df", balanced_df)
balanced_df.shape

[1m([0m[1;36m565364[0m, [1;36m488[0m[1m)[0m

# Pipeline 2: training model

In [39]:
# Load the model-ready training frame saved as "full_df_train" by the feature-processing section.
full_df_train = catalog.load("full_df_train")

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [41]:
def split_data(df, train_size=0.8, test_size=0.2, random_state=42, stratify=False):
    """Split ``df`` into train/test features and targets.

    Args:
        df: DataFrame containing a "TARGET" column. "SK_ID_CURR" and "TARGET"
            are excluded from the features; all remaining columns are kept.
        train_size: Forwarded to ``train_test_split``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When true, the split is stratified on TARGET so both parts
            keep the same class proportions. Defaults to ``False``, which
            reproduces the previous (unstratified) behaviour.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    excluded = ("SK_ID_CURR", "TARGET")
    feature_columns = [col for col in df.columns if col not in excluded]
    X = df[feature_columns]  # features
    y = df["TARGET"]  # target variable
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
        # None is train_test_split's own default, i.e. no stratification.
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model):
    """Fit ``model`` in place on the training data and hand the same object back.

    Whatever ``model.fit`` itself returns is ignored; the caller always gets
    the very object that was passed in.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def evaluate_model(X_test, y_test, model):
    """Return the ROC AUC of ``model`` on the given evaluation data.

    The score handed to ``roc_auc_score`` is column 1 of
    ``model.predict_proba(X_test)``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_scores)

In [42]:
# Hold out 20% of full_df_train, fit a logistic regression on the remaining 80%,
# and report the ROC AUC on the held-out part.
X_train, X_test, y_train, y_test = split_data(full_df_train, train_size=0.8, test_size=0.2)
# In scikit-learn, C is the inverse of the regularisation strength, so C = 0.0001
# requests strong regularisation.
model = train_model(X_train, y_train, LogisticRegression(C = 0.0001))
metric = evaluate_model(X_test, y_test, model)
print("ROC AUC score = ", metric)

ROC AUC score =  0.7095963635332831


In [43]:
# Persist the fitted model through the catalog as "first_model_logistic_regression".
catalog.save("first_model_logistic_regression", model)


# Pipeline 3: Predictions

In [44]:
# Load the model-ready test frame saved as "full_df_test" by the feature-processing section.
full_df_test = catalog.load("full_df_test")

In [45]:
# Reload the persisted logistic-regression model from the catalog (rebinds `model`).
model = catalog.load("first_model_logistic_regression")

In [46]:
def predict(model, df):
    """Return column 1 of ``model.predict_proba`` for the rows of ``df``.

    "SK_ID_CURR" and "TARGET" are left out of the model input when present;
    every other column of ``df`` is passed through in its original order.
    """
    non_feature_columns = ("SK_ID_CURR", "TARGET")
    feature_columns = list(
        filter(lambda column: column not in non_feature_columns, df.columns)
    )
    class_probabilities = model.predict_proba(df[feature_columns])
    return class_probabilities[:, 1]

In [47]:
# Score every row of the test frame; y_pred holds column 1 of the model's predict_proba output.
y_pred = predict(model, full_df_test)

In [49]:
# Pair each SK_ID_CURR with its predicted probability (stored under the TARGET column name).
output_df = pd.DataFrame({"SK_ID_CURR": full_df_test["SK_ID_CURR"], "TARGET": y_pred})

In [52]:
# Persist the predictions in the catalog under the name "kaggle_output_df".
catalog.save("kaggle_output_df", output_df)

# Pipeline 5: MLflow model serving

In [11]:
from mlflow.models.signature import infer_signature

In [18]:
import mlflow.sklearn

In [16]:
# Re-split the data to obtain example inputs/outputs for the MLflow signature.
# This rebinds X_train / X_test / y_train / y_test from the training section.
# NOTE(review): the frame split here is `full_df` (built in the "Pipeline 1.3" cell), not
# `full_df_train` on which the saved model was fitted in "Pipeline 2" — confirm that both
# have the same feature columns, since this split feeds infer_signature.
X_train, X_test, y_train, y_test = split_data(full_df, train_size=0.8, test_size=0.2)

In [17]:
# Infer the MLflow model signature: X_train is given as the example input,
# y_train as the example output.
signature = infer_signature(X_train, y_train)

# Test: serve `predict_proba` instead of `predict`

In [35]:
import mlflow.pyfunc

class CustomModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper whose ``predict`` returns the wrapped model's ``predict_proba`` output."""

    def load_context(self, context):
        """Load the joblib-serialised model stored under the "model_path" artifact."""
        from joblib import load as _joblib_load

        artifact_location = context.artifacts["model_path"]
        self.model = _joblib_load(artifact_location)

    def predict(self, context, model_input):
        """Return ``self.model.predict_proba(model_input)`` unchanged; ``context`` is not used."""
        class_probabilities = self.model.predict_proba(model_input)
        return class_probabilities

In [36]:
import joblib
import mlflow.pyfunc

# Fetch the fitted logistic-regression model from the catalog (rebinds `model`) and
# serialise it with joblib so it can be attached to the pyfunc model as an artifact.
model = catalog.load("first_model_logistic_regression")  # same catalog entry as saved in the training section
joblib.dump(model, "model.pkl")

# Save a pyfunc model whose predict() is CustomModelWrapper.predict; the "model_path"
# artifact key is the one CustomModelWrapper.load_context reads.
# NOTE(review): both "model.pkl" and "custom_model" are relative to the kernel's working
# directory, and re-running this cell may fail if "custom_model" already exists — confirm.
custom_model = CustomModelWrapper()
custom_model_path = "custom_model"
mlflow.pyfunc.save_model(
    path=custom_model_path,
    python_model=custom_model,
    artifacts={"model_path": "model.pkl"}
)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Also save `model` in MLflow's scikit-learn flavour, with the inferred signature,
# to the relative path "mlflow_model".
mlflow.sklearn.save_model(model, 'mlflow_model', signature=signature)

# Pipeline 5: MLflow model serving

Model registry

# Test the REST API

In [34]:
# Score the first two rows of full_df locally, as a reference for the REST responses.
# NOTE(review): the captured output below lists about 40 probabilities, which does not
# match a 2-row input — the output is stale; re-run the cell before relying on it.
predict(model, full_df.head(2))


[1;35marray[0m[1m([0m[1m[[0m[1;36m0.11439352[0m, [1;36m0.03989982[0m, [1;36m0.07970802[0m, [1;36m0.10639634[0m, [1;36m0.08963531[0m,
       [1;36m0.08212018[0m, [1;36m0.04388432[0m, [1;36m0.06208959[0m, [1;36m0.06617215[0m, [1;36m0.09656168[0m,
       [1;36m0.06654831[0m, [1;36m0.04556897[0m, [1;36m0.06973686[0m, [1;36m0.05738803[0m, [1;36m0.06681301[0m,
       [1;36m0.12395221[0m, [1;36m0.12629016[0m, [1;36m0.07066658[0m, [1;36m0.06576296[0m, [1;36m0.06296205[0m,
       [1;36m0.08043643[0m, [1;36m0.08470064[0m, [1;36m0.10094746[0m, [1;36m0.06227812[0m, [1;36m0.07243678[0m,
       [1;36m0.07855003[0m, [1;36m0.09558423[0m, [1;36m0.08621617[0m, [1;36m0.07746951[0m, [1;36m0.08164195[0m,
       [1;36m0.09315339[0m, [1;36m0.10541672[0m, [1;36m0.07569377[0m, [1;36m0.09391593[0m, [1;36m0.04744398[0m,
       [1;36m0.05600689[0m, [1;36m0.06807798[0m, [1;36m0.0813706[0m , [1;36m0.05040415[0m, [1;36m0.06903112[

In [23]:
# Build the request payload frame: the first 10 rows of full_df without the
# SK_ID_CURR and TARGET columns (the same column filter that predict() applies).
df = full_df.head(10)
features = [f for f in df.columns if f not in ["SK_ID_CURR", "TARGET"]]
df_query = df[features]

In [24]:
# Display the query frame that will be sent to the scoring endpoint.
df_query

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
0,0.0,0.0,0.0,0.0,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,...,0.004615,0.009436,0.003693928,0.014078,0.004615,0.0086,0.984365,0.898168,0.990053,0.048518
1,1.0,0.0,1.0,0.0,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,...,0.025854,0.069555,0.002660304,0.148704,0.025854,0.063393,0.823127,0.549439,0.942518,0.06469
2,0.0,1.0,0.0,0.0,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,...,0.002833,0.000915,0.002138973,0.002804,0.002833,0.000834,0.763518,0.752716,0.996207,0.005391
3,1.0,0.0,0.0,0.0,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,...,0.025133,0.043272,0.000991348,0.183425,0.025133,0.039439,0.996417,0.912443,0.992787,0.040431
4,0.0,0.0,0.0,0.0,0.000819,0.116854,0.078975,0.117845,0.39288,0.298591,...,0.005057,0.035918,7.186806e-08,0.006013,0.004877,0.031567,0.995765,0.664523,0.886924,0.175202
5,0.0,0.0,0.0,0.0,0.000627,0.111235,0.101018,0.103255,0.491595,0.467193,...,0.011061,0.041659,9.069749e-05,0.114601,0.010924,0.037499,0.973616,0.597523,0.928094,0.091644
6,1.0,1.0,0.0,0.052632,0.001243,0.378458,0.154774,0.337823,0.491595,0.64549,...,0.00382,0.020967,0.0024576,0.004598,0.00382,0.019109,0.981433,0.719227,0.92683,0.134771
7,0.0,1.0,0.0,0.0,0.002858,0.370787,0.157792,0.371493,0.039215,0.359583,...,0.01096,0.011794,0.01090853,0.007282,0.010959,0.010749,0.748208,0.702445,0.984798,0.024259
8,1.0,0.0,0.0,0.0,0.000742,0.243348,0.125623,0.217733,0.254009,0.289177,...,0.00542,0.075827,0.0001795444,0.012325,0.004523,0.057671,0.988274,0.62584,0.751668,0.347709
9,0.0,0.0,0.0,0.0,0.000935,0.089888,0.072675,0.090909,0.268617,0.606539,...,0.003827,0.019766,2.335712e-06,0.015561,0.004173,0.019644,0.95114,0.775974,0.945004,0.126685


In [27]:
# NOTE(review): same display as the preceding cell (identical output); candidate for removal.
df_query

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
0,0.0,0.0,0.0,0.0,0.001512,0.090287,0.090032,0.077441,0.256321,0.888839,...,0.004615,0.009436,0.003693928,0.014078,0.004615,0.0086,0.984365,0.898168,0.990053,0.048518
1,1.0,0.0,1.0,0.0,0.002089,0.311736,0.132924,0.271605,0.045016,0.477114,...,0.025854,0.069555,0.002660304,0.148704,0.025854,0.063393,0.823127,0.549439,0.942518,0.06469
2,0.0,1.0,0.0,0.0,0.000358,0.022472,0.020025,0.023569,0.134897,0.348534,...,0.002833,0.000915,0.002138973,0.002804,0.002833,0.000834,0.763518,0.752716,0.996207,0.005391
3,1.0,0.0,0.0,0.0,0.000935,0.066837,0.109477,0.063973,0.107023,0.350846,...,0.025133,0.043272,0.000991348,0.183425,0.025133,0.039439,0.996417,0.912443,0.992787,0.040431
4,0.0,0.0,0.0,0.0,0.000819,0.116854,0.078975,0.117845,0.39288,0.298591,...,0.005057,0.035918,7.186806e-08,0.006013,0.004877,0.031567,0.995765,0.664523,0.886924,0.175202
5,0.0,0.0,0.0,0.0,0.000627,0.111235,0.101018,0.103255,0.491595,0.467193,...,0.011061,0.041659,9.069749e-05,0.114601,0.010924,0.037499,0.973616,0.597523,0.928094,0.091644
6,1.0,1.0,0.0,0.052632,0.001243,0.378458,0.154774,0.337823,0.491595,0.64549,...,0.00382,0.020967,0.0024576,0.004598,0.00382,0.019109,0.981433,0.719227,0.92683,0.134771
7,0.0,1.0,0.0,0.0,0.002858,0.370787,0.157792,0.371493,0.039215,0.359583,...,0.01096,0.011794,0.01090853,0.007282,0.010959,0.010749,0.748208,0.702445,0.984798,0.024259
8,1.0,0.0,0.0,0.0,0.000742,0.243348,0.125623,0.217733,0.254009,0.289177,...,0.00542,0.075827,0.0001795444,0.012325,0.004523,0.057671,0.988274,0.62584,0.751668,0.347709
9,0.0,0.0,0.0,0.0,0.000935,0.089888,0.072675,0.090909,0.268617,0.606539,...,0.003827,0.019766,2.335712e-06,0.015561,0.004173,0.019644,0.95114,0.775974,0.945004,0.126685


In [31]:
import requests
import pandas as pd
def request_prediction(model_uri, data, url="http://localhost:5000/invocations", timeout=60):
    """POST the rows of ``data`` to a scoring endpoint and return the decoded JSON reply.

    Args:
        model_uri: Accepted for backward compatibility only; it is not used.
            The request is always sent to ``url``.
        data: pandas DataFrame; its rows are sent in records format under the
            ``dataframe_records`` key of the JSON payload.
        url: Endpoint that receives the POST request. Defaults to the local
            ``/invocations`` route that was previously hard-coded.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a dead server cannot hang the notebook forever.

    Returns:
        The decoded JSON response when the server answers with HTTP 200;
        otherwise ``None``, after printing the status code and response body.
    """
    # One dict per row; a separate name avoids rebinding the `data` argument.
    payload = {'dataframe_records': data.to_dict(orient='records')}
    headers = {'Content-Type': 'application/json'}

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    # Best-effort error reporting: print the failure and signal it with None.
    print("Error:", response.status_code, response.text)
    return None


In [33]:
# Send the 10-row query frame to the local scoring server. The first argument ("http://")
# is only a placeholder: request_prediction does not use its model_uri parameter.
request_prediction("http://", df_query)


[1m{[0m
    [32m'predictions'[0m: [1m[[0m
        [1m[[0m[1;36m0.8856064814516993[0m, [1;36m0.11439351854830068[0m[1m][0m,
        [1m[[0m[1;36m0.9601001795927657[0m, [1;36m0.039899820407234325[0m[1m][0m,
        [1m[[0m[1;36m0.920291983594061[0m, [1;36m0.0797080164059391[0m[1m][0m,
        [1m[[0m[1;36m0.8936036603311639[0m, [1;36m0.10639633966883615[0m[1m][0m,
        [1m[[0m[1;36m0.9103646889350232[0m, [1;36m0.08963531106497674[0m[1m][0m,
        [1m[[0m[1;36m0.9178798161410359[0m, [1;36m0.08212018385896416[0m[1m][0m,
        [1m[[0m[1;36m0.9561156790957355[0m, [1;36m0.04388432090426451[0m[1m][0m,
        [1m[[0m[1;36m0.9379104137143044[0m, [1;36m0.06208958628569562[0m[1m][0m,
        [1m[[0m[1;36m0.9338278547840814[0m, [1;36m0.0661721452159186[0m[1m][0m,
        [1m[[0m[1;36m0.9034383238781599[0m, [1;36m0.09656167612184006[0m[1m][0m
    [1m][0m
[1m}[0m

In [35]:
# Show the first 10 rows of full_df whose TARGET equals 1.
full_df[full_df["TARGET"]==1].head(10)

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
0,100002,1,0.0,0.0,0.0,0.0,0.001512,0.090287,0.090032,0.077441,...,0.004615,0.009436,0.003694,0.014078,0.004615,0.0086,0.984365,0.898168,0.990053,0.048518
26,100031,1,1.0,0.0,0.0,0.0,0.000742,0.233456,0.099298,0.164983,...,0.00502,0.014108,0.000924,0.009545,0.004881,0.012476,0.978827,0.737753,0.964934,0.06469
40,100047,1,0.0,0.0,0.0,0.0,0.001512,0.286787,0.130309,0.203143,...,0.005361,0.011538,0.001828,0.032515,0.005361,0.010516,0.618567,0.385561,0.937324,0.051213
42,100049,1,1.0,0.0,1.0,0.0,0.000935,0.060892,0.057108,0.049383,...,0.002021,0.012616,5e-06,0.002068,0.00191,0.010863,0.990879,0.719215,0.916783,0.153639
81,100096,1,1.0,0.0,0.0,0.0,0.000473,0.051685,0.050614,0.05275,...,0.00502,0.014108,0.000924,0.009545,0.004881,0.012476,0.978827,0.737753,0.964934,0.06469
94,100112,1,0.0,1.0,0.0,0.0,0.002474,0.226831,0.243717,0.214366,...,0.006282,0.01352,0.001788,0.011932,0.007629,0.014965,0.961889,0.784404,0.977944,0.051213
110,100130,1,1.0,0.0,0.0,0.052632,0.001127,0.169537,0.113829,0.135802,...,0.00681,0.009526,0.00449,0.01929,0.00681,0.008683,0.988274,0.894143,0.992928,0.032345
138,100160,1,0.0,0.0,0.0,0.0,0.002281,0.157303,0.137013,0.158249,...,0.00502,0.014108,0.000924,0.009545,0.004881,0.012476,0.978827,0.737753,0.964934,0.06469
154,100181,1,1.0,0.0,0.0,0.0,0.001127,0.050092,0.043103,0.031425,...,0.00463,0.021422,0.000687,0.005566,0.004629,0.019524,0.988925,0.72118,0.938735,0.113208
163,100192,1,1.0,0.0,1.0,0.0,0.000737,0.044944,0.075746,0.046016,...,0.00502,0.014108,0.000924,0.009545,0.004881,0.012476,0.978827,0.737753,0.964934,0.06469
