In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time 

from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,classification_report,roc_curve,auc, f1_score


In [14]:
# Missing value
def remove_missing_col(df, threshold = 0.66):
    """
    Drop columns from the DataFrame where the percentage of missing values is greater than the threshold.

    Parameters:
    - df: pandas DataFrame
    - threshold: float, default=0.6
        The threshold for the percentage of missing values. Columns with missing values percentage
        greater than this threshold will be dropped.

    Returns:
    - df: pandas DataFrame
        DataFrame with columns dropped based on the specified threshold.
    """
    # Calculate the percentage of missing values for each column
    missing_percentage = df.isnull().mean()

    # Identify columns where the missing percentage is greater than the threshold
    columns_to_drop = missing_percentage[missing_percentage > threshold].index

    # Drop the identified columns from the DataFrame
    df = df.drop(columns=columns_to_drop)

    return df

def fill_nan(df):
    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    if 'TARGET' in numeric_columns:
        numeric_columns.remove('TARGET')
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
    
    categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode())
    return df

def encode(df):
    label = LabelEncoder()
    categorical_cols = df.select_dtypes(include=['object']).columns
    
    for col in categorical_cols:
        nunique_cols = df[col].nunique()
        if nunique_cols == 2:
            df[col] = label.fit_transform(df[col])
 
    df = pd.get_dummies(df)
    return df
def label_encoding(df):
    obj_cols = [c for c in df.columns if df[c].dtype=='O']
    for c in obj_cols:
        df[c] = pd.factorize(df[c])[0]
    df[obj_cols].replace(-1, np.nan, inplace=True)
#     df[obj_cols] = df[obj_cols].fillna(df[obj_cols].mode())

    return df

In [15]:
def logistic_cv_train(name, params, X, y, X_test, num_folds, metric=roc_auc_score,verbose_cv=True, msgs={}):
    pred_test = np.zeros((X_test.shape[0],))
    pred_val = np.zeros((X.shape[0],))
    cv_scores = []
    models = []
    kfolds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

    for train_index, valid_index in kfolds.split(X, y):
        print('[level 1] processing fold...')
        t0 = time.time()

        # Split data
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # Fit model
        model = LogisticRegression(**params)
        model.fit(X_train, y_train)

        # Predictions
        pred_val[valid_index] = model.predict_proba(X_valid)[:, 1]
        pred_test += model.predict_proba(X_test)[:, 1] / num_folds

        # Evaluate
        scr = metric(y_valid, pred_val[valid_index])
        if verbose_cv:
            print(f'{name} auc:', scr, 
                  f'fold done in {time.time() - t0:.2f} s')
        cv_scores.append(scr)
        models.append(model)

    msgs = dict(
        msgs, 
        cv_score_mean=np.mean(cv_scores), 
        cv_score_std=np.std(cv_scores),
        cv_score_min=np.min(cv_scores), 
        cv_score_max=np.max(cv_scores),
    )
    print(msgs)

    result = dict(
        name=name,
        pred_val=pred_val,
        pred_test=pred_test,
        cv_scores=cv_scores,
        models=models
    )
    return result

In [16]:
application_train = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_application_train.csv')
application_test = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_application_test.csv')

df1 = application_train[["SK_ID_CURR", "TARGET"]]
df2 = application_test[["SK_ID_CURR"]]
df3 = pd.concat([df1,df2])
sk_id_curr = df3['SK_ID_CURR']
id_label = df3[['SK_ID_CURR']]

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Make the model with the specified regularization parameter

def predict_target(df, name):
    application_train = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_application_train.csv')
    #application_test = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_application_test.csv')
    
    df1 = application_train[["SK_ID_CURR", "TARGET"]]
    df2 = application_test[["SK_ID_CURR"]]
    df3 = pd.concat([df1,df2])
    
    sk_id_curr = df3['SK_ID_CURR']
    id_label = df3[['SK_ID_CURR', 'TARGET']]
    df = df.loc[np.isin(df['SK_ID_CURR'], sk_id_curr)]
    df = df.merge(id_label, how='left', on='SK_ID_CURR')
    print(df['TARGET'].value_counts(dropna = False))

    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    if 'TARGET' in numeric_columns:
        numeric_columns.remove('TARGET')
    df['TARGET'] = df['TARGET'].fillna(-1)
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
    df = label_encoding(df)
    
    x_train = df[df['TARGET'] != -1].drop(['TARGET', 'SK_ID_CURR'], axis=1).values
    y_train = np.array(df.loc[df['TARGET'] != -1, 'TARGET'])

    # Filter out rows where target is equal to -1 for x_test
    x_test = df[df['TARGET'] == -1].drop(['TARGET', 'SK_ID_CURR'], axis=1).values
    
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.fit_transform(x_test)
    
    params = {'solver': 'saga', 'max_iter': 100, 'C': 0.1, 'penalty': 'l1'}
    result = logistic_cv_train('bureau', params, x_train, y_train, x_test, 5)
    
    print(f"result['pred_test'] : {result['pred_test'].shape}")
    print(f"result['pred_val'] : {result['pred_val'].shape}")
    
    df.loc[df['TARGET'] == -1, 'PRED_TARGET'] = result['pred_test']
    df.loc[df['TARGET'] != -1, 'PRED_TARGET'] = result['pred_val']
    print(df['PRED_TARGET'].shape)
    #new_feature = df['PRED_TARGET']
    
    df = pd.merge(df3['SK_ID_CURR'], df, on = 'SK_ID_CURR', how = 'left')
    
    # Aggregating
    aggregations_basic = {'PRED_TARGET' : ['max', 'min', 'median','mean', 'std', 'size', 'sum','skew', pd.DataFrame.kurt]}
    new_feature = df.groupby(['SK_ID_CURR']).agg(aggregations_basic)
    new_feature.columns = [name + '_'.join(ele).upper() for ele in new_feature.columns]
    new_feature.reset_index(drop = False, inplace = True)
    return new_feature
 

In [18]:
# compare = pd.read_csv('/kaggle/input/compare/pred_stats.csv')
# compare

In [19]:
# compare.sort_values(by = 'SK_ID_CURR')

### BUREAU

In [20]:
bureau = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_bureau.csv')

In [21]:
new_feature1 = predict_target(bureau, 'BUREAU_')

TARGET
0.0    1081617
NaN     291947
1.0      91761
Name: count, dtype: int64
[level 1] processing fold...
bureau auc: 0.5754986587774705 fold done in 10.84 s
[level 1] processing fold...
bureau auc: 0.5773288870336847 fold done in 10.76 s
[level 1] processing fold...
bureau auc: 0.5750765507810096 fold done in 11.45 s
[level 1] processing fold...
bureau auc: 0.5767163841252256 fold done in 12.43 s
[level 1] processing fold...
bureau auc: 0.5686600544772886 fold done in 12.74 s
{'cv_score_mean': 0.5746561070389358, 'cv_score_std': 0.0031057304218818907, 'cv_score_min': 0.5686600544772886, 'cv_score_max': 0.5773288870336847}
result['pred_test'] : (291947,)
result['pred_val'] : (1173378,)
(1465325,)


In [22]:
new_feature1.shape

(307511, 10)

In [23]:
new_feature1

Unnamed: 0,SK_ID_CURR,BUREAU_PRED_TARGET_MAX,BUREAU_PRED_TARGET_MIN,BUREAU_PRED_TARGET_MEDIAN,BUREAU_PRED_TARGET_MEAN,BUREAU_PRED_TARGET_STD,BUREAU_PRED_TARGET_SIZE,BUREAU_PRED_TARGET_SUM,BUREAU_PRED_TARGET_SKEW,BUREAU_PRED_TARGET_KURT
0,0,0.104702,0.104702,0.104702,0.104702,,1,0.104702,,
1,1,0.057289,0.057289,0.057289,0.057289,,1,0.057289,,
2,2,0.108553,0.047776,0.089058,0.086027,0.022989,6,0.516164,-0.877784,0.411802
3,3,0.107628,0.049150,0.083469,0.082163,0.023482,7,0.575141,-0.641417,-1.308238
4,4,0.096046,0.072072,0.081767,0.083464,0.011169,6,0.500786,0.170373,-2.795659
...,...,...,...,...,...,...,...,...,...,...
307506,307506,0.074391,0.068642,0.071433,0.071475,0.003191,4,0.285899,0.006413,-5.943142
307507,307507,,,,,,1,0.000000,,
307508,307508,0.145779,0.066487,0.133061,0.115117,0.031376,7,0.805817,-0.644021,-1.489377
307509,307509,0.110245,0.048925,0.079585,0.079585,0.043360,2,0.159170,,


In [24]:
#new_feature1.to_csv('bureau_new_feature-2.csv', index = False)

### Previous_application

In [25]:
pre_application = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_previous_application.csv')

In [26]:
pre_application.head(10)

Unnamed: 0,SK_ID_PREV,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,SK_ID_CURR
0,2030495,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,...,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0,293189
1,1696966,Consumer loans,68258.655,1800000.0,1754721.0,180000.0,1800000.0,SATURDAY,18,Y,...,36.0,low_normal,POS industry with interest,,,,,,,293189
2,2154916,Consumer loans,12417.39,108400.5,119848.5,0.0,108400.5,SUNDAY,14,Y,...,12.0,middle,POS industry with interest,365243.0,-512.0,-182.0,-392.0,-387.0,0.0,293189
3,2802425,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,...,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0,91587
4,1536272,Cash loans,21709.125,450000.0,512370.0,,450000.0,WEDNESDAY,9,Y,...,36.0,low_normal,Cash X-Sell: low,365243.0,-485.0,565.0,-155.0,-147.0,1.0,91587
5,2068863,Consumer loans,4830.93,47250.0,23688.0,24750.0,47250.0,THURSDAY,11,Y,...,6.0,high,POS household with interest,365243.0,-588.0,-438.0,-588.0,-580.0,0.0,91587
6,2551979,Consumer loans,6664.275,71352.0,71352.0,0.0,71352.0,WEDNESDAY,9,Y,...,12.0,low_normal,POS industry with interest,365243.0,-1176.0,-846.0,-846.0,-840.0,0.0,91587
7,2517198,Revolving loans,11250.0,0.0,225000.0,,,TUESDAY,13,Y,...,0.0,XNA,Card X-Sell,-713.0,-673.0,365243.0,-461.0,-61.0,0.0,91587
8,1760610,Consumer loans,8593.965,33052.5,33052.5,0.0,33052.5,SUNDAY,10,Y,...,4.0,low_action,POS industry with interest,365243.0,-783.0,-693.0,-753.0,-748.0,0.0,91587
9,2523466,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,...,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0,256489


In [27]:
pre_application['SK_ID_CURR'].nunique()

291057

In [28]:
new_feature2 = predict_target(pre_application, 'PRE_APPLICATION_')

TARGET
0.0    1033268
NaN     282810
1.0      97623
Name: count, dtype: int64
[level 1] processing fold...
bureau auc: 0.590261674746981 fold done in 19.00 s
[level 1] processing fold...
bureau auc: 0.5902535589116454 fold done in 18.14 s
[level 1] processing fold...
bureau auc: 0.5904613236431544 fold done in 18.12 s
[level 1] processing fold...
bureau auc: 0.5917033221512507 fold done in 17.93 s
[level 1] processing fold...
bureau auc: 0.5939954214352768 fold done in 20.11 s
{'cv_score_mean': 0.5913350601776617, 'cv_score_std': 0.0014351554414164653, 'cv_score_min': 0.5902535589116454, 'cv_score_max': 0.5939954214352768}
result['pred_test'] : (282810,)
result['pred_val'] : (1130891,)
(1413701,)


In [29]:
new_feature2.shape

(307511, 10)

In [30]:
#new_feature2.to_csv('pre_application_new_feature-2.csv', index = False)

### INSTALLMENTS

In [31]:
installments = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_installments_payments.csv')

In [32]:
installments

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR
0,1054186,1.0,6,-1180.0,-1187.0,6948.360,6948.360,147397.0
1,2452854,1.0,21,-546.0,-552.0,11302.605,11302.605,147397.0
2,1054186,1.0,2,-1300.0,-1307.0,6948.360,6948.360,147397.0
3,1682318,1.0,2,-240.0,-243.0,7374.510,7374.510,147397.0
4,2452854,1.0,10,-876.0,-882.0,11302.605,11302.605,147397.0
...,...,...,...,...,...,...,...,...
7744753,2192667,1.0,6,-2352.0,-2352.0,5322.240,5322.240,21216.0
7744754,2208281,1.0,4,-452.0,-466.0,63195.435,63195.435,21216.0
7744755,2657771,0.0,3,-2907.0,-2932.0,3375.000,3375.000,21216.0
7744756,2657771,0.0,47,-1871.0,-1871.0,4915.890,4915.890,21216.0


In [33]:
new_feature3 = predict_target(installments, 'INSTALLMENTS_')

TARGET
0.0    5716893
NaN    1554011
1.0     473853
Name: count, dtype: int64
[level 1] processing fold...
bureau auc: 0.5501204279518236 fold done in 38.15 s
[level 1] processing fold...
bureau auc: 0.5513056276339946 fold done in 38.63 s
[level 1] processing fold...
bureau auc: 0.5511696654141661 fold done in 39.00 s
[level 1] processing fold...
bureau auc: 0.5506687501451848 fold done in 42.03 s
[level 1] processing fold...
bureau auc: 0.5500334350588718 fold done in 39.07 s
{'cv_score_mean': 0.5506595812408082, 'cv_score_std': 0.0005216111933314505, 'cv_score_min': 0.5500334350588718, 'cv_score_max': 0.5513056276339946}
result['pred_test'] : (1554011,)
result['pred_val'] : (6190746,)
(7744757,)


In [34]:
new_feature3['SK_ID_CURR'] = new_feature3['SK_ID_CURR'].astype('int')

In [35]:
#new_feature3.to_csv('installments_new_feature-2.csv', index = False)

In [36]:
new_feature3.shape

(307511, 10)

### POS_CASH

In [37]:
pos_cash = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_POS_CASH_balance.csv')

In [38]:
pos_cash

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_ID_CURR
0,1803195,-31,48.0,45.0,Active,0,0,185279
1,1803195,-17,48.0,31.0,Active,0,0,185279
2,1803195,-21,48.0,35.0,Active,0,0,185279
3,1803195,-8,48.0,21.0,Active,0,0,185279
4,1803195,-4,48.0,17.0,Active,0,0,185279
...,...,...,...,...,...,...,...,...
8543370,2340627,-2,,,Signed,0,0,284649
8543371,1011796,-2,6.0,6.0,Active,0,0,69172
8543372,1298851,-1,12.0,12.0,Active,0,0,151122
8543373,1550592,-1,1.0,0.0,Completed,0,0,56549


In [39]:
new_feature4 = predict_target(pos_cash, 'POS_CASH')

TARGET
0.0    6331185
NaN    1709738
1.0     502452
Name: count, dtype: int64
[level 1] processing fold...
bureau auc: 0.5279894578307726 fold done in 58.39 s
[level 1] processing fold...
bureau auc: 0.5288175547559106 fold done in 51.99 s
[level 1] processing fold...
bureau auc: 0.5273202492886853 fold done in 58.28 s
[level 1] processing fold...
bureau auc: 0.5281356679307329 fold done in 59.38 s
[level 1] processing fold...
bureau auc: 0.5267364825817561 fold done in 58.50 s
{'cv_score_mean': 0.5277998824775715, 'cv_score_std': 0.0007134724781654086, 'cv_score_min': 0.5267364825817561, 'cv_score_max': 0.5288175547559106}
result['pred_test'] : (1709738,)
result['pred_val'] : (6833637,)
(8543375,)


In [40]:
#new_feature4.to_csv('pos_cash_new_feature-2.csv', index = False)

In [41]:
new_feature4.shape

(307511, 10)

### CREDIT CARD

In [42]:
credit_card = pd.read_csv('/kaggle/input/dseb-63-data-preparation-final-project/dseb63_final_project_DP_dataset/dseb63_final_project_DP_dataset/dseb63_credit_card_balance.csv')

In [43]:
new_feature5 = predict_target(credit_card, 'CREDIT_CARD')

TARGET
0.0    2400480
NaN     640453
1.0     187032
Name: count, dtype: int64
[level 1] processing fold...
bureau auc: 0.5677822839496403 fold done in 168.52 s
[level 1] processing fold...
bureau auc: 0.565382615459692 fold done in 37.22 s
[level 1] processing fold...
bureau auc: 0.567010536342891 fold done in 164.64 s
[level 1] processing fold...
bureau auc: 0.5689183408016217 fold done in 38.91 s
[level 1] processing fold...
bureau auc: 0.5688487149408414 fold done in 107.39 s
{'cv_score_mean': 0.5675884982989373, 'cv_score_std': 0.001311040736820188, 'cv_score_min': 0.565382615459692, 'cv_score_max': 0.5689183408016217}
result['pred_test'] : (640453,)
result['pred_val'] : (2587512,)
(3227965,)


In [44]:
#new_feature5.to_csv('credit_card_new_feature-2.csv', index = False)

In [45]:
new_feature5.shape

(307511, 10)

### MERGE

In [46]:
extra_feat = new_feature1.merge(new_feature2, on = 'SK_ID_CURR', how = 'left')
extra_feat = extra_feat.merge(new_feature3, on = 'SK_ID_CURR', how = 'left')
extra_feat = extra_feat.merge(new_feature4, on = 'SK_ID_CURR', how = 'left')
extra_feat = extra_feat.merge(new_feature5, on = 'SK_ID_CURR', how = 'left')

In [47]:
extra_feat

Unnamed: 0,SK_ID_CURR,BUREAU_PRED_TARGET_MAX,BUREAU_PRED_TARGET_MIN,BUREAU_PRED_TARGET_MEDIAN,BUREAU_PRED_TARGET_MEAN,BUREAU_PRED_TARGET_STD,BUREAU_PRED_TARGET_SIZE,BUREAU_PRED_TARGET_SUM,BUREAU_PRED_TARGET_SKEW,BUREAU_PRED_TARGET_KURT,...,POS_CASHPRED_TARGET_KURT,CREDIT_CARDPRED_TARGET_MAX,CREDIT_CARDPRED_TARGET_MIN,CREDIT_CARDPRED_TARGET_MEDIAN,CREDIT_CARDPRED_TARGET_MEAN,CREDIT_CARDPRED_TARGET_STD,CREDIT_CARDPRED_TARGET_SIZE,CREDIT_CARDPRED_TARGET_SUM,CREDIT_CARDPRED_TARGET_SKEW,CREDIT_CARDPRED_TARGET_KURT
0,0,0.104702,0.104702,0.104702,0.104702,,1,0.104702,,,...,-0.807765,0.093286,0.078397,0.079288,0.081064,0.005007,8,0.648512,2.684256,7.344561
1,1,0.057289,0.057289,0.057289,0.057289,,1,0.057289,,,...,-1.960876,0.120474,0.076763,0.108487,0.099781,0.016816,9,0.898025,-0.617925,-1.522168
2,2,0.108553,0.047776,0.089058,0.086027,0.022989,6,0.516164,-0.877784,0.411802,...,-1.287527,,,,,,1,0.000000,,
3,3,0.107628,0.049150,0.083469,0.082163,0.023482,7,0.575141,-0.641417,-1.308238,...,-0.668601,0.154151,0.076885,0.107485,0.107984,0.018188,11,1.187828,1.370123,5.025904
4,4,0.096046,0.072072,0.081767,0.083464,0.011169,6,0.500786,0.170373,-2.795659,...,-1.447188,,,,,,1,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,307506,0.074391,0.068642,0.071433,0.071475,0.003191,4,0.285899,0.006413,-5.943142,...,0.356326,,,,,,1,0.000000,,
307507,307507,,,,,,1,0.000000,,,...,,,,,,,1,0.000000,,
307508,307508,0.145779,0.066487,0.133061,0.115117,0.031376,7,0.805817,-0.644021,-1.489377,...,1.913144,0.068391,0.060810,0.065413,0.065338,0.002030,18,1.176089,-0.408746,-0.204616
307509,307509,0.110245,0.048925,0.079585,0.079585,0.043360,2,0.159170,,,...,-0.099494,0.088112,0.050035,0.068902,0.068589,0.011409,96,6.584507,-0.067173,-1.161468


In [48]:
extra_feat.to_csv('extra_feat-final.csv', index = False)