In [2]:
!pip install --upgrade scikit-learn
!pip install torch
!pip install imblearn
!pip install catboost
!pip install xgboost

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

In [4]:
# Load datasets
train_data = pd.read_csv("/content/Train.csv")
test_data = pd.read_csv("/content/Test.csv")
economic_data = pd.read_csv("/content/economic_indicators.csv")


In [5]:
# Feature Engineering
train_data['loan_to_repay_ratio'] = train_data['Total_Amount'] / (train_data['Total_Amount_to_Repay'] + 1)
test_data['loan_to_repay_ratio'] = test_data['Total_Amount'] / (test_data['Total_Amount_to_Repay'] + 1)

train_data['amount_duration_interaction'] = train_data['Total_Amount'] * train_data['duration']
test_data['amount_duration_interaction'] = test_data['Total_Amount'] * test_data['duration']

In [6]:
# Merge Economic Indicators
if 'country_id' in train_data.columns and 'country_id' in economic_data.columns:
    train_data = train_data.merge(economic_data, on='country_id', how='left')
    test_data = test_data.merge(economic_data, on='country_id', how='left')

In [7]:
# Handle Missing Values
train_data.fillna(train_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)


In [8]:
# Prepare Features and Target
X = train_data.drop(columns=['ID', 'target'])
y = train_data['target']

In [9]:
# Identify categorical and numerical columns
categorical_cols = [col for col in ['country_id', 'disbursement_date'] if col in train_data.columns]
numerical_cols = ['Total_Amount', 'Total_Amount_to_Repay', 'loan_to_repay_ratio', 'amount_duration_interaction']

In [10]:
# Preprocessing Pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

X_processed = preprocessor.fit_transform(X)

In [11]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)


In [12]:
# Handle Imbalanced Data using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [13]:
# Class Weights Calculation
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))

In [14]:
# Define Deep Learning Model
model = Sequential([
    Dense(512, kernel_regularizer=l2(0.001), input_shape=(X_train_resampled.shape[1],)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.4),

    Dense(256, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Compile Model
model.compile(optimizer=tf.keras.optimizers.AdamW(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])



In [17]:
# Callbacks for Early Stopping and Learning Rate Reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min') # Changed the file extension to .keras

In [18]:
# Train Model
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=64,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr, model_checkpoint]
)


Epoch 1/200
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 20ms/step - accuracy: 0.6597 - auc: 0.7680 - loss: 4.9103 - val_accuracy: 0.0629 - val_auc: 0.8918 - val_loss: 4.4351 - learning_rate: 5.0000e-04
Epoch 2/200
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 20ms/step - accuracy: 0.7876 - auc: 0.9449 - loss: 1.2894 - val_accuracy: 0.0234 - val_auc: 0.9267 - val_loss: 3.7826 - learning_rate: 5.0000e-04
Epoch 3/200
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 20ms/step - accuracy: 0.8689 - auc: 0.9735 - loss: 0.9733 - val_accuracy: 0.5749 - val_auc: 0.9548 - val_loss: 1.8045 - learning_rate: 5.0000e-04
Epoch 4/200
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 21ms/step - accuracy: 0.8980 - auc: 0.9786 - loss: 0.7653 - val_accuracy: 0.7360 - val_auc: 0.9603 - val_loss: 1.2285 - learning_rate: 5.0000e-04
Epoch 5/200
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 19ms/step -

In [20]:
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min') # Changed the file extension to .keras

In [21]:
# Load the best model
model.load_weights('best_model.keras') # Changed the file name to best_model.keras

In [22]:
# Evaluate Model
val_predictions_proba = model.predict(X_val)
val_predictions = (val_predictions_proba > 0.5).astype(int)
f1 = f1_score(y_val, val_predictions)
auc = roc_auc_score(y_val, val_predictions_proba)
print(f"Deep Learning Model F1 Score: {f1}")
print(f"Deep Learning Model AUC Score: {auc}")

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Deep Learning Model F1 Score: 0.4321796071094481
Deep Learning Model AUC Score: 0.9844493550814495


In [23]:
# Process Test Data
X_test = preprocessor.transform(test_data.drop(columns=['ID'], errors='ignore'))
test_predictions = (model.predict(X_test) > 0.5).astype(int)



[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


In [24]:
# Generate Submission File
submission = pd.DataFrame({'ID': test_data['ID'], 'target': test_predictions.flatten()})
submission.to_csv('111.csv', index=False)