In [1]:
!pip install --upgrade scikit-learn
!pip install torch
!pip install imblearn
!pip install catboost
!pip install xgboost

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.6.1
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86

In [23]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score

In [24]:
# Load datasets
train_data = pd.read_csv("/content/Train.csv")  # Correct file
test_data = pd.read_csv("/content/Test.csv")  # Correct file
economic_data = pd.read_csv("/content/economic_indicators.csv")

In [25]:
# Feature Engineering
train_data['loan_to_repay_ratio'] = train_data['Total_Amount'] / (train_data['Total_Amount_to_Repay'] + 1)
test_data['loan_to_repay_ratio'] = test_data['Total_Amount'] / (test_data['Total_Amount_to_Repay'] + 1)

In [26]:
# Merge Economic Indicators
if 'country_id' in train_data.columns and 'country_id' in economic_data.columns:
    train_data = train_data.merge(economic_data, on='country_id', how='left')
    test_data = test_data.merge(economic_data, on='country_id', how='left')

In [27]:
# Handle Missing Values
train_data.fillna(train_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

In [7]:
# Interaction Features
train_data['duration_ratio'] = train_data['Total_Amount'] / (train_data['duration'] + 1)
test_data['duration_ratio'] = test_data['Total_Amount'] / (test_data['duration'] + 1)

In [8]:
# Merge Economic Indicators if 'country_id' exists
if 'country_id' in train_data.columns and 'country_id' in economic_data.columns:
    train_data = train_data.merge(economic_data, on='country_id', how='left')
    test_data = test_data.merge(economic_data, on='country_id', how='left')
else:
    print("Warning: 'country_id' column not found in one or both datasets. Skipping merge.")



In [9]:
# Handle Missing Values
train_data.fillna(train_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

In [10]:
# Prepare Features and Target
X_train_full = train_data.drop(columns=['ID', 'target'], errors='ignore')
y_train_full = train_data['target']

In [28]:
# Prepare Features and Target
X = train_data.drop(columns=['ID', 'target'])
y = train_data['target']

In [29]:
# Identify categorical and numerical columns
categorical_cols = [col for col in ['country_id', 'disbursement_date'] if col in train_data.columns]
numerical_cols = ['Total_Amount', 'Total_Amount_to_Repay', 'loan_to_repay_ratio']

In [30]:
# Preprocessing Pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

X_processed = preprocessor.fit_transform(X)



In [31]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

In [32]:
# Handle Imbalanced Data using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [33]:
# Class Weights Calculation
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))

In [34]:
# Define Deep Learning Model
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_resampled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
# Compile Model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [36]:
# Callbacks for Early Stopping and Learning Rate Reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

In [37]:
# Train Model
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 9ms/step - accuracy: 0.6593 - loss: 3.0048 - val_accuracy: 0.3657 - val_loss: 1.7788 - learning_rate: 0.0010
Epoch 2/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - accuracy: 0.8321 - loss: 0.3904 - val_accuracy: 0.7980 - val_loss: 0.7249 - learning_rate: 0.0010
Epoch 3/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.9020 - loss: 0.2405 - val_accuracy: 0.4807 - val_loss: 1.2278 - learning_rate: 0.0010
Epoch 4/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.9261 - loss: 0.1891 - val_accuracy: 0.8959 - val_loss: 0.3989 - learning_rate: 0.0010
Epoch 5/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.9408 - loss: 0.1559 - val_accuracy: 0.8159 - val_loss: 0.6083 - learning_rate: 0.0010
Epoch 6/100
[1m1685/1685[0m [32m━━━━━━━━━━━━━

In [38]:
# Evaluate Model
val_predictions = (model.predict(X_val) > 0.5).astype(int)
f1 = f1_score(y_val, val_predictions)
print(f"Deep Learning Model F1 Score: {f1}")

[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Deep Learning Model F1 Score: 0.5114254624591947


In [39]:
# Process Test Data
X_test = preprocessor.transform(test_data.drop(columns=['ID'], errors='ignore'))
test_predictions = (model.predict(X_test) > 0.5).astype(int)



[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step


In [40]:
# Generate Submission File
submission = pd.DataFrame({'ID': test_data['ID'], 'target': test_predictions.flatten()})
submission.to_csv('submission_deep_learning.csv', index=False)