In [2]:
!unzip dsb-24-german-credit.zip

Archive:  dsb-24-german-credit.zip
  inflating: german_credit_test.csv  
  inflating: german_credit_test_submission.csv  
  inflating: german_credit_train.csv  


In [3]:

import pandas as pd
df = pd.read_csv('german_credit_train.csv')

In [4]:
numerical_cols = [
    'LoanDuration',
    'LoanAmount',
    'InstallmentPercent',
    'CurrentResidenceDuration',
    'Age',
    'ExistingCreditsCount',
]
categorical_cols = [
  'CheckingStatus',
  'CreditHistory',
 'LoanPurpose',
 'ExistingSavings',
 'EmploymentDuration',
 'Sex',
 'OthersOnLoan',
 'OwnsProperty',
 'InstallmentPlans',
 'Housing',
 'Job',
 'Dependents',
 'Telephone',
 'ForeignWorker',]

In [10]:
from sklearn.experimental import enable_halving_search_cv

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, StackingClassifier


# Load dataset (replace with actual data)

# Define target and features
target = "Risk"
X = df.drop(columns=[target])
y = df[target]

# Binary encode target variable if classification
y = LabelEncoder().fit_transform(y) if y.nunique() == 2 else y

# Identify categorical and numerical columns
cat_cols = categorical_cols
num_cols = numerical_cols

# Preprocessing for numerical data
num_transformer = Pipeline([
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
cat_transformer = Pipeline([
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
]

# Define meta-model
meta_model = LogisticRegression()

# Create Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)


In [18]:
# Create Pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor),  # Feature scaling
    ('stacking', stacking_clf)     # Stacking classifier
])


In [13]:
import os

# Define hyperparameter grid
os.environ["XGB_CUDA"] = "1"  # Force XGBoost to use GPU


In [84]:


# Define expanded hyperparameter grid
param_grid = {
    'stacking__rf__n_estimators': [50, 100, 500],
    #'stacking__rf__max_depth': [None, 10, 20],
    'stacking__rf__min_samples_split': [2, 5, 10],
    #'stacking__rf__min_samples_leaf': [1, 2, 4],

    'stacking__xgb__n_estimators': [50, 100, 500],
    #'stacking__xgb__max_depth': [3, 5, 10],
    'stacking__xgb__learning_rate': [0.01, 0.1, 0.2],
    #'stacking__xgb__subsample': [0.6, 0.8, 1.0],
    #'stacking__xgb__colsample_bytree': [0.6, 0.8, 1.0],

    'stacking__knn__n_neighbors': [3, 7],
    #'stacking__knn__weights': ['uniform', 'distance'],
    'stacking__knn__metric': ['euclidean', 'manhattan', 'minkowski']
}


# Grid Search with 3-fold cross-validation
grid_search = HalvingGridSearchCV(pipeline, param_grid, factor=2, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Best model evaluation
best_model = grid_search.best_estimator_

n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 15
max_resources_: 3999
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 486
n_resources: 15
Fitting 3 folds for each of 486 candidates, totalling 1458 fits
----------
iter: 1
n_candidates: 243
n_resources: 30
Fitting 3 folds for each of 243 candidates, totalling 729 fits
----------
iter: 2
n_candidates: 122
n_resources: 60
Fitting 3 folds for each of 122 candidates, totalling 366 fits
----------
iter: 3
n_candidates: 61
n_resources: 120
Fitting 3 folds for each of 61 candidates, totalling 183 fits
----------
iter: 4
n_candidates: 31
n_resources: 240
Fitting 3 folds for each of 31 candidates, totalling 93 fits
----------
iter: 5
n_candidates: 16
n_resources: 480
Fitting 3 folds for each of 16 candidates, totalling 48 fits
----------
iter: 6
n_candidates: 8
n_resources: 960
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 7
n_candidates: 4
n_resources: 19

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [19]:
#pipeline.fit(X,y)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [85]:
# prompt: open the german credit test.csv, and use the above gridsearch model + preprocessing pipeline to generate predictions the test.csv

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error


# Load the test dataset
test_df = pd.read_csv('german_credit_test.csv')

# Assuming you have already defined the following from the previous code block:
# numerical_cols, categorical_cols, target, preprocessor, model, pipeline, search (with the fitted model)

# Prepare the test data
X_test_final = test_df.drop(columns=[target], errors='ignore') # Drop target column if it exists

# Generate predictions
y_pred_final = best_model.predict(X_test_final)


# Optionally save predictions to a file
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final}
                              )
predictions_df.to_csv('german_credit_test_predictions.csv', index=False)


In [86]:
predictions_df.Risk.value_counts()

Unnamed: 0_level_0,count
Risk,Unnamed: 1_level_1
0,755
1,246


In [87]:
import pandas as pd
# ... (Your existing code)

# Inverse transform the predictions if necessary
le = LabelEncoder()
le.fit(df['Risk'])
y_pred_final_labels = le.inverse_transform(y_pred_final)


# Optionally save predictions to a file
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final_labels # Use 'TARGET' as column name
})
#print(predictions_df['TARGET'].value_counts())
predictions_df.to_csv('german_credit_test_predictions.csv', index=False)


In [88]:
predictions_df.Risk.value_counts()

Unnamed: 0_level_0,count
Risk,Unnamed: 1_level_1
No Risk,755
Risk,246


In [89]:
from google.colab import files
files.download("german_credit_test_predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
#can we use the probability predictions, and then do some type of estimated value?

#get the predicted probs, get loan amounts.
#payoff =
#(prob(1) * loss if 1 wrong + gain if 1 right) vs (prob(0) * loss if 0 wrong + gain if 0 right), then pick the one with the greatest value
#this should balance the confidence automatically?

1 = risk, 0 = no risk

In [31]:
real_prop = {'Risk': .02, 'No Risk': .98}
train_prop = {'Risk': 1/3, 'No Risk': 2/3}
custom_weight = {'Risk': real_prop['Risk']/train_prop['Risk'], 'No Risk': real_prop['No Risk']/train_prop['No Risk']}

In [32]:
custom_weight

{'Risk': 0.060000000000000005, 'No Risk': 1.47}

In [None]:
custom_weight

In [34]:
def compute_costs(LoanAmount):
     return({'Risk_No Risk': 5.0 + .6 * LoanAmount, 'No Risk_No Risk': 1.0 - .05 * LoanAmount,
         'Risk_Risk': 1.0, 'No Risk_Risk': 1.0})

In [79]:
def custom_preds(prob_1s, LoanAmount):

  preds = []
  for i in range(len(prob_1s)):
    #expected value of 1, when No Risk:
    expected_val_1 = prob_1s[i]*(custom_weight['No Risk']*(5.0 + .6 * LoanAmount[i]) + custom_weight['Risk']*1)
    expected_val_0 = (1-prob_1s[i])*(custom_weight['No Risk']*(0.05 * LoanAmount[i]) + custom_weight['Risk']*1)

    #print(expected_val_0, expected_val_1)

    if expected_val_1 > expected_val_0:
      preds.append(0)
    else:
      preds.append(1)

  return preds

In [80]:
probs_1s = pipeline.predict_proba(X_test_final)[:, 1].flatten()

In [81]:
y_pred_final = custom_preds(probs_1s, test_df['LoanAmount'])

In [82]:
y_pred_final = custom_preds(probs_1s, test_df['LoanAmount'])
# Inverse transform the predictions if necessary
le = LabelEncoder()
le.fit(df['Risk'])
y_pred_final_labels = le.inverse_transform(y_pred_final)


# Optionally save predictions to a file
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final_labels # Use 'TARGET' as column name
})
#print(predictions_df['TARGET'].value_counts())
predictions_df.to_csv('german_credit_test_predictions.csv', index=False)

In [83]:
predictions_df.Risk.value_counts()

Unnamed: 0_level_0,count
Risk,Unnamed: 1_level_1
No Risk,836
Risk,165
