In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [23]:
# Load dataset (replace with actual data)
df = pd.read_csv('german_credit_train.csv')

In [24]:
df[['Risk', 'EmploymentDuration']].groupby('EmploymentDuration').value_counts()

EmploymentDuration  Risk   
1_to_4              No Risk    878
                    Risk       272
4_to_7              No Risk    644
                    Risk       468
greater_7           Risk       520
                    No Risk    218
less_1              No Risk    693
                    Risk        63
unemployed          No Risk    236
                    Risk         7
Name: count, dtype: int64

In [35]:
def create_features(df):

    df_fe = df.copy()

    # 1. High Installment Percentage → risky if ≥ 4
    df_fe['is_high_installment'] = (df_fe['InstallmentPercent'] >= 4).astype(int)

    # 2. Young borrower → risky if Age < 40
    df_fe['is_young'] = (df_fe['Age'] < 40).astype(int)

    # 3. Long loan duration → risky if ≥ 30 months
    df_fe['is_long_loan'] = (df_fe['LoanDuration'] >= 30).astype(int)

    # 4. Large loan amount → risky if ≥ 5000
    df_fe['is_large_loan'] = (df_fe['LoanAmount'] >= 5000).astype(int)

    # 5. Long residence duation → risky if ≥ 4 years
    df_fe['is_long_residence'] = (df_fe['CurrentResidenceDuration'] >= 4).astype(int)

    # 6. No checking account
    df_fe['no_checking'] = (df_fe['CheckingStatus'] == 'no_checking').astype(int)

    # 7. Poor credit history
    df_fe['bad_credit'] = df_fe['CreditHistory'].isin([
        'outstanding_credit', 'prior_payments_delayed'
    ]).astype(int)

    # 8. High risk loan purpose
    df_fe['risky_purpose'] = df_fe['LoanPurpose'].isin([
        'appliances', 'business', 'education', 'other', 'ratio_tv', 'repairs'
    ]).astype(int)

    # 9. Employment instability → less than 1 year or unemployed
    df_fe['long_employment'] = df_fe['EmploymentDuration'].isin([
        'greater_7', '4_to_7'
    ]).astype(int)

    return df_fe

In [26]:
numerical_cols = [
    'LoanDuration',
    'LoanAmount',
    'InstallmentPercent',
    'CurrentResidenceDuration',
    'Age',
    'ExistingCreditsCount',
]
categorical_cols = [
  'CheckingStatus',
  'CreditHistory',
 'LoanPurpose',
 'ExistingSavings',
 'EmploymentDuration',
 'Sex',
 'OthersOnLoan',
 'OwnsProperty',
 'InstallmentPlans',
 'Housing',
 'Job',
 'Dependents',
 'Telephone',
 'ForeignWorker',]

binary_cols = [
    'is_high_installment',
    'is_young',
    'is_long_loan',
    'is_large_loan',
    'is_long_residence',
    'no_checking',
    'bad_credit',
    'long_employment',
    'risky_purpose'
]

In [27]:

# Identify categorical and numerical columns
cat_cols = categorical_cols
num_cols = numerical_cols

# Preprocessing for numerical data
num_transformer = Pipeline([
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
cat_transformer = Pipeline([
    #('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
    ('binary', 'passthrough', binary_cols)
])

# Choose model type (classification or regression)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', )

# Create full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])


In [41]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer


def compute_costs(LoanAmount):
     return({'Risk_No Risk': 5.0 + .6 * LoanAmount, 'No Risk_No Risk': 1.0 - .05 * LoanAmount,
         'Risk_Risk': 1.0, 'No Risk_Risk': 1.0})
def custom_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
   '''
   A custom metric for the German credit dataset
   '''
   real_prop = {'Risk': .02, 'No Risk': .98}
   train_prop = {'Risk': 1/3, 'No Risk': 2/3}
   custom_weight = {'Risk': real_prop['Risk']/train_prop['Risk'], 'No Risk': real_prop['No Risk']/train_prop['No Risk']}
   costs = compute_costs(solution['LoanAmount'])
   y_true = solution['Risk']
   y_pred = submission['Risk']
   loss = (y_true=='Risk') * custom_weight['Risk'] *\
               ((y_pred=='Risk') * costs['Risk_Risk'] + (y_pred=='No Risk') * costs['Risk_No Risk']) +\
            (y_true=='No Risk') * custom_weight['No Risk'] *\
               ((y_pred=='Risk') * costs['No Risk_Risk'] + (y_pred=='No Risk') * costs['No Risk_No Risk'])
   # return loss.mean()
   return -np.mean(loss)


# Custom scorer that needs access to X
def cost_scorer(estimator, X, y_true):
    # Make predictions
    y_pred = estimator.predict(X)

    # Reconstruct `solution` and `submission` DataFrames
    solution = pd.DataFrame({
        'Risk': ['Risk' if val == 1 else 'No Risk' for val in y_true],
        'LoanAmount': X['LoanAmount'].values  # X must be the raw DataFrame
    })

    submission = pd.DataFrame({
        'Risk': ['Risk' if val == 1 else 'No Risk' for val in y_pred]
    })

    return custom_score(solution, submission, row_id_column_name=None)

# scorer = make_scorer(cost_scorer, greater_is_better=True)


In [None]:
# Define target and features
target = "Risk"
X = df.drop(columns=[target])
X = create_features(X)
y = df[target]

# Binary encode target variable if classification
y = LabelEncoder().fit_transform(y) if y.nunique() == 2 else y


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# #this can be a way to find the right balance.
# class_weights = {0: 0.8, 1: 0.2}
# sample_weights = [class_weights[label] for label in y_train]


# Define parameter grid for hyperparameter tuning
param_grid = {
    'model__n_estimators': [400, 500, 600, 700, 800],
    'model__max_depth': [2, 3, 5, 7],
    'model__learning_rate': [0.005, 0.01, 0.1, 0.2]
}


# Perform grid search - custom scoring loss function here?
search = GridSearchCV(pipeline, param_grid, cv=5, scoring=cost_scorer, n_jobs=-1)
search.fit(X_train, y_train,
        #    model__sample_weight=sample_weights
           )

In [None]:
# Evaluate model
y_pred = search.best_estimator_.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.4f}")

# Best parameters
print("Best parameters:", search.best_params_)

Accuracy: 0.7825
Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 7, 'model__n_estimators': 400}


In [43]:

# Load the test dataset
try:
    test_df = pd.read_csv('german_credit_test.csv')
except FileNotFoundError:
    print("Error: 'german_credit_test.csv' not found. Please make sure the file exists and the name is correct.")
    exit() # Exit the script if the file is not found
except Exception as e:
    print(f"An error occurred while loading the test data: {e}")
    exit()

# Assuming you have already defined the following from the previous code block:
# numerical_cols, categorical_cols, target, preprocessor, model, pipeline, search (with the fitted model)

# Prepare the test data
X_test_final = test_df.drop(columns=[target], errors='ignore') # Drop target column if it exists
X_test_final = create_features(X_test_final)  # Create features

# Generate predictions
search.fit(X, y)  # Refit using the full dataset
y_pred_final = search.best_estimator_.predict(X_test_final)


# Optionally save predictions to a file
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final}
                              )
# predictions_df.to_csv('german_credit_test_predictions.csv', index=False)


In [44]:
# prompt: Id,TARGET
# 1, "Risk"
# 2, "No Risk"
# 3, "Risk"
# etc.
# is the format of the predictions df that i need.

import pandas as pd
# ... (Your existing code)

# Inverse transform the predictions if necessary
le = LabelEncoder()
le.fit(df['Risk'])
y_pred_final_labels = le.inverse_transform(y_pred_final)


# Optionally save predictions to a file
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final_labels # Use 'TARGET' as column name
})
#print(predictions_df['TARGET'].value_counts())
predictions_df.to_csv('german_credit_test_predictions.csv', index=False)
