In [1]:
!unzip dsb-24-german-credit.zip

Archive:  dsb-24-german-credit.zip
  inflating: german_credit_test.csv  
  inflating: german_credit_test_submission.csv  
  inflating: german_credit_train.csv  


In [19]:
# prompt: open the german credit train csv file into a dataframe, and for each column, show me the unique values

import pandas as pd

# Load the training data extracted by the unzip cell.
# A missing or unreadable file is allowed to raise here: every later cell needs
# `df`, so printing a message and carrying on would only defer the failure to a
# confusing NameError further down.
df = pd.read_csv('german_credit_train.csv')

# For each column, show its distinct values followed by how many there are.
for col in df.columns:
    unique_values = df[col].unique()  # computed once, reused for both prints
    print(f"Unique values in column '{col}':")
    print(unique_values)
    print(len(unique_values))
    print("-" * 20)


Unique values in column 'CheckingStatus':
['0_to_200' 'less_0' 'no_checking' 'greater_200']
4
--------------------
Unique values in column 'LoanDuration':
[31 18 15 28 32 16 11 35  9 27 29  4 33 13 34 23 10 25 26  7 64  6 19 39
 17 38  8 30 22 20 14 12 50 41 36 24 37  5 43 48 46 21 44 40 45 53 42 61
 51 63 47 49 60 56 62 52 54]
57
--------------------
Unique values in column 'CreditHistory':
['credits_paid_to_date' 'prior_payments_delayed' 'outstanding_credit'
 'all_credits_paid_back' 'no_credits']
5
--------------------
Unique values in column 'LoanPurpose':
['other' 'car_new' 'furniture' 'retraining' 'education' 'vacation'
 'appliances' 'car_used' 'radio_tv' 'repairs' 'business']
11
--------------------
Unique values in column 'LoanAmount':
[1889  462  250 ... 6536 1597 4152]
2794
--------------------
Unique values in column 'ExistingSavings':
['100_to_500' 'less_100' '500_to_1000' 'unknown' 'greater_1000']
5
--------------------
Unique values in column 'EmploymentDuration':
['less_1

In [20]:
# Feature groups used by the preprocessing step further down: the first list is
# standard-scaled, the second is one-hot encoded. The target column 'Risk' does
# not appear in either list.
numerical_cols = [
    'LoanDuration', 'LoanAmount', 'InstallmentPercent',
    'CurrentResidenceDuration', 'Age', 'ExistingCreditsCount',
]

categorical_cols = [
    'CheckingStatus', 'CreditHistory', 'LoanPurpose',
    'ExistingSavings', 'EmploymentDuration', 'Sex',
    'OthersOnLoan', 'OwnsProperty', 'InstallmentPlans',
    'Housing', 'Job', 'Dependents',
    'Telephone', 'ForeignWorker',
]

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Nothing is read from disk here: `df` is the training frame loaded by an
# earlier cell.

# Separate the label column from the predictor columns.
target = "Risk"
y = df[target]
X = df.drop(columns=[target])

# A two-class target is mapped to integer codes; any other target is left as is.
if y.nunique() == 2:
    y = LabelEncoder().fit_transform(y)

# Local aliases for the column groups declared in an earlier cell.
num_cols = numerical_cols
cat_cols = categorical_cols

# Numeric features are standardised. There is no imputation step; add a
# SimpleImputer ahead of the scaler if missing values ever appear in the data.
num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Categorical features are one-hot encoded. `handle_unknown='ignore'` makes a
# category that was not seen during fit encode as all zeros instead of raising
# at prediction time.
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. Columns not named in either
# group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Gradient-boosted classifier. The deprecated `use_label_encoder` argument is
# no longer passed: the target is already integer-encoded before fitting, and
# recent XGBoost releases ignore the argument and warn about it.
model = XGBClassifier(eval_metric='logloss')

# Full pipeline. The step name 'model' is what the `model__...` keys in the
# hyperparameter grid and fit parameters refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Hold out 20% of the rows for a final check; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Per-row training weights, looked up from each row's encoded class.
# LabelEncoder numbers classes in sorted order, so for the two labels shown in
# this notebook's outputs 0 is 'No Risk' and 1 is 'Risk'; 'No Risk' rows
# therefore count four times as much as 'Risk' rows.
# NOTE(review): confirm this direction is the intended balance.
class_weights = {0: 0.8, 1: 0.2}
sample_weights = [class_weights[encoded_class] for encoded_class in y_train]

# Hyperparameter grid; the `model__` prefix addresses the 'model' pipeline step.
param_grid = {
    'model__n_estimators': [400, 500, 600, 700, 800],
    'model__max_depth': [2, 3, 5, 7],
    'model__learning_rate': [0.005, 0.01, 0.1, 0.2],
}

# Exhaustive search with 3-fold cross-validation, scored on plain accuracy.
# Open question carried over from the original: use a custom scoring function?
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
fit_params = {'model__sample_weight': sample_weights}
search.fit(X_train, y_train, **fit_params)

# Score the refitted best pipeline on the held-out split.
best_pipeline = search.best_estimator_
y_pred = best_pipeline.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.4f}")

# Report the winning hyperparameter combination.
print("Best parameters:", search.best_params_)


Accuracy: 0.7575
Best parameters: {'model__learning_rate': 0.2, 'model__max_depth': 2, 'model__n_estimators': 700}


In [None]:
# prompt: open the german credit test.csv, and use the above gridsearch model + preprocessing pipeline to generate predictions the test.csv

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error


# Load the competition test set.
# A load failure is allowed to raise. The previous `exit()` calls would shut
# the notebook kernel down, discarding the fitted `search` object along with
# everything else in memory.
test_df = pd.read_csv('german_credit_test.csv')

# This cell relies on `target` and the fitted `search` from the training cell.

# Drop the label column if the test file happens to contain it.
X_test_final = test_df.drop(columns=[target], errors='ignore')

# Predict with the best pipeline found by the grid search (preprocessing
# included). The result holds the integer class codes used during training.
y_pred_final = search.best_estimator_.predict(X_test_final)

# Assemble the predictions table. It is not written to disk here; a later cell
# maps the codes back to labels and saves the CSV.
# NOTE(review): `test_df.index` starts at 0, while the example format quoted in
# the later prompt comment starts Ids at 1 - confirm which the submission expects.
predictions_df = pd.DataFrame({
    'Id': test_df.index,
    'Risk': y_pred_final,
})


In [24]:
# Distribution of the predicted classes (still integer-encoded at this point).
predictions_df['Risk'].value_counts()

Risk
0    819
1    182
Name: count, dtype: int64

In [25]:
# prompt: Id,TARGET
# 1, "Risk"
# 2, "No Risk"
# 3, "Risk"
# etc.
# is the format of the predictions df that i need.

import pandas as pd
# ... (Your existing code)

# Map the integer predictions back to the original string labels.
# LabelEncoder orders classes by sorted value, so refitting it on the training
# labels reproduces the mapping used when the target was encoded for training.
le = LabelEncoder().fit(df['Risk'])
y_pred_final_labels = le.inverse_transform(y_pred_final)

# Rebuild the predictions table with readable labels and write it to disk.
# NOTE(review): the prompt comment for this cell asks for an 'Id,TARGET' header,
# but the column written here is named 'Risk' - confirm against the sample
# submission file before uploading.
predictions_df = pd.DataFrame(
    {'Id': test_df.index, 'Risk': y_pred_final_labels}
)
predictions_df.to_csv('german_credit_test_predictions.csv', index=False)


In [26]:
# Distribution of the predicted labels after decoding.
predictions_df['Risk'].value_counts()

Risk
No Risk    819
Risk       182
Name: count, dtype: int64