In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Kaggle dataset slug in "owner/dataset-name" form
dataset = "kabure/german-credit-data-with-risk"

# Build an authenticated Kaggle client, then download the dataset archive
# into the current working directory and unzip it there
api = KaggleApi()
api.authenticate()
api.dataset_download_files(dataset, path=".", unzip=True)

In [None]:
# Load the credit data CSV, using its first column as the row index
df = pd.read_csv(filepath_or_buffer="german_credit_data.csv", index_col=0)

# Preview the first five rows
df.head(n=5)

In [None]:
# Standardize column names: trim, lowercase, drop every character that is not
# a-z, 0-9 or whitespace (this also removes existing underscores), then turn
# each run of whitespace into a single underscore.
df.columns = (
    df.columns.str.strip()  # Remove leading/trailing spaces
    .str.lower()  # Convert to lowercase
    .str.replace(r"[^a-z0-9\s]", "", regex=True)  # Remove special characters
    # Raw string is required: "\s" in a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    .str.replace(r"\s+", "_", regex=True)  # Replace spaces with underscores
)

# Display updated column names
df.columns

In [None]:
# Learn the mapping from the target's class labels to integer codes.
# The fitted encoder is kept so the codes can be translated back to labels.
label_encoder = LabelEncoder()
label_encoder.fit(df["risk"])

# Overwrite the target column with its integer codes
df["risk"] = label_encoder.transform(df["risk"])

# Preview the encoded target column
df["risk"].head()

In [None]:
# Set the target aside, then expand the remaining columns with
# pd.get_dummies (indicator columns for the non-numeric features)
features_only = df.drop(columns="risk")
df_dummy = pd.get_dummies(features_only)
df_dummy.head()

In [None]:
# Standardize the dummy column names: trim, lowercase, drop every character
# that is not a-z, 0-9, underscore or whitespace, then turn each run of
# whitespace into a single underscore.
df_dummy.columns = (
    df_dummy.columns.str.strip()  # Remove leading/trailing spaces
    .str.lower()  # Convert to lowercase
    .str.replace(r"[^a-z0-9_\s]", "", regex=True)  # Allow underscores to remain
    # Raw string is required: "\s" in a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    .str.replace(r"\s+", "_", regex=True)  # Replace spaces with underscores
)

# Display updated column names
df_dummy.columns


In [None]:
# Separate features and target variable
X = df_dummy  # Features (dummy-encoded columns, target excluded)
y = df.risk  # Target labels (integer-encoded risk class)

# Split data into training and testing sets (80% / 20%, fixed seed).
# Stratify on the target so both splits keep the same class proportions as
# the full data; otherwise a random split can skew the class mix of the small
# test set and distort the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Create and train a Random Forest Classifier.
# bootstrap=True with oob_score=True makes the forest also compute an
# out-of-bag accuracy estimate during fit (exposed as model.oob_score_).
model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
model.fit(X_train, y_train)

# Make predictions and evaluate accuracy on the held-out test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Use the encoder's own class list for the report row names instead of
# decoding a hard-coded [0, 1]; classes_ is ordered by encoded value, so the
# names line up with the integer labels however many classes there are.
report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)

print(f"Accuracy: {accuracy:.2f}")
# Report the out-of-bag estimate that oob_score=True already paid for; it is
# computed from training rows each tree did not see, so it gives a second
# accuracy figure that does not depend on the test split.
print(f"OOB score: {model.oob_score_:.2f}")
print("Classification Report:")
print(report)

In [None]:
# Importance score of every input feature, as exposed by the fitted forest
importances = model.feature_importances_

# Optional: list each feature name next to its score (two decimals)
names_and_scores = zip(df_dummy.columns, importances)
for feature, importance in names_and_scores:
    print(f"{feature}: {importance:.2f}")

In [None]:
# Pair each feature name with its importance score ...
feature_importance_df = pd.DataFrame(
    {"Feature": df_dummy.columns, "Importance": model.feature_importances_}
)
# ... and rank the rows from most to least important
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

# Bar chart of the ranked importances, one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    feature_importance_df["Feature"],
    feature_importance_df["Importance"],
    color="skyblue",
)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Feature Importances")
# Slant the feature names so neighbouring labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()