### Loading Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.io import arff
from pathlib import Path

# --- File paths ---
# Both ARFF files are looked up in the (resolved) current working directory.
data_dir = Path().resolve()
file1 = data_dir.joinpath("analcatdata_creditscore.arff")   # credit score dataset
file2 = data_dir.joinpath("dataset_54_vehicle.arff")        # vehicle dataset

# --- Load ARFF datasets ---
def load_arff_to_df(file_path):
    """Read an ARFF file into a pandas DataFrame with text values as ``str``.

    Parameters
    ----------
    file_path
        Passed unchanged to ``arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        The loaded records. In object-dtype columns every ``bytes`` value is
        decoded as UTF-8; all other values are left as they are.
    """
    records, _meta = arff.loadarff(file_path)   # the metadata part is not used
    frame = pd.DataFrame(records)

    def _to_text(value):
        # Only bytes need decoding; anything else passes through untouched.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    object_columns = frame.select_dtypes([object]).columns
    for name in object_columns:
        frame[name] = frame[name].map(_to_text)
    return frame

# Read both datasets through the shared ARFF helper.
df_credit = load_arff_to_df(file_path=file1)
df_vehicle = load_arff_to_df(file_path=file2)

print("Datasets loaded successfully.")

### Data exploration and preprocessing

(missing values, outliers, scaling, encoding, etc.)

#### Credit Score Dataset Overview
Entries:    100  
Columns:    7  
Response variable: "Application.accepted"  
Missing values: No  
Data types: float, object (binary)

| Column | Type | Observations |
| ----------- | ----------- | ----------- |
| Age | numeric | Range 20-55, mean ~32 |
| Income.per.dependent | numeric | not summarized here; see `describe()` output below |
| Monthly.credit.card.exp | numeric | Range 0-1898, mean~189 |
| Own.home | categorical | binary(0,1); 64 yes, 36 no |
| Self.employed | categorical | binary(0,1); 95 no, 5 yes |
| Derogatory.reports | categorical | range 0-7; 82 with no reports (0) |
| **Application.accepted** | categorical | binary(0,1); 73 yes, 27 no |

Considerations for Pre-Processing: 
- The categorical values are currently stored as String ('0', '1')
- The numeric values should be scaled. 
- Self.employed, Derogatory.reports and Application.accepted are imbalanced (heavily skewed towards one value)
- Monthly.credit.card.exp already shows outliers (max 1898 vs mean 189)
- There are no missing values 

In [None]:
# --- Credit Score Dataset Overview ---

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" line.
df_credit.info()                # 100 entries, 7 columns; dtypes: float64(3), object(4)
print(df_credit.isnull().sum())
display(df_credit.head())
# display() instead of print() so the summary table gets the same rich rendering as head()
display(df_credit.describe(include='all').transpose())

In [None]:
# Convert the string-encoded columns ('0', '1', ...) to nullable integers
string_cols = ["Own.home", "Self.employed", "Application.accepted", "Derogatory.reports"]

for col in string_cols:
    missing_before = df_credit[col].isna().sum()
    df_credit[col] = pd.to_numeric(df_credit[col], errors="coerce").astype("Int64")
    # errors="coerce" silently turns unparseable entries into <NA>; report them
    # so that bad values are noticed here and not later during model fitting.
    newly_missing = df_credit[col].isna().sum() - missing_before
    if newly_missing > 0:
        print(f"Warning: {newly_missing} value(s) in '{col}' could not be parsed and are now <NA>")

# Verify conversion: list the distinct values that remain in every converted column
for col in string_cols:
    print(f"{col}: {sorted(df_credit[col].dropna().unique().tolist())}")


In [None]:
# ---- Visualization of categorical features distribution ---
cols_to_plot = ["Own.home", "Self.employed", "Application.accepted"]
# Same 0/1 colours as the pairplot palette, so a value keeps its colour across figures
colors = {0: "#fc8d62", 1: "#66c2a5"}

# One row per feature, one column per value; values a feature never takes count as 0
counts = {col: df_credit[col].value_counts().sort_index() for col in cols_to_plot}
counts_df = pd.DataFrame(counts).T.fillna(0).astype(int)

fig, ax = plt.subplots(figsize=(6, 4))
counts_df.plot(
    kind="bar",
    stacked=True,
    color=colors,
    rot=0,                      # keep the feature names horizontal on the x-axis
    ax=ax,
    edgecolor="black"
)

ax.set_ylabel("Count")
ax.set_xlabel("")
ax.set_title("Distribution of Categorical Features")
ax.legend(title="Value", loc="upper right")

for container in ax.containers:                                         # count labels
    ax.bar_label(container, label_type='center', color="white", fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# --- Correlation Heatmap ---
# Correlate only the numeric columns of the credit data.
numeric_df = df_credit.select_dtypes(include=[np.number])
corr = numeric_df.corr()

heatmap_kwargs = dict(annot=True, cmap="coolwarm", fmt=".2f", square=True)
fig_corr, ax_corr = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, ax=ax_corr, **heatmap_kwargs)
ax_corr.set_title("Feature Correlation Matrix")
plt.show()

# --- Pairplot ---
# Scatter matrix of the selected features, coloured by the response variable.
pairplot_features = ["Age", "Income.per.dependent", "Monthly.credit.card.exp", "Derogatory.reports"]
pairplot_columns = [*pairplot_features, "Application.accepted"]
class_palette = {1: "#66c2a5", 0: "#fc8d62"}
sns.pairplot(
    df_credit[pairplot_columns],
    hue="Application.accepted",
    diag_kind="kde",
    palette=class_palette,
)
plt.show()



#### Vehicle Dataset Overview
Entries: 846  
Columns: 19  
Response variable: "class"  
Missing values: No  
Data types: float, object (String)

In [None]:
# --- Vehicle Dataset Overview ---
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" line.
df_vehicle.info()            # 846 entries, 19 columns; dtypes: float64(18), object(1)
print(df_vehicle.isnull().sum())
display(df_vehicle.head())
# Last expression of the cell: rendered as the cell output
df_vehicle.describe(include='all').transpose()


### Classification

Carry out the classification:
- Run classifiers, and Experiment with:
    - Different classifiers and your datasets
    - Different parameter settings (= several results per classifier per dataset, not only
random/best)

In [None]:
# Model 0: Logistic Regression model with all predictors and no scaling/data cleaning beforehand
# Later to be used as comparison

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Cast pandas' nullable Int64 columns to plain NumPy dtypes before handing the data to
# scikit-learn: not every scikit-learn release accepts the nullable extension dtypes
# (the target in particular can be rejected as an unknown label type).
X = df_credit.drop(columns=["Application.accepted"]).astype("float64")   # predictors
y = df_credit["Application.accepted"].astype("int64")                    # response

# Split data: 2/3 train, 1/3 test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    random_state=42,     # seed, so the split is reproducible
    stratify=y           # keep the accepted/rejected ratio equal in both parts
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Fit model
model_logreg = LogisticRegression(max_iter=1000, random_state=42)
model_logreg.fit(X_train, y_train)

# Predict
y_pred = model_logreg.predict(X_test)
y_prob = model_logreg.predict_proba(X_test)[:, 1]  # probability of class 1

print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))


##### Model 0 report: 

The model has 100% accuracy which means there have to be issues with the data. This is also visible in the confusion matrix: The model only predicted true negatives (9) and true positives (25), but neither false negatives nor false positives.

The likely reason is the small size of the data set. Some variables (like `Derogatory.reports`) have such a strong correlation with the response variable that they are essentially deterministic predictors - the model only needs to know these variables to make a correct prediction about whether an application was accepted or not.

Looking at the mean values of accepted and rejected applications we can see that `Monthly.credit.card.exp` for rejected applications is *0* and that `Derogatory.reports` varies wildly (0.1 vs 1+).

Looking at the two suspicious variables further it becomes clear that the monthly.credit.card.exp is a very strong indicator for whether the application gets accepted or not - *all* 27 rejected applications have a value of 0. 

In [None]:
# Mean of every other column, computed separately for each value of the response
by_acceptance = df_credit.groupby("Application.accepted")
by_acceptance.mean()

In [None]:
# Summary statistics of the two suspicious variables, split by the response
suspicious_cols = ["Monthly.credit.card.exp", "Derogatory.reports"]
df_credit.groupby("Application.accepted")[suspicious_cols].describe()

### Performance

Evaluate and analyse the performance (primarily effectiveness, but also provide basic
details on efficiency):
- Choose suitable, multiple performance measures
- Make valid comparisons (among the classifiers, across your datasets, parameters,
preprocessing effects...)
- (How) can you improve the results?
- Can you identify any patterns/trends?
    - Which methods work well and which did not, is there e.g. one method
outperforming the others on all datasets?
    - How do the results change when preprocessing strategies change? How sensitive
is an algorithm to parameter settings?
    - Are there differences across the datasets? Design your experiments so that you
can investigate the influence of single parameters

In [None]:
# Inspect the column dtypes of the training predictors
X_train.dtypes

### Holdout vs Cross-Validation

- Pay attention to your splits and settings
Are there differences? Why? In which metrics? What could have caused it?
- Compare/document changes in runtime behaviour with the changing e.g. dataset size

### Summary