In [83]:
import pandas as pd

# Load dataset
df = pd.read_csv(r"D:\credit score\german_credit_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [84]:
# Show basic info (column names, types, missing values)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
dtypes: int64(5), object(5)
memory usage: 78.3+ KB
None


In [85]:
# Summary statistics for numeric columns
print(df.describe())

        Unnamed: 0          Age          Job  Credit amount     Duration
count  1000.000000  1000.000000  1000.000000    1000.000000  1000.000000
mean    499.500000    35.546000     1.904000    3271.258000    20.903000
std     288.819436    11.375469     0.653614    2822.736876    12.058814
min       0.000000    19.000000     0.000000     250.000000     4.000000
25%     249.750000    27.000000     2.000000    1365.500000    12.000000
50%     499.500000    33.000000     2.000000    2319.500000    18.000000
75%     749.250000    42.000000     2.000000    3972.250000    24.000000
max     999.000000    75.000000     3.000000   18424.000000    72.000000


In [86]:
print(df.isnull().sum())

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64


In [87]:
print(df['Credit amount'].value_counts())

Credit amount
1478    3
1262    3
1258    3
1275    3
1393    3
       ..
1459    1
882     1
3758    1
1136    1
4576    1
Name: count, Length: 921, dtype: int64


### Data preprocessing


In [88]:
# Missing Value Handle
# Numeric columns → median
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Categorical columns → mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [89]:
print(df.isnull().sum())

Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
dtype: int64


### Encode Categorical Variables

In [90]:
df = pd.get_dummies(df, drop_first=True) # Use oned hot encoding

### Feature Scaling

In [91]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('Credit amount', axis=1))

In [92]:
# Train-Test split
from sklearn.model_selection import train_test_split

X = df.drop('Credit amount', axis=1)  # Features
y = df['Credit amount']               # Target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


### Model Training


### Logistic Regression

In [93]:
print("Unique labels in y_train:", len(set(y_train)))
print("Total samples in y_train:", len(y_train))
print("Sample labels:", list(set(y_train))[:10])


Unique labels in y_train: 750
Total samples in y_train: 800
Sample labels: [2051, 6148, 2058, 4110, 2063, 2064, 4113, 2069, 2073, 2080]


In [94]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with more iterations
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_model.fit(X_train_scaled, y_train)


  y_type = type_of_target(y, input_name="y")


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [95]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


MSE: 4015930.080895964
R² Score: 0.3399192534633566


In [96]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 1️⃣ Check target
print("Unique labels in y_train:", len(set(y_train)))
print("Total samples in y_train:", len(y_train))

# 2️⃣ Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3️⃣ Train model with more iterations
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_model.fit(X_train_scaled, y_train)

# 4️⃣ Predictions
y_pred_lr = lr_model.predict(X_test_scaled)
y_prob_lr = lr_model.predict_proba(X_test_scaled)

# 5️⃣ ROC-AUC handling for binary or multi-class
from sklearn.metrics import roc_auc_score, classification_report

print("\nLogistic Regression")
print(classification_report(y_test, y_pred_lr, zero_division=0))

try:
    if len(set(y_test)) == 2:  # Binary classification
        roc_auc = roc_auc_score(y_test, y_prob_lr[:, 1])
    else:  # Multi-class classification
        roc_auc = roc_auc_score(y_test, y_prob_lr, multi_class='ovr')
    print("ROC-AUC:", roc_auc)
except Exception as e:
    print("Could not calculate ROC-AUC:", e)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

# 6️⃣ Report
print("\nLogistic Regression")
print(classification_report(y_test, y_pred_lr, zero_division=0))
print("ROC-AUC:", roc_auc_score)


Unique labels in y_train: 750
Total samples in y_train: 800


  y_type = type_of_target(y, input_name="y")



Logistic Regression
              precision    recall  f1-score   support

         276       0.00      0.00      0.00       1.0
         368       0.00      0.00      0.00       1.0
         385       0.00      0.00      0.00       1.0
         433       0.00      0.00      0.00       1.0
         458       0.00      0.00      0.00       1.0
         484       0.00      0.00      0.00       1.0
         518       0.00      0.00      0.00       1.0
         585       0.00      0.00      0.00       1.0
         590       0.00      0.00      0.00       0.0
         601       0.00      0.00      0.00       1.0
         609       0.00      0.00      0.00       1.0
         626       0.00      0.00      0.00       1.0
         629       0.00      0.00      0.00       0.0
         660       0.00      0.00      0.00       1.0
         662       0.00      0.00      0.00       1.0
         672       0.00      0.00      0.00       0.0
         684       0.00      0.00      0.00       1.0
      

  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) f

### Decision Tree

In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score


#  Identify target column
target_col = 'Credit amount'  # Change to your actual target column

#  Handle categorical variables (convert to numeric)
df = pd.get_dummies(df, drop_first=True)

# Separate features (X) and target (y)
X = df.drop(target_col, axis=1)
y = df[target_col]

#  Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

#  Predictions
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]

# 8. Evaluation
print("Decision Tree Results")
print(classification_report(y_test, y_pred))

df['Credit'] = df['Credit amount'].map({'good': 1, 'bad': 0}) 

Decision Tree Results
              precision    recall  f1-score   support

         276       0.00      0.00      0.00       1.0
         338       0.00      0.00      0.00       0.0
         368       0.00      0.00      0.00       1.0
         385       0.00      0.00      0.00       1.0
         433       0.00      0.00      0.00       1.0
         458       0.00      0.00      0.00       1.0
         484       0.00      0.00      0.00       1.0
         518       0.00      0.00      0.00       1.0
         585       0.00      0.00      0.00       1.0
         601       0.00      0.00      0.00       1.0
         609       0.00      0.00      0.00       1.0
         626       0.00      0.00      0.00       1.0
         640       0.00      0.00      0.00       0.0
         652       0.00      0.00      0.00       0.0
         654       0.00      0.00      0.00       0.0
         660       0.00      0.00      0.00       1.0
         662       0.00      0.00      0.00       1.0
     

  y_type = type_of_target(y, input_name="y")
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
  ys_types = set(type_of_target(x) for x in ys)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  ys_types = set(type_of_target(x) for x in ys)
 

### Random Forest

In [98]:
# 1. Check if df is empty
print("Shape of df:", df.shape)

# 2. Check if required columns exist
print("Columns in df:", df.columns.tolist())

# 3. Look at first few rows
print(df.head())

# 4. Ensure no NaN issues before splitting
df = df.dropna(subset=['Age', 'Job', 'Duration', 'Credit amount'])  # Drop rows where target/important features are missing


Shape of df: (1000, 21)
Columns in df: ['Unnamed: 0', 'Age', 'Job', 'Credit amount', 'Duration', 'Sex_male', 'Housing_own', 'Housing_rent', 'Saving accounts_moderate', 'Saving accounts_quite rich', 'Saving accounts_rich', 'Checking account_moderate', 'Checking account_rich', 'Purpose_car', 'Purpose_domestic appliances', 'Purpose_education', 'Purpose_furniture/equipment', 'Purpose_radio/TV', 'Purpose_repairs', 'Purpose_vacation/others', 'Credit']
   Unnamed: 0  Age  Job  Credit amount  Duration  Sex_male  Housing_own  \
0           0   67    2           1169         6      True         True   
1           1   22    2           5951        48     False         True   
2           2   49    1           2096        12      True         True   
3           3   45    2           7882        42      True        False   
4           4   53    2           4870        24      True        False   

   Housing_rent  Saving accounts_moderate  Saving accounts_quite rich  ...  \
0         False      

In [99]:
X = df[['Age', 'Job', 'Duration', 'Sex_male', 'Housing_own', 'Housing_rent',
        'Saving accounts_moderate', 'Saving accounts_quite rich', 'Saving accounts_rich',
        'Checking account_moderate', 'Checking account_rich', 'Purpose_car',
        'Purpose_domestic appliances', 'Purpose_education', 'Purpose_furniture/equipment',
        'Purpose_radio/TV', 'Purpose_repairs', 'Purpose_vacation/others', 'Credit']]

y = df['Credit amount']

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (1000, 19)
Shape of y: (1000,)


In [100]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Train
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MSE: 4338909.491572499
RMSE: 2083.0049187585946
MAE: 1416.74455
R²: 0.2868325497058627


In [101]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert 'Credit amount' to categories
bins = [0, 1000, 3000, df['Credit amount'].max()]
labels = ['Low', 'Medium', 'High']
df['Credit_Category'] = pd.cut(df['Credit amount'], bins=bins, labels=labels)

# Define target
y = df['Credit_Category']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred, average='macro'))


Accuracy: 0.57
Precision: 0.49277784362530125
Recall: 0.46211451778462087
F1 Score: 0.4562275764509143


### model compare

In [108]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}

# Train, predict, and evaluate
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results.append([
        name,
        round(r2_score(y_test, preds), 4),  # R² score
        round(mean_absolute_error(y_test, preds), 4),  # MAE
        round(np.sqrt(mean_squared_error(y_test, preds)), 4)  # RMSE
    ])

# Display table
df_results = pd.DataFrame(results, columns=["Model", "R² Score", "MAE", "RMSE"])
print(df_results)






               Model  R² Score        MAE       RMSE
0  Linear Regression    0.3400  1392.7600  2003.8382
1      Decision Tree   -0.4816  1913.0900  3002.3302
2      Random Forest    0.2334  1424.8649  2159.6856


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

# Convert continuous target into categories
# Adjust bins as per your data range
bins = [0, 5000, 10000, float('inf')]  
labels = [0, 1, 2]  # 0=Low, 1=Medium, 2=High
y_class = pd.cut(y, bins=bins, labels=labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Train, predict, and evaluate
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results.append([
        name,
        round(accuracy_score(y_test, preds), 4),
        round(precision_score(y_test, preds, average='weighted'), 4),
        round(recall_score(y_test, preds, average='weighted'), 4),
        round(f1_score(y_test, preds, average='weighted'), 4)
    ])

# Display results
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])
print(df_results)


  type_true = type_of_target(y_true, input_name="y_true")


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets