<a href="https://colab.research.google.com/github/saniyashk1542/ML_25-26/blob/main/ML_Lab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so the synthetic data is the same on every run
np.random.seed(42)

# Number of synthetic records to generate
n_samples = 5000

# Draw the three features. All draws share the seeded global RNG stream, so the
# order of these three calls determines the values produced.
age = np.random.randint(18, 70, size=n_samples)  # integers 18..69 (upper bound is exclusive)
income = np.random.normal(50000, 15000, n_samples).astype(int)  # mean 50,000, std 15,000
credit_score = np.random.normal(650, 50, n_samples).astype(int)  # mean 650, std 50

# Binary label: 1 (default) only when income is below 40,000 AND
# credit score is below 600; otherwise 0
low_income = income < 40000
low_credit = credit_score < 600
default = (low_income & low_credit).astype(int)

# Assemble the features and the label into one DataFrame
df = pd.DataFrame(
    dict(Age=age, Income=income, CreditScore=credit_score, Default=default)
)

# Show the first five rows
print(df.head())



   Age  Income  CreditScore  Default
0   56   48353          720        0
1   69   57462          670        0
2   46   44219          690        0
3   32   56306          597        0
4   60   37034          699        0


In [2]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the target column
feature_columns = ['Age', 'Income', 'CreditScore']
X = df[feature_columns]
y = df['Default']

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report how many rows landed in each part
print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 4000
Test size: 1000


In [3]:
# Stage 1: set aside 20% of all rows as the test set; the remaining 80% is a
# temporary pool that is split again below
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: take 25% of the temporary pool as the validation set, leaving 75%
# of it for training (i.e. 60% / 20% / 20% of the original rows overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Report the row count of each part
for split_name, split_frame in (
    ("Train", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_name} size:", split_frame.shape[0])

Train size: 3000
Validation size: 1000
Test size: 1000


In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Stratified 5-fold cross-validation: each fold keeps the class proportions of
# y, and the shuffle is seeded so the fold assignment is repeatable
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE: the folds are drawn from the full X / y, not from a training subset,
# so these scores are an estimate over the whole dataset.
fold_scores = []  # per-fold accuracies, kept so they can be summarised below
for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    # kf.split yields positional indices, hence .iloc
    X_train_k, X_test_k = X.iloc[train_index], X.iloc[test_index]
    y_train_k, y_test_k = y.iloc[train_index], y.iloc[test_index]

    # Fit a fresh logistic regression on this fold's training part
    model = LogisticRegression()
    model.fit(X_train_k, y_train_k)
    y_pred_k = model.predict(X_test_k)

    # Score on the held-out part of this fold
    fold_accuracy = accuracy_score(y_test_k, y_pred_k)
    fold_scores.append(fold_accuracy)
    print(f"Fold {fold} Accuracy: {fold_accuracy:.4f}")

# Summarise across folds: the mean is the CV estimate, the std shows how much
# the score varies from fold to fold
print(f"Mean CV Accuracy: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")


Fold 1 Accuracy: 0.9640
Fold 2 Accuracy: 0.9800
Fold 3 Accuracy: 0.9640
Fold 4 Accuracy: 0.9640
Fold 5 Accuracy: 0.9730


In [5]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Fit a logistic regression on the current X_train / y_train
# (the validation split is not used in this cell)
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Fraction of test rows classified correctly
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)



Confusion Matrix:
 [[946   7]
 [ 21  26]]
Test Accuracy: 0.972


In [6]:
from sklearn.metrics import f1_score, recall_score, precision_score

# Re-run prediction on the test set so this cell does not rely on y_pred
# left over from an earlier cell
y_pred = model.predict(X_test)

# Metrics for the positive class (Default = 1)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print each metric rounded to two decimals
for metric_label, metric_value in (
    ("F1 Score", f1),
    ("Recall", recall),
    ("Precision", precision),
):
    print(f"{metric_label}: {metric_value:.2f}")

F1 Score: 0.65
Recall: 0.55
Precision: 0.79
