In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Exploratory Data Analysis (EDA) ---
# Load the raw CSV and print a quick overview of it (first rows, schema,
# summary statistics, per-column null counts) before any cleaning is applied.
print("--- EDA: Initial Data Exploration ---")
# Load the dataset (the path is resolved relative to the working directory)
df = pd.read_csv('bankloans.csv')

# Display the first few rows of the dataframe
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Get a concise summary of the dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
print("\nDataframe information:")
df.info()

# Get descriptive statistics
print("\nDescriptive statistics:")
print(df.describe())

# Count missing values per column, before any rows are dropped
print("\nMissing values per column BEFORE cleaning:")
print(df.isnull().sum())


# --- 2. Feature Engineering & Data Cleaning ---
print("\n--- Feature Engineering & Data Cleaning ---")
# Remove, in place, every row that has a missing value in any column, then
# re-print the per-column null counts to confirm that none remain.
df.dropna(inplace=True)
print("\nMissing values per column AFTER cleaning:")
remaining_nulls = df.isna().sum()
print(remaining_nulls)

# Derive 'total_debt' as the row-wise sum of the 'creddebt' and 'othdebt' columns
df['total_debt'] = df['creddebt'].add(df['othdebt'])
print("\nNew 'total_debt' column created. First 5 rows with new feature:")
debt_preview = df.loc[:, ['creddebt', 'othdebt', 'total_debt']].head()
print(debt_preview)


# --- 3. Model Building ---
print("\n--- Model Building ---")
# Predictor columns used as X; the 'default' column is the target y.
# NOTE(review): 'total_debt' is built earlier in this script as
# 'creddebt' + 'othdebt', so listing all three makes the predictors linearly
# dependent -- confirm this is intended before interpreting coefficients.
features = [
    'age', 'ed', 'employ', 'address', 'income',
    'debtinc', 'creddebt', 'othdebt', 'total_debt',
]
X, y = df[features], df['default']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Build the liblinear-backed logistic regression and fit it on the training
# split in one step (fit() returns the fitted estimator itself)
model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
print("\nLogistic Regression model trained successfully.")


# --- 4. Model Evaluation ---
print("\n--- Model Evaluation ---")
# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Text report of per-class metrics for the same predictions
print("\nClassification Report:")
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


# --- 5. Business Insights ---
print("\n--- Business Insights ---")
# Tabulate the fitted coefficients, one row per feature name.
# In logistic regression a positive coefficient raises the log-odds of the
# target being 1 and a negative coefficient lowers them.
# NOTE(review): the features are not rescaled anywhere in this script, so
# coefficient magnitudes may not be directly comparable across features.
coefficients = pd.DataFrame({'Coefficient': model.coef_.ravel()}, index=features)
print("\nModel Coefficients (Insight on feature importance):")
ranked_coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
print(ranked_coefficients)

--- EDA: Initial Data Exploration ---

First 5 rows of the dataset:
   age  ed  employ  address  income  debtinc   creddebt   othdebt  default
0   41   3      17       12     176      9.3  11.359392  5.008608      1.0
1   27   1      10        6      31     17.3   1.362202  4.000798      0.0
2   40   1      15       14      55      5.5   0.856075  2.168925      0.0
3   41   1      15       14     120      2.9   2.658720  0.821280      0.0
4   24   2       2        0      28     17.3   1.787436  3.056564      1.0

Dataframe information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150 entries, 0 to 1149
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1150 non-null   int64  
 1   ed        1150 non-null   int64  
 2   employ    1150 non-null   int64  
 3   address   1150 non-null   int64  
 4   income    1150 non-null   int64  
 5   debtinc   1150 non-null   float64
 6   creddebt  1150 non-null   float64
 