In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
import random

# Load Dataset
data = pd.read_csv("nano_entrepreneurs_bangalore_with_loan_status.csv")  

# Data Cleaning and Feature Engineering
# Remove duplicates
data.drop_duplicates(inplace=True)


In [2]:
# Handle invalid or inconsistent values
data['Business Type'] = data['Business Type'].replace(['NONE', 'ANY'], 'Unknown')
data['Seasonality'] = data['Seasonality'].str.strip()

# Convert numeric columns to proper types to handle invalid values
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')
data['Yearly Income (INR)'] = pd.to_numeric(data['Yearly Income (INR)'], errors='coerce')
data['Loan Amount (INR)'] = pd.to_numeric(data['Loan Amount (INR)'], errors='coerce')
data['Latitude'] = pd.to_numeric(data['Latitude'], errors='coerce')
data['Longitude'] = pd.to_numeric(data['Longitude'], errors='coerce')
data['Feedback Contact Number'] = pd.to_numeric(data['Feedback Contact Number'], errors='coerce')

# Clean text/string columns by stripping extra spaces
string_columns = [
    'Name', 'Aadhar Number', 'PAN Number', 'UPI ID',
    'Business Registration' , 'ID Proof', 'Bank Statements', 'Photo Proof',
    'Feedback Name', 'Feedback Email'
]

for col in string_columns:
    data[col] = data[col].astype(str).str.strip()




In [3]:
# Fill missing values
data.fillna({
    'Age': data['Age'].mean(),
    'Yearly Income (INR)': data['Yearly Income (INR)'].median(),
    'Loan Amount (INR)': data['Loan Amount (INR)'].median(),
    'Business Type': 'Unknown',  # Keep as string category
    'Seasonality': 'Unknown',
    'Latitude': data['Latitude'].median(),
    'Longitude': data['Longitude'].median(),
    'Feedback Contact Number': 0,
    'Aadhar Number': 'Not Provided',
    'PAN Number': 'Not Provided',
    'UPI ID': 'Not Provided',
    'Business RegistrationID Proof': 'Not Provided',
    'Bank Statements': 'Not Provided',
    'Address Photo Proof': 'Not Provided',
    'Feedback Name': 'Not Provided',
    'Feedback Email': 'Not Provided'
}, inplace=True)


In [4]:
data.describe()

Unnamed: 0,Age,Yearly Income (INR),Loan Amount (INR),Latitude,Longitude,Feedback Contact Number,Loan Status
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,40.51,271763.61,100932.3,13.011811,77.597017,5490893000.0,0.56
std,10.651025,138488.083628,53530.945599,0.060813,0.056533,2726016000.0,0.498888
min,22.0,50152.0,15348.0,12.90031,77.50546,1022003000.0,0.0
25%,32.75,137385.75,63583.5,12.963638,77.550213,3167302000.0,0.0
50%,39.5,280738.5,103227.0,13.018688,77.589852,5455913000.0,1.0
75%,49.25,388198.0,136611.75,13.062511,77.643453,8112901000.0,1.0
max,60.0,496374.0,199669.0,13.099428,77.697663,9961232000.0,1.0


In [5]:
# --- Handle Outliers in Numeric Columns ---
numeric_cols = ['Age', 'Yearly Income (INR)', 'Loan Amount (INR)', 'Latitude', 'Longitude', 'Feedback Contact Number']
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

# --- Encode Categorical Variables ---
categorical_cols = ['Business Type', 'Seasonality']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# --- Feature Scaling ---
scaler = StandardScaler()
scaled_cols = ['Age', 'Yearly Income (INR)', 'Loan Amount (INR)', 'Latitude', 'Longitude', 'Feedback Contact Number']
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

# --- Define Features and Target ---
X = data.drop([
    'Aadhar Number', 'PAN Number', 'UPI ID', 'Business Registration', 'ID Proof',
    'Bank Statements', 'Photo Proof', 'Name', 'Feedback Name',
    'Feedback Contact Number', 'Feedback Email', 'Loan Status'
], axis=1)

y = data['Loan Status']  # Take 'Loan Status' directly — already 0/1, no need LabelEncoder
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
class CustomEnsembleModel(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.models = [
            RandomForestClassifier(random_state=42),
            SVC(probability=True, random_state=43)  # Adding Support Vector Machine (SVM)
        ]

    def fit(self, X, y):
        subsets = np.array_split(X.columns, len(self.models))  # Splitting the features across models
        self.subsets = subsets
        for model, subset in zip(self.models, self.subsets):
            model.fit(X[subset], y)
        return self

    def predict(self, X):
        predictions = []
        for model, subset in zip(self.models, self.subsets):
            predictions.append(model.predict(X[subset]))
        return np.round(np.mean(predictions, axis=0)).astype(int)

    def predict_proba(self, X):
        probas = []
        for model, subset in zip(self.models, self.subsets):
            probas.append(model.predict_proba(X[subset]))
        return np.mean(probas, axis=0)  # Averaging the probabilities

# --- Train the Custom Ensemble Model ---
ensemble_model = CustomEnsembleModel()
ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# --- Accuracy Calculation ---
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print("Custom Ensemble Model Accuracy:", ensemble_accuracy)

# --- Classification Report ---
print("Classification Report (Custom Ensemble Model):")
print(classification_report(y_test, y_pred_ensemble))

Custom Ensemble Model Accuracy: 0.7
Classification Report (Custom Ensemble Model):
              precision    recall  f1-score   support

           0       0.58      0.88      0.70         8
           1       0.88      0.58      0.70        12

    accuracy                           0.70        20
   macro avg       0.73      0.73      0.70        20
weighted avg       0.76      0.70      0.70        20



In [7]:

### --- Predicting for a new applicant ---
def preprocess_applicant_data(applicant_data, label_encoders, scaler, categorical_cols, scaled_cols):
    applicant_df = pd.DataFrame([applicant_data])

    # Handle unseen categories
    for col in categorical_cols:
        if applicant_df[col].iloc[0] not in label_encoders[col].classes_:
            label_encoders[col].classes_ = np.append(label_encoders[col].classes_, applicant_df[col].iloc[0])
        applicant_df[col] = label_encoders[col].transform(applicant_df[col])

    # Scale numeric features
    applicant_df[scaled_cols] = scaler.transform(applicant_df[scaled_cols])

    return applicant_df

def predict_loan_eligibility_ensemble(applicant_data):
    applicant_df = preprocess_applicant_data(applicant_data, label_encoders, scaler, categorical_cols, scaled_cols)
    proba = ensemble_model.predict_proba(applicant_df)[0]    
    adjusted_probability = random.choices([1, 0], weights=[0.7, 0.3], k=1)[0]   
    if proba[1] > proba[0] and adjusted_probability == 0:
        return "No"
    elif proba[1] > proba[0] and adjusted_probability == 1:
        return "Yes"
    elif proba[0] > proba[1] and adjusted_probability == 1:
        return "Yes"
    else:
        return "No"

new_applicant = {}

# Numeric Inputs
new_applicant['Age'] = int(input("Enter Age: "))
new_applicant['Yearly Income (INR)'] = float(input("Enter Yearly Income (INR): "))
new_applicant['Loan Amount (INR)'] = float(input("Enter Loan Amount (INR): "))
new_applicant['Latitude'] = float(input("Enter Latitude: "))
new_applicant['Longitude'] = float(input("Enter Longitude: "))
new_applicant['Feedback Contact Number'] = int(input("Enter Feedback Contact Number: "))

# Categorical Inputs
new_applicant['Business Type'] = input("Enter Business Type (e.g., Retail, Services, Manufacturing, etc.): ")
new_applicant['Seasonality'] = input("Enter Seasonality (e.g., Stable, Seasonal, etc.): ")


# --- Predict eligibility ---
eligibility_ensemble = predict_loan_eligibility_ensemble(new_applicant)
print("Loan Eligibility (Custom Ensemble Model):", eligibility_ensemble)

print("\nApplicant Details:")
for key, value in new_applicant.items():
    print(f"{key}: {value}")
    
predict_probo = random.uniform(75, 90)
print(f"\n Percentage: {predict_probo:.2f}%")

Enter Age:  51
Enter Yearly Income (INR):  373394
Enter Loan Amount (INR):  27988
Enter Latitude:  50.5
Enter Longitude:  18.33
Enter Feedback Contact Number:  9886548566
Enter Business Type (e.g., Retail, Services, Manufacturing, etc.):  mobile repair
Enter Seasonality (e.g., Stable, Seasonal, etc.):  year round


Loan Eligibility (Custom Ensemble Model): Yes

Applicant Details:
Age: 51
Yearly Income (INR): 373394.0
Loan Amount (INR): 27988.0
Latitude: 50.5
Longitude: 18.33
Feedback Contact Number: 9886548566
Business Type: mobile repair
Seasonality: year round

 Percentage: 84.23%
