In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib

# --- Load your dataset ---
df = pd.read_csv('emi_prediction_dataset.csv')

# Separate targets
y_class = df['emi_eligibility']          # Classification target
y_reg   = df['max_monthly_emi']          # Regression target
X       = df.drop(columns=['emi_eligibility', 'max_monthly_emi'])


In [10]:


# --- Step 1: Identify corrupted numeric entries ---
for col in df.columns:
    bad_rows = df[df[col].astype(str).str.contains(r'[A-Za-z]|[.]{2,}', na=False)]
    if not bad_rows.empty:
        print(f"‚ö†Ô∏è Bad entries found in column: {col}")
        print(bad_rows[[col]].head())

# --- Step 2: Clean all numeric columns properly ---
numeric_features = [
    'age','monthly_salary','years_of_employment','family_size','dependents',
    'school_fees','college_fees','travel_expenses','groceries_utilities',
    'other_monthly_expenses','current_emi_amount','existing_loans',
    'credit_score','bank_balance','emergency_fund','requested_amount','requested_tenure'
]

for col in numeric_features:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(',', '', regex=False)  # remove commas
        .str.replace('..', '.', regex=False)  # fix double dots
        .str.extract(r'(\d+\.?\d*)')[0]  # extract valid numeric part
    )
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col].fillna(df[col].median(), inplace=True)

print("‚úÖ All numeric columns cleaned successfully!")

# --- Step 3: Check that all are numeric now ---
print(df[numeric_features].dtypes)


‚ö†Ô∏è Bad entries found in column: gender
   gender
0  Female
1  Female
2    Male
3  Female
4  Female
‚ö†Ô∏è Bad entries found in column: marital_status
  marital_status
0        Married
1        Married
2        Married
3        Married
4        Married
‚ö†Ô∏è Bad entries found in column: education
      education
0  Professional
1      Graduate
2  Professional
3   High School
4  Professional
‚ö†Ô∏è Bad entries found in column: employment_type
  employment_type
0         Private
1         Private
2         Private
3         Private
4         Private
‚ö†Ô∏è Bad entries found in column: company_type
  company_type
0     Mid-size
1          MNC
2      Startup
3     Mid-size
4     Mid-size
‚ö†Ô∏è Bad entries found in column: house_type
  house_type
0     Rented
1     Family
2        Own
3        Own
4     Family
‚ö†Ô∏è Bad entries found in column: monthly_rent
    monthly_rent
116          nan
183          nan
226          nan
429          nan
638          nan
‚ö†Ô∏è Bad entries found in

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

‚úÖ All numeric columns cleaned successfully!
age                       float64
monthly_salary            float64
years_of_employment       float64
family_size                 int64
dependents                  int64
school_fees               float64
college_fees              float64
travel_expenses           float64
groceries_utilities       float64
other_monthly_expenses    float64
current_emi_amount        float64
existing_loans            float64
credit_score              float64
bank_balance              float64
emergency_fund            float64
requested_amount          float64
requested_tenure          float64
dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [7]:
numeric_features = [
    'age','monthly_salary','years_of_employment','family_size','dependents',
    'school_fees','college_fees','travel_expenses','groceries_utilities',
    'other_monthly_expenses','current_emi_amount','existing_loans',
    'credit_score','bank_balance','emergency_fund','requested_amount','requested_tenure'
]

categorical_features = [
    'gender','marital_status','education','employment_type',
    'company_type','house_type','emi_scenario'
]

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [8]:
# Separate targets
y_class = df['emi_eligibility']
y_reg   = df['max_monthly_emi']
X       = df.drop(columns=['emi_eligibility', 'max_monthly_emi'])


In [9]:
X[numeric_features].dtypes


Unnamed: 0,0
age,float64
monthly_salary,float64
years_of_employment,float64
family_size,int64
dependents,int64
school_fees,float64
college_fees,float64
travel_expenses,float64
groceries_utilities,float64
other_monthly_expenses,float64


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42)

classifier_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', clf)
])

classifier_pipeline.fit(X_train, y_train)
joblib.dump(classifier_pipeline, 'emi_classifier.pkl')
print("‚úÖ emi_classifier.pkl saved")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


‚úÖ emi_classifier.pkl saved


In [14]:
!pip install streamlit pyngrok --quiet


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import streamlit as st
import pandas as pd
import joblib

st.title("üí∞ EMIPredict AI ‚Äì Financial Risk Assessment")

uploaded_file = st.file_uploader("üìÇ Upload customer data CSV")

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("Preview of uploaded data:", df.head())

    # Load both pipelines
    classifier = joblib.load('emi_classifier.pkl')
    regressor  = joblib.load('emi_regressor.pkl')

    # Predictions
    class_preds = classifier.predict(df)
    reg_preds   = regressor.predict(df)

    df['Predicted_Eligibility'] = class_preds
    df['Predicted_Max_EMI']     = reg_preds

    st.success("‚úÖ Predictions generated successfully!")
    st.dataframe(df[['Predicted_Eligibility', 'Predicted_Max_EMI']])


2025-11-07 18:18:32.327 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [29]:
from pyngrok import ngrok
ngrok.set_auth_token("35A5LOyycsIqwjFr81DpDlJv179_32Qhq9JtdhbA4ho9yh9U7")


In [19]:
!kill $(ps -ef | grep streamlit | awk '{print $2}') >/dev/null 2>&1
!kill $(ps -ef | grep ngrok | awk '{print $2}') >/dev/null 2>&1


^C
^C


In [5]:
!streamlit run app.py &>/content/logs.txt &
from pyngrok import ngrok
public_url = ngrok.connect(8501)
public_url


<NgrokTunnel: "https://incorrect-stumblingly-carmine.ngrok-free.dev" -> "http://localhost:8501">

In [6]:
!streamlit run app.py &>/content/logs.txt &

public_url = ngrok.connect(8501)
print("üîó Streamlit App URL:", public_url)


üîó Streamlit App URL: NgrokTunnel: "https://incorrect-stumblingly-carmine.ngrok-free.dev" -> "http://localhost:8501"


In [3]:
!kill $(ps -ef | grep streamlit | awk '{print $2}') >/dev/null 2>&1
!kill $(ps -ef | grep ngrok | awk '{print $2}') >/dev/null 2>&1
!pkill -f ngrok >/dev/null 2>&1
!pkill -f streamlit >/dev/null 2>&1
!rm -rf /root/.ngrok2/ngrok.yml


^C
^C
^C
^C
