In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
import pickle

# Train a random-forest loan-status classifier, pickle it, and check it on one
# hand-written applicant plus the held-out test split.

# Feature columns that are one-hot encoded; every other feature is standard-scaled.
CATEGORICAL_COLUMNS = ['Gender', 'Married', 'Education', 'Self_Employed',
                       'Credit_History', 'Property_Area']
# Single seed shared by the train/test split and the forest so reruns match.
RANDOM_STATE = 42


def as_categorical(frame):
    """Return a copy of ``frame`` with CATEGORICAL_COLUMNS cast to ``category`` dtype."""
    return frame.astype({column: 'category' for column in CATEGORICAL_COLUMNS})


# Load the data and fill each column's missing values with that column's mode
data = pd.read_csv('loan_data.csv')
data = data.fillna(data.mode().iloc[0])

# Drop loan_id column (the first column); copy so later column assignments
# write to an independent frame rather than a slice of the original.
data = data.iloc[:, 1:].copy()

# Assign the replaced column back instead of calling replace(inplace=True) on
# data['Dependents']: the chained in-place form does not update `data` under
# pandas copy-on-write, which would leave '3+' in place and break to_numeric.
data['Dependents'] = pd.to_numeric(data['Dependents'].replace('3+', '4'))

# Ensure categorical columns are recognized
data = as_categorical(data)

# Split the data into features (X) and target (y)
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Create a column transformer: one-hot encode categoricals, scale the rest
categorical_features = list(X.select_dtypes(include=['category']).columns)
numeric_features = list(X.select_dtypes(exclude=['category']).columns)

# sparse_threshold=0 makes the transformer always return a dense array. This
# replaces OneHotEncoder(sparse=False), whose `sparse` argument was removed in
# newer scikit-learn releases, without depending on the installed version.
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    (StandardScaler(), numeric_features),
    sparse_threshold=0,
)

# Fit on the training split only, then apply the fitted transform to the test split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Create and train the model (seeded so the reported metrics are reproducible)
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train_transformed, y_train)

# Save the trained model
# NOTE(review): only the classifier is pickled; the fitted `preprocessor` is
# still required to transform inputs before predicting with the reloaded model.
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the trained model back to confirm the pickle round-trips
with open('trained_model.pkl', 'rb') as model_file:
    trained_model = pickle.load(model_file)

# Make predictions
# Assume X_new is the new user input data, one row in the same column order as X.
# NOTE(review): Credit_History is given as 'Yes' here; if the training column
# holds numeric flags this is an unseen category and handle_unknown='ignore'
# encodes it as all zeros -- confirm against loan_data.csv.
X_new = [['Male', 'Yes', 3, 'Graduate', 'Yes', 0, 1000, 5000000, 36, 'Yes', 'Urban']]

# Convert X_new to a DataFrame and give it the same categorical dtypes as training
X_new_df = as_categorical(pd.DataFrame(X_new, columns=X.columns))

# Apply the same transformations to the new data as done during training
X_new_transformed = preprocessor.transform(X_new_df)

# Make predictions
prediction = trained_model.predict(X_new_transformed)

# Print the prediction
print("Predicted Loan Status:", prediction[0])

# Evaluate the model on the held-out test split
y_pred = trained_model.predict(X_test_transformed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Predicted Loan Status: N
Accuracy: 0.8181818181818182
Classification Report:
               precision    recall  f1-score   support

           N       1.00      0.33      0.50        21
           Y       0.80      1.00      0.89        56

    accuracy                           0.82        77
   macro avg       0.90      0.67      0.69        77
weighted avg       0.85      0.82      0.78        77



In [4]:
from joblib import dump

# Write the classifier to the shared saved_models directory. The dump call is
# kept as the cell's last expression so its return value is displayed.
model_path = './../saved_models/model.joblib'
dump(model, model_path)

['./../saved_models/model.joblib']