In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
model_df = {}

In [8]:
def model_val(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test, y_pred)}")

    score = cross_val_score(model, X, y, cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model] = round(np.mean(score)*100, 2)

In [3]:
data = pd.read_csv('./Train_Loan_Home.csv')

data = data.drop('Loan_ID', axis=1)
data = data.drop('ApplicantIncome', axis=1)
data = data.drop('CoapplicantIncome', axis=1)
data = data.drop('LoanAmount', axis=1)

columns = ['Gender', 'Dependents',
               'LoanAmount_In_Taka', 'Loan_Amount_Term']

In [4]:
data = data.dropna(subset=columns)
data['Self_Employed'] = data['Self_Employed'].fillna(
        data['Self_Employed'].mode()[0])
data['Credit_History'] = data['Credit_History'].fillna(
        data['Credit_History'].mode()[0])

data['Dependents'] = data['Dependents'].replace(to_replace="3+", value='4')
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0}).astype('int')
data['Married'] = data['Married'].map({'Yes': 1, 'No': 0}).astype('int')
data['Education'] = data['Education'].map(
        {'Graduate': 1, 'Not Graduate': 0}).astype('int')
data['Self_Employed'] = data['Self_Employed'].map(
        {'Yes': 1, 'No': 0}).astype('int')
data['Property_Area'] = data['Property_Area'].map(
        {'Rural': 0, 'Semiurban': 2, 'Urban': 1}).astype('int')
data['Loan_Status'] = data['Loan_Status'].map(
        {'Y': 1, 'N': 0}).astype('int')

In [6]:
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

cols = ['ApplicantIncome_In_Taka', 'CoapplicantIncome_In_Taka',
            'LoanAmount_In_Taka', 'Loan_Amount_Term']

In [7]:
st = StandardScaler()
X[cols] = st.fit_transform(X[cols])

In [14]:
model = LogisticRegression()
model_val(model, X, y)

model_svc = svm.SVC()
model_val(model_svc, X, y)

model = DecisionTreeClassifier()
model_val(model, X, y)
model = RandomForestClassifier()
model_val(model, X, y)

model = GradientBoostingClassifier()
model_val(model, X, y)

LogisticRegression() accuracy is 0.6782608695652174
LogisticRegression() Avg cross val score is 0.6875972540045766
SVC() accuracy is 0.6869565217391305
SVC() Avg cross val score is 0.6928299008390542
DecisionTreeClassifier() accuracy is 0.7478260869565218
DecisionTreeClassifier() Avg cross val score is 0.7173302822273074
RandomForestClassifier() accuracy is 0.7565217391304347
RandomForestClassifier() Avg cross val score is 0.7836613272311214
GradientBoostingClassifier() accuracy is 0.7652173913043478
GradientBoostingClassifier() Avg cross val score is 0.7627154843630816


In [10]:
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                    "solver": ['liblinear']}
rs_log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions=log_reg_grid,
                                    n_iter=20, cv=5, verbose=True)
rs_log_reg.fit(X, y)

svc_grid = {'C': [0.25, 0.50, 0.75, 1], "kernel": ["linear"]}
rs_svc = RandomizedSearchCV(svm.SVC(),
                                param_distributions=svc_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [11]:
rs_svc.fit(X, y)

rf_grid = {'n_estimators': np.arange(10, 1000, 10),
               'max_features': ['auto', 'sqrt'],
               'max_depth': [None, 3, 5, 20, 30],
               'min_samples_split': [2, 5, 20, 50, 100],
               'min_samples_leaf': [1, 2, 5, 10]
               }
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                               param_distributions=rf_grid,
                               cv=5,
                               n_iter=20,
                               verbose=True)
rs_rf.fit(X, y)

X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/home/cseku/.local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/cseku/.local/lib/python3.12/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/cseku/.local/lib/python3.12/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/cseku/.local/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    r

In [13]:
joblib.dump(rs_rf.best_estimator_, 'loan_status_predict')
model = joblib.load('loan_status_predict')