In [2]:
# Import necessary libraries
import pandas as pd

# File paths
train_file = '/Users/prtimilsina/Learning_data_science/takeo_data_analytics/archive/train.csv'
test_file = '/Users/prtimilsina/Learning_data_science/takeo_data_analytics/archive/test.csv'
submission_file = '/Users/prtimilsina/Learning_data_science/takeo_data_analytics/archive/sample_submission.csv'

# Load datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
sample_submission = pd.read_csv(submission_file)

# Display first few rows of the training dataset
train_data.head(), train_data.info(), train_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


(    Loan_ID Gender Married Dependents     Education Self_Employed  \
 0  LP001002   Male      No          0      Graduate            No   
 1  LP001003   Male     Yes          1      Graduate            No   
 2  LP001005   Male     Yes          0      Graduate           Yes   
 3  LP001006   Male     Yes          0  Not Graduate            No   
 4  LP001008   Male      No          0      Graduate            No   
 
    ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
 0             5849                0.0         NaN             360.0   
 1             4583             1508.0       128.0             360.0   
 2             3000                0.0        66.0             360.0   
 3             2583             2358.0       120.0             360.0   
 4             6000                0.0       141.0             360.0   
 
    Credit_History Property_Area Loan_Status  
 0             1.0         Urban           Y  
 1             1.0         Rural           N  
 2 

In [3]:
# Checking missing values
missing_values = train_data.isnull().sum()
missing_values_percentage = (missing_values / len(train_data)) * 100

# Display missing values and their percentages
missing_values_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage": missing_values_percentage
}).sort_values(by="Percentage", ascending=False)

# Display the missing values summary
missing_values_summary

Unnamed: 0,Missing Values,Percentage
Credit_History,50,8.143322
Self_Employed,32,5.211726
LoanAmount,22,3.583062
Dependents,15,2.442997
Loan_Amount_Term,14,2.28013
Gender,13,2.117264
Married,3,0.488599
Loan_ID,0,0.0
Education,0,0.0
ApplicantIncome,0,0.0


In [4]:
# Handling missing values
# Impute categorical variables with the mode
for col in ['Credit_History', 'Self_Employed', 'Loan_Amount_Term', 'Gender', 'Married', 'Dependents']:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)

# Impute numerical variables with the median
train_data['LoanAmount'].fillna(train_data['LoanAmount'].median(), inplace=True)

# Verify that all missing values are handled
missing_values_after_imputation = train_data.isnull().sum()

# Display the updated missing values summary
missing_values_after_imputation

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['LoanAmount'].fillna(train_data['LoanAmount'].median(), inplace=True)


Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [5]:
# Encode categorical variables using one-hot encoding and label encoding
train_data_encoded = train_data.copy()

# One-hot encode multi-category variables
train_data_encoded = pd.get_dummies(train_data_encoded, columns=['Property_Area'], drop_first=True)

# Label encode binary variables
train_data_encoded['Gender'] = train_data_encoded['Gender'].map({'Male': 1, 'Female': 0})
train_data_encoded['Married'] = train_data_encoded['Married'].map({'Yes': 1, 'No': 0})
train_data_encoded['Education'] = train_data_encoded['Education'].map({'Graduate': 1, 'Not Graduate': 0})
train_data_encoded['Self_Employed'] = train_data_encoded['Self_Employed'].map({'Yes': 1, 'No': 0})
train_data_encoded['Loan_Status'] = train_data_encoded['Loan_Status'].map({'Y': 1, 'N': 0})

# Handle 'Dependents' column by replacing '3+' with 3 and converting to numeric
train_data_encoded['Dependents'] = train_data_encoded['Dependents'].replace('3+', 3).astype(int)

# Display the processed training data structure
train_data_encoded.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,1,0,0,1,0,5849,0.0,128.0,360.0,1.0,1,False,True
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,False,False
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,False,True
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,False,True
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,False,True


In [6]:
from sklearn.preprocessing import StandardScaler

# Select numerical features for scaling
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

# Initialize the scaler
scaler = StandardScaler()

# Scale numerical features
train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])

# Display scaled data
train_data_encoded[numerical_features].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,0.072991,-0.554487,-0.211241,0.273231
1,-0.134412,-0.038732,-0.211241,0.273231
2,-0.393747,-0.554487,-0.948996,0.273231
3,-0.462062,0.25198,-0.306435,0.273231
4,0.097728,-0.554487,-0.056551,0.273231


In [7]:
# Split the data into features (X) and target variable (y)
X = train_data_encoded.drop(columns=['Loan_ID', 'Loan_Status'])  # Drop ID and target
y = train_data_encoded['Loan_Status']

# Split into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the shape of the split data
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((491, 12), (123, 12), (491,), (123,))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

# Initialize and train logistic regression model
baseline_model = LogisticRegression(random_state=42)
baseline_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = baseline_model.predict(X_valid)
y_pred_proba = baseline_model.predict_proba(X_valid)[:, 1]

# Evaluate model performance
accuracy = accuracy_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_pred_proba)
classification_rep = classification_report(y_valid, y_pred)
conf_matrix = confusion_matrix(y_valid, y_pred)

# Display the results
accuracy, roc_auc, classification_rep, conf_matrix

(0.8617886178861789,
 0.848606811145511,
 '              precision    recall  f1-score   support\n\n           0       0.96      0.58      0.72        38\n           1       0.84      0.99      0.91        85\n\n    accuracy                           0.86       123\n   macro avg       0.90      0.78      0.81       123\nweighted avg       0.88      0.86      0.85       123\n',
 array([[22, 16],
        [ 1, 84]]))

In [9]:
# Import advanced models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Initialize Random Forest and Gradient Boosting models
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

# Train the models
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

# Make predictions on the validation set
rf_pred = random_forest.predict(X_valid)
rf_pred_proba = random_forest.predict_proba(X_valid)[:, 1]

gb_pred = gradient_boosting.predict(X_valid)
gb_pred_proba = gradient_boosting.predict_proba(X_valid)[:, 1]

# Evaluate Random Forest
rf_accuracy = accuracy_score(y_valid, rf_pred)
rf_roc_auc = roc_auc_score(y_valid, rf_pred_proba)
rf_classification_rep = classification_report(y_valid, rf_pred)
rf_conf_matrix = confusion_matrix(y_valid, rf_pred)

# Evaluate Gradient Boosting
gb_accuracy = accuracy_score(y_valid, gb_pred)
gb_roc_auc = roc_auc_score(y_valid, gb_pred_proba)
gb_classification_rep = classification_report(y_valid, gb_pred)
gb_conf_matrix = confusion_matrix(y_valid, gb_pred)

# Results for both models
{
    "Random Forest": {
        "Accuracy": rf_accuracy,
        "ROC-AUC": rf_roc_auc,
        "Classification Report": rf_classification_rep,
        "Confusion Matrix": rf_conf_matrix,
    },
    "Gradient Boosting": {
        "Accuracy": gb_accuracy,
        "ROC-AUC": gb_roc_auc,
        "Classification Report": gb_classification_rep,
        "Confusion Matrix": gb_conf_matrix,
    },
}


{'Random Forest': {'Accuracy': 0.8211382113821138,
  'ROC-AUC': 0.808359133126935,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.77      0.61      0.68        38\n           1       0.84      0.92      0.88        85\n\n    accuracy                           0.82       123\n   macro avg       0.80      0.76      0.78       123\nweighted avg       0.82      0.82      0.81       123\n',
  'Confusion Matrix': array([[23, 15],
         [ 7, 78]])},
 'Gradient Boosting': {'Accuracy': 0.8211382113821138,
  'ROC-AUC': 0.7625386996904024,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.81      0.55      0.66        38\n           1       0.82      0.94      0.88        85\n\n    accuracy                           0.82       123\n   macro avg       0.82      0.75      0.77       123\nweighted avg       0.82      0.82      0.81       123\n',
  'Confusion Matrix': array([[21, 17],