In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Loan-status pipeline: clean both CSV files the same way, learn every
# categorical encoding from the training set only, fit a decision tree and
# print one prediction per test row.

# Feature columns whose text categories are turned into integer codes.
_CATEGORICAL_FEATURES = (
    "Gender",
    "Married",
    "Education",
    "Self_Employed",
    "Property_Area",
)


def _load_and_clean(csv_path):
    """Read a loan CSV, drop the Loan_ID column and make Dependents numeric."""
    frame = pd.read_csv(csv_path).drop(["Loan_ID"], axis=1)
    # Convert '3+' to '3'. '+' is a regex quantifier, so regex=False forces a
    # literal match regardless of the pandas version's default.
    stripped = frame["Dependents"].str.replace("+", "", regex=False)
    # Anything that still is not a number becomes NaN.
    frame["Dependents"] = pd.to_numeric(stripped, errors="coerce")
    return frame


def _encode_with(encoder, values):
    """Encode *values* with an already-fitted LabelEncoder.

    Labels the encoder did not see while fitting are mapped to -1 instead of
    raising, so an unexpected category in the test file does not abort the run.
    """
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    return values.astype(str).map(codes).fillna(-1).astype(int)


# Read and clean the training data
train_data = _load_and_clean("training_set.csv")

# Fit one encoder per categorical feature and keep it, so the test data can be
# encoded with exactly the same string -> integer mapping. astype(str) makes
# missing values a string category of their own.
label_encoders = {}
for column in _CATEGORICAL_FEATURES:
    encoder = LabelEncoder()
    train_data[column] = encoder.fit_transform(train_data[column].astype(str))
    label_encoders[column] = encoder

# Encode the target; this encoder's classes_ maps the printed integer
# predictions back to the original Loan_Status labels.
label_encoder = LabelEncoder()
train_data["Loan_Status"] = label_encoder.fit_transform(
    train_data["Loan_Status"].astype(str)
)

# Perform one-hot encoding for any remaining categorical variables
train_data_encoded = pd.get_dummies(train_data)

# Split the training data into features (X) and target variable (y)
X_train = train_data_encoded.drop(["Loan_Status"], axis=1)
y_train = train_data_encoded["Loan_Status"]

# Train a decision tree classifier; the fixed seed makes repeated runs build
# the same tree and therefore print the same predictions.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Read and clean the testing data
test_data = _load_and_clean("testing_set.csv")

# Encode the testing data with the encoders fitted on the training data.
# Refitting here would relearn the mapping from the test categories alone and
# could assign different integers than the model was trained on.
for column in _CATEGORICAL_FEATURES:
    test_data[column] = _encode_with(label_encoders[column], test_data[column])

# Perform one-hot encoding for any remaining categorical variables
test_data_encoded = pd.get_dummies(test_data)

# Align the test data columns with the training data columns
test_data_encoded = test_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the testing data
predictions = model.predict(test_data_encoded)

# Print the header followed by one prediction per line
print("Loan_Status", *predictions, sep="\n")

Loan_Status
1
1
1
1
1
1
1
0
0
1
1
0
1
0
0
1
1
1
1
0
1
1
0
1
1
0
1
1
1
1
0
1
1
1
1
0
1
1
1
1
1
1
1
1
0
1
1
1
0
0
1
1
1
1
0
1
1
1
0
1
1
1
1
0
1
1
0
1
0
0
1
0
1
1
1
1
0
1
1
0
0
1
0
1
1
0
1
1
0
0
1
1
0
1
0
1
1
1
0
1
1
0
1
1
1
0
0
0
1
1
0
0
0
1
0
0
0
0
0
0
1
1
1
0
0
0
0
1
1
1
1
0
0
0
1
1
0
1
1
1
0
1
0
1
1
1
1
0
1
1
1
1
0
0
0
0
0
0
1
0
1
0
1
1
0
0
0
1
0
0
1
1
1
0
0
0
1
1
0
1
1
0
1
1
1
0
1
1
0
1
0
1
0
0
1
1
0
0
0
1
1
1
1
1
1
1
1
1
0
1
1
0
1
1
1
1
1
1
1
1
1
1
1
0
0
1
1
0
1
0
0
1
0
0
1
0
0
1
1
0
1
0
1
0
1
0
1
1
0
1
0
1
1
1
1
0
0
1
1
0
0
1
0
0
0
1
0
1
0
0
1
0
0
1
0
1
1
1
0
1
1
0
1
0
0
1
1
1
1
1
1
1
0
0
1
1
0
0
1
1
1
0
0
0
1
0
1
1
1
0
1
0
1
0
0
1
0
0
0
1
1
0
1
1
1
0
1
1
1
1
1
1
1
0
1
1
1
1
1
0
0
1
0
0
1
1
0
1
1
1
1
1
1
0
0
1
1
1
0
1
1
1
1
1
1
0
1
