In [22]:
import pandas as pd

# Location of the raw credit dataset.
# NOTE(review): this is an absolute, user-specific Windows path, so the
# notebook only runs where this exact file exists -- consider a path
# relative to a configurable data directory.
csv_path = r'C:\Users\Riya\Downloads\credits_data.csv'

# Read the CSV into the DataFrame that the following cells work on.
df = pd.read_csv(csv_path)

# Show the top rows as the cell output (head() defaults to five rows).
df.head()


Unnamed: 0,ID,Income,Credit History,Loan Amount,Repayment Behavior,Creditworthiness
0,1,102764,Good,20924,On Time,Yes
1,2,120614,Bad,16252,Late,No
2,3,73850,Good,35691,On Time,No
3,4,86512,Bad,39131,Late,Yes
4,5,42589,Good,39989,On Time,Yes


In [23]:
# Missing-value imputation: median for numeric columns, most frequent value
# for categorical columns. Both column lists are reused by later cells.


def _fill_with_median(column):
    """Return `column` with missing entries replaced by the column median."""
    return column.fillna(column.median())


def _fill_with_mode(column):
    """Return `column` with missing entries replaced by its first mode.

    `mode()` is empty for a column with no non-missing values, in which case
    the `[0]` lookup raises.
    """
    return column.fillna(column.mode()[0])


# Numeric gaps -> per-column median (apply runs column-wise by default).
numerical_cols = ['Income', 'Loan Amount']
df[numerical_cols] = df[numerical_cols].apply(_fill_with_median)

# Categorical gaps -> per-column most frequent value.
# NOTE(review): this list contains 'Creditworthiness', which is later used as
# the prediction target, so any missing labels are filled in rather than
# dropped -- confirm that is intended.
categorical_cols = ['Credit History', 'Repayment Behavior', 'Creditworthiness']
df[categorical_cols] = df[categorical_cols].apply(_fill_with_mode)

# Preview the imputed frame.
df.head()


Unnamed: 0,ID,Income,Credit History,Loan Amount,Repayment Behavior,Creditworthiness
0,1,102764,Good,20924,On Time,Yes
1,2,120614,Bad,16252,Late,No
2,3,73850,Good,35691,On Time,No
3,4,86512,Bad,39131,Late,Yes
4,5,42589,Good,39989,On Time,Yes


In [24]:
# One-hot encode the categorical columns. drop_first=True discards the first
# level of each column, so a two-level column becomes a single indicator
# (e.g. 'Credit History' -> 'Credit History_Good'). The target column
# 'Creditworthiness' is encoded here as well, giving 'Creditworthiness_Yes'.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

# Preview the encoded frame.
df_encoded.head()


Unnamed: 0,ID,Income,Loan Amount,Credit History_Good,Repayment Behavior_On Time,Creditworthiness_Yes
0,1,102764,20924,1,1,1
1,2,120614,16252,0,0,0
2,3,73850,35691,1,1,0
3,4,86512,39131,0,0,1
4,5,42589,39989,1,1,1


In [25]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric features with MinMaxScaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on every row, before the train/test
# split done later in the notebook, so the min/max of rows that end up in
# the test set influence the scaling.
scaler = MinMaxScaler()
numeric_values = df_encoded[numerical_cols]
scaler.fit(numeric_values)
df_encoded[numerical_cols] = scaler.transform(numeric_values)

# Preview the scaled frame.
df_encoded.head()


Unnamed: 0,ID,Income,Loan Amount,Credit History_Good,Repayment Behavior_On Time,Creditworthiness_Yes
0,1,0.642336,0.261899,1,1,1
1,2,0.790258,0.143046,0,0,0
2,3,0.402728,0.637564,1,1,0
3,4,0.507657,0.725076,0,0,1
4,5,0.14367,0.746903,1,1,1


In [26]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix X from the target y.
# 'Creditworthiness_Yes' is the encoded target, so it must not appear among
# the features; 'ID' is dropped as well (presumably a row identifier with no
# predictive meaning -- confirm).
X = df_encoded.drop(columns=['ID', 'Creditworthiness_Yes'])
y = df_encoded['Creditworthiness_Yes']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Leakage fix: the numeric columns were min-max scaled using statistics from
# ALL rows before this split, so test-set minima/maxima leaked into the
# training features. Re-fit the scaling on the training rows only and apply
# it to both partitions. Min-max scaling is unaffected by an earlier positive
# linear rescale of its input, so the result equals scaling the raw training
# data directly. Test values may now fall slightly outside [0, 1], which is
# expected for unseen data.
split_scaler = MinMaxScaler().fit(X_train[numerical_cols])
X_train = X_train.copy()  # explicit copies: avoid writing into views of X
X_test = X_test.copy()
X_train[numerical_cols] = split_scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = split_scaler.transform(X_test[numerical_cols])

# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (40, 4)
Shape of X_test: (10, 4)
Shape of y_train: (40,)
Shape of y_test: (10,)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Fit a logistic-regression classifier on the training partition.
# random_state is fixed so the fit is reproducible.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Hard class predictions, used for accuracy.
y_pred = model.predict(X_test)

# Continuous scores for ROC-AUC: predicted probability of the positive class.
# Column 1 of predict_proba corresponds to model.classes_[1], i.e. the larger
# label of the binary target.
y_score = model.predict_proba(X_test)[:, 1]

# Accuracy compares hard labels. ROC-AUC must be computed from the continuous
# scores: passing the thresholded labels (as before) collapses the ROC curve
# to a single operating point and does not measure ranking quality.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)


Accuracy: 1.0
ROC-AUC Score: 1.0
