In [None]:
# Install scikit-learn
!pip install scikit-learn



In [None]:
# Mount Google Drive
from google.colab import drive

# Directory under which the Drive contents are mounted in the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Load dataset from Google Drive
# The CSV is read from the mounted Drive folder into a single DataFrame.
DATA_PATH = '/content/drive/My Drive/Project4/df_final.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Define target and features
# The label column is pulled out as `y`; every other column of `df` is a feature.
target_column = 'Credit_Score'  # name of the label column in df
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Handle categorical variables
# One-hot encode the categorical feature columns.
X = pd.get_dummies(X)

# Replace each distinct class label with an integer code so the target is a
# single column of class ids.
# NOTE(review): `.cat.codes` encodes missing labels as -1 — confirm the target
# column has no missing values.
y_as_category = y.astype('category')
y = y_as_category.cat.codes


In [None]:
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [None]:
# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)



In [None]:
# Scale the features
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Calculate class weights
# The weights are derived from the training labels only, so the label
# distribution of the validation/test splits does not influence the model.
classes = np.unique(y_train)
balanced_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Key every weight by its actual class label rather than by its position, so
# the mapping stays correct even when the labels are not exactly 0..n-1.
class_weights = dict(zip(classes.tolist(), balanced_weights))



In [None]:
# Define the Logistic Regression model
# `multi_class` is deliberately not passed: the parameter is deprecated in
# recent scikit-learn releases, and the lbfgs solver already fits a
# multinomial model for multi-class targets by default.
model = LogisticRegression(
    max_iter=1000,
    class_weight=class_weights,
    random_state=42,
    solver='lbfgs',
)



In [None]:
# Train the model
# Fits the logistic regression on the scaled training features and their
# integer-encoded labels; the validation and test splits are not touched here.
model.fit(X_train, y_train)

In [None]:
# Validate the model on the validation set
# Predict once, compute every metric from that single prediction, then print.
y_val_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Set Results:")
print("Accuracy:", val_accuracy)
print("Confusion Matrix:\n", val_confusion)
print("Classification Report:\n", val_report)

Validation Set Results:
Accuracy: 0.7069557539487884
Confusion Matrix:
 [[3083   71  412]
 [ 597 2455  681]
 [ 699  527 1668]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.86      0.78      3566
           1       0.80      0.66      0.72      3733
           2       0.60      0.58      0.59      2894

    accuracy                           0.71     10193
   macro avg       0.70      0.70      0.70     10193
weighted avg       0.71      0.71      0.70     10193



In [None]:
# Test the model on the test set
# Predict once on the held-out test split, compute the metrics, then print.
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Test Set Results:")
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Test Set Results:
Accuracy: 0.6991072304522712
Confusion Matrix:
 [[3090   93  434]
 [ 616 2415  668]
 [ 737  519 1621]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.85      0.77      3617
           1       0.80      0.65      0.72      3699
           2       0.60      0.56      0.58      2877

    accuracy                           0.70     10193
   macro avg       0.70      0.69      0.69     10193
weighted avg       0.70      0.70      0.70     10193



In [None]:
# Hyperparameter search for the logistic regression
# Only real hyperparameters are searched. `max_iter` is a convergence budget,
# not something to tune: once the solver has converged, a larger value gives
# the same model, so listing three values only tripled the number of fits.
# It is fixed on the estimator at the largest value that was previously tried.
param_grid = {
    'solver': ['lbfgs', 'saga'],
    'C': [0.01, 0.1, 1, 10, 100],
}

# `multi_class` is not passed: it is deprecated in recent scikit-learn
# releases, and both lbfgs and saga fit a multinomial model by default.
base_estimator = LogisticRegression(class_weight=class_weights, random_state=42, max_iter=3000)

# 5-fold cross-validation on the training split only; the validation and test
# splits stay untouched until the evaluation below.
grid_search = GridSearchCV(base_estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Validate the best model
y_val_pred_best = best_model.predict(X_val)
print("Validation Set Results with Best Model:")
print("Accuracy:", accuracy_score(y_val, y_val_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_best))
print("Classification Report:\n", classification_report(y_val, y_val_pred_best))

# Test the best model
y_test_pred_best = best_model.predict(X_test)
print("Test Set Results with Best Model:")
print("Accuracy:", accuracy_score(y_test, y_test_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_best))
print("Classification Report:\n", classification_report(y_test, y_test_pred_best))


Validation Set Results with Best Model:
Accuracy: 0.7069557539487884
Confusion Matrix:
 [[3082   71  413]
 [ 596 2456  681]
 [ 699  527 1668]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.86      0.78      3566
           1       0.80      0.66      0.72      3733
           2       0.60      0.58      0.59      2894

    accuracy                           0.71     10193
   macro avg       0.70      0.70      0.70     10193
weighted avg       0.71      0.71      0.70     10193

Test Set Results with Best Model:
Accuracy: 0.6991072304522712
Confusion Matrix:
 [[3089   93  435]
 [ 616 2415  668]
 [ 736  519 1622]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.85      0.77      3617
           1       0.80      0.65      0.72      3699
           2       0.60      0.56      0.58      2877

    accuracy                           0.70     10193
   macro avg       

In [None]:
# NOTE(review): this cell runs the same grid search as the preceding cell;
# keep only one of the two to avoid paying for the search twice.

# Define the parameter grid for hyperparameter tuning
# `max_iter` is not part of the grid: it is a convergence budget rather than a
# hyperparameter, so it is fixed on the estimator at the largest value that
# was previously tried instead of tripling the number of fits.
param_grid = {
    'solver': ['lbfgs', 'saga'],
    'C': [0.01, 0.1, 1, 10, 100],
}

# Use GridSearchCV for hyperparameter tuning
# `multi_class` is not passed: it is deprecated in recent scikit-learn
# releases, and both lbfgs and saga fit a multinomial model by default.
grid_search = GridSearchCV(
    LogisticRegression(class_weight=class_weights, random_state=42, max_iter=3000),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit the model (5-fold cross-validation on the training split only)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Validate the best model
y_val_pred_best = best_model.predict(X_val)
print("Validation Set Results with Best Model:")
print("Accuracy:", accuracy_score(y_val, y_val_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_best))
print("Classification Report:\n", classification_report(y_val, y_val_pred_best))

# Test the best model
y_test_pred_best = best_model.predict(X_test)
print("Test Set Results with Best Model:")
print("Accuracy:", accuracy_score(y_test, y_test_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_best))
print("Classification Report:\n", classification_report(y_test, y_test_pred_best))


Validation Set Results with Best Model:
Accuracy: 0.7069557539487884
Confusion Matrix:
 [[3082   71  413]
 [ 596 2456  681]
 [ 699  527 1668]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.86      0.78      3566
           1       0.80      0.66      0.72      3733
           2       0.60      0.58      0.59      2894

    accuracy                           0.71     10193
   macro avg       0.70      0.70      0.70     10193
weighted avg       0.71      0.71      0.70     10193

Test Set Results with Best Model:
Accuracy: 0.6991072304522712
Confusion Matrix:
 [[3089   93  435]
 [ 616 2415  668]
 [ 736  519 1622]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.85      0.77      3617
           1       0.80      0.65      0.72      3699
           2       0.60      0.56      0.58      2877

    accuracy                           0.70     10193
   macro avg       