In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [4]:
# Sample dataset
data = {
    'SME Scale': ['Small', 'Medium', 'Large', 'Small', 'Medium'],
    'Annual Revenue': [50000, 200000, 1000000, 60000, 250000],
    'Industry': ['Retail', 'Technology', 'Retail', 'Technology', 'Retail'],
    'Asset Value': [100000, 500000, 2000000, 110000, 550000],
    'Number of Employees': [10, 50, 200, 12, 55],
    'Expenses': [30000, 150000, 800000, 32000, 160000],
    'Profitability': [20000, 50000, 200000, 28000, 90000],
    'Cash Flow': [15000, 40000, 150000, 18000, 70000],
    'Current Loan': [10000, 50000, 200000, 12000, 60000],
    'Credit Score': [700, 750, 800, 710, 760],
    'CreditAccess': [1, 1, 1, 0, 0]  # Example binary target variable
}

df = pd.DataFrame(data)

In [7]:
# Prepare features and target variable
X = df.drop(columns=['CreditAccess'])  # Features
y = df['CreditAccess']  # Target variable

In [8]:
# Convert categorical features to numerical (if any)
X = pd.get_dummies(X, drop_first=True)  # Convert categorical variables


In [14]:
# Adjust test size based on dataset size
# Here we need at least 2 instances in test set to stratify correctly
test_size = 0.4 if len(y) > 5 else 0.1  # 40% if we have more than 5 samples, else 10%

In [15]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [16]:
# Check the class distribution in the training set
print("\nClass distribution in the training set:")
print(y_train.value_counts())


Class distribution in the training set:
CreditAccess
0    2
1    2
Name: count, dtype: int64


In [17]:

# Initialize SMOTE
smote = SMOTE(random_state=42)

In [18]:
# Fit and resample the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [19]:

# Check the new class distribution
print("\nNew class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())


New class distribution after SMOTE:
CreditAccess
0    2
1    2
Name: count, dtype: int64


In [21]:
# Train the Logistic Regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

In [22]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [25]:
# After making predictions on the test set
y_pred = model.predict(X_test)

In [26]:
# Get unique classes from the target variable
labels = np.unique(y)

In [27]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=labels))

Confusion Matrix:
[[0 0]
 [0 1]]


In [29]:
print(y.value_counts())


CreditAccess
1    3
0    2
Name: count, dtype: int64


In [30]:
X_train

Unnamed: 0,Annual Revenue,Asset Value,Number of Employees,Expenses,Profitability,Cash Flow,Current Loan,Credit Score,SME Scale_Medium,SME Scale_Small,Industry_Technology
4,250000,550000,55,160000,90000,70000,60000,760,True,False,False
2,1000000,2000000,200,800000,200000,150000,200000,800,False,False,False
0,50000,100000,10,30000,20000,15000,10000,700,False,True,False
3,60000,110000,12,32000,28000,18000,12000,710,False,True,True


In [31]:
X_test

Unnamed: 0,Annual Revenue,Asset Value,Number of Employees,Expenses,Profitability,Cash Flow,Current Loan,Credit Score,SME Scale_Medium,SME Scale_Small,Industry_Technology
1,200000,500000,50,150000,50000,40000,50000,750,True,False,True


In [32]:
print(y.value_counts())


CreditAccess
1    3
0    2
Name: count, dtype: int64


In [33]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


In [34]:
# Adjust test_size to ensure you have enough samples for both classes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [35]:
print(classification_report(y_test, y_pred, labels=labels, zero_division=0))


ValueError: Found input variables with inconsistent numbers of samples: [2, 1]

In [36]:
print(f"y_test shape: {y_test.shape}")
print(f"y_pred shape: {y_pred.shape}")


y_test shape: (2,)
y_pred shape: (1,)


In [37]:
print(f"Predictions: {np.unique(y_pred)}")


Predictions: [1]


In [38]:
print("Training class distribution:")
print(y_train.value_counts())
print("Testing class distribution:")
print(y_test.value_counts())


Training class distribution:
CreditAccess
1    2
0    1
Name: count, dtype: int64
Testing class distribution:
CreditAccess
1    1
0    1
Name: count, dtype: int64


In [39]:
# Make predictions
y_pred = model.predict(X_test)


In [40]:
# Make predictions
y_pred = model.predict(X_test)

# Check the shapes of y_test and y_pred
print(f"y_test shape: {y_test.shape}")
print(f"y_pred shape: {y_pred.shape}")

# Check unique predictions
print(f"Predictions: {np.unique(y_pred)}")

# Evaluate the model if the lengths are consistent
if len(y_test) == len(y_pred):
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=np.unique(y)))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=np.unique(y), zero_division=0))
else:
    print("Inconsistent lengths: Cannot evaluate model.")


y_test shape: (2,)
y_pred shape: (2,)
Predictions: [0 1]
Confusion Matrix:
[[1 0]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

