In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Assuming you have your full dataset stored in df
df = pd.read_csv(r"C:\Users\gurja\Downloads\Cleaned_and_Organized_SBAnational.csv")

# Columns: BankState, NAICS, ApprovalFY, Term, NoEmp, NewExist, CreateJob, RetainedJob, FranchiseCode, UrbanRural, RevLineCr, LowDoc, DisbursementGross, GrAppv, SBA_Appv, Target

# Convert categorical variables to numerical format using label encoding
label_encoder = LabelEncoder()
categorical_columns = ['BankState', 'UrbanRural', 'RevLineCr', 'LowDoc']

for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Convert monetary columns to numerical format
monetary_columns = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
for col in monetary_columns:
    df[col] = df[col].replace('[\$,]', '', regex=True).astype(float)

# Extract features and target variable
X = df.drop('Target', axis=1)
y = df['Target']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Feature Importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
print(feature_importance.sort_values(by='Importance', ascending=False))


Accuracy: 0.9462612534962993
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    148458
           1       0.89      0.79      0.84     31375

    accuracy                           0.95    179833
   macro avg       0.92      0.88      0.90    179833
weighted avg       0.94      0.95      0.94    179833

[[145470   2988]
 [  6676  24699]]
              Feature  Importance
3                Term    0.441502
2          ApprovalFY    0.102880
0           BankState    0.067954
14           SBA_Appv    0.059817
12  DisbursementGross    0.056848
1               NAICS    0.054755
13             GrAppv    0.048440
4               NoEmp    0.037264
7         RetainedJob    0.029662
8       FranchiseCode    0.028225
10          RevLineCr    0.022640
9          UrbanRural    0.019117
6           CreateJob    0.018067
5            NewExist    0.009179
11             LowDoc    0.003652


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Assuming you have your full dataset stored in df
df = pd.read_csv(r"C:\Users\gurja\Downloads\Cleaned_and_Organized_SBAnational.csv")

# Columns: BankState, NAICS, ApprovalFY, Term, NoEmp, NewExist, CreateJob, RetainedJob, FranchiseCode, UrbanRural, RevLineCr, LowDoc, DisbursementGross, GrAppv, SBA_Appv, Target

# Convert categorical variables to numerical format using label encoding
label_encoder = LabelEncoder()
categorical_columns = ['BankState', 'UrbanRural', 'RevLineCr', 'LowDoc']

for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Convert monetary columns to numerical format
monetary_columns = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
for col in monetary_columns:
    df[col] = df[col].replace('[\$,]', '', regex=True).astype(float)

# Extract features and target variable
X = df.drop('Target', axis=1)
y = df['Target']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Example of a new data point for prediction
new_data_point = {
    'BankState': 'OH',
    'NAICS': 451120,
    'ApprovalFY': 1997,
    'Term': 84,
    'NoEmp': 4,
    'NewExist': 2,
    'CreateJob': 0,
    'RetainedJob': 0,
    'FranchiseCode': 1,
    'UrbanRural': 'N',
    'RevLineCr': 'Y',
    'LowDoc': 'N',
    'DisbursementGross': 60000,
    'GrAppv': 60000,
    'SBA_Appv': 48000
}

# Convert the new data point to the same format as the training data
new_df = pd.DataFrame(index=[0])
for col in categorical_columns:
    new_df[col] = label_encoder.transform([new_data_point[col]])

for col in monetary_columns:
    new_df[col] = new_data_point[col]

# Make predictions using the trained model
prediction = model.predict(new_df)

print(f"The predicted outcome for the new data point is: {prediction}")



ValueError: y contains previously unseen labels: 'OH'