In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib

# Load datasets
train_df = pd.read_csv("Downloads/archive (1)/train.csv")
test_df = pd.read_csv("Downloads/archive (1)/test.csv")

# Select relevant numeric features and target
features = [
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio'
]
target = 'Credit_Score'

# Clean feature columns (remove non-numeric characters)
for col in features:
    train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace('[^0-9.]', '', regex=True), errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace('[^0-9.]', '', regex=True), errors='coerce')

# Drop rows with missing values
X_train = train_df[features].dropna()
y_train = train_df.loc[X_train.index, target]

X_test = test_df[features].dropna()
y_test = test_df.loc[X_test.index, target] if target in test_df.columns else None

# Scale the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(random_state=42, class_weight="balanced")
model.fit(X_train_scaled, y_train)
preds = model.predict(X_test_scaled)
print(pd.Series(preds).value_counts())

# Save model and scaler
joblib.dump(model, "credit_score_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("✅ Model and scaler saved successfully!")


  train_df = pd.read_csv("Downloads/archive (1)/train.csv")


Standard    22714
Poor        12386
Good         7402
Name: count, dtype: int64
✅ Model and scaler saved successfully!
