In [2]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV into a DataFrame and flatten it to one dict per row, which is
# the document shape insert_many expects.
df = pd.read_csv('credit_risk_data.csv')
records = df.to_dict('records')

# Use the client as a context manager so its connections are released as soon
# as the load finishes instead of lingering for the life of the kernel.
with MongoClient('mongodb://localhost:27017/') as client:
    # Access (or automatically create) the database and collection
    db = client['credit_db']
    collection = db['credit_data']

    # insert_many rejects an empty document list, so skip the call when the
    # CSV has no rows.
    # NOTE: nothing here de-duplicates; re-running this cell inserts the same
    # rows again.
    if records:
        collection.insert_many(records)

print(f"Inserted {len(records)} records into MongoDB.")


Inserted 10 records into MongoDB.


In [3]:
from pymongo import MongoClient
import pandas as pd

def load_data_from_mongodb(
    uri="mongodb://localhost:27017/",
    database="credit_db",
    collection="credit_data",
    query=None
):
    """Read the documents matching ``query`` from MongoDB into a DataFrame.

    Args:
        uri: MongoDB connection string.
        database: Name of the database to read from.
        collection: Name of the collection to read from.
        query: Filter document passed to ``find``. ``None`` (the default)
            selects every document.

    Returns:
        A pandas DataFrame with one row per matching document and the
        ``_id`` field excluded. The frame is empty when nothing matches.
    """
    # Build a fresh filter per call: a ``{}`` default would be a single dict
    # shared by every invocation of this function.
    mongo_filter = {} if query is None else query

    # The context manager closes the client once the cursor has been drained.
    with MongoClient(uri) as client:
        # Exclude _id through the projection rather than popping it from each
        # document afterwards.
        cursor = client[database][collection].find(
            mongo_filter, projection={"_id": False}
        )
        documents = list(cursor)

    return pd.DataFrame(documents)

if __name__ == "__main__":
    # Snapshot the collection selected by the loader's defaults to a flat CSV.
    df = load_data_from_mongodb()
    df.to_csv(path_or_buf="mongo_data.csv", index=False)


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def feature_selection(input_csv, output_csv, target,
                      importance_threshold=0.02, random_state=42):
    """Keep the features a random forest ranks above an importance threshold.

    Reads ``input_csv``, one-hot encodes the non-target columns, fits a
    ``RandomForestClassifier`` on them, and writes the retained encoded
    columns plus the target column to ``output_csv``.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to write.
        target: Name of the label column in ``input_csv``.
        importance_threshold: A feature is kept only when its importance is
            strictly greater than this value.
        random_state: Seed passed to the random forest.

    Raises:
        KeyError: If ``target`` is not a column of ``input_csv``.
    """
    df = pd.read_csv(input_csv)

    if target not in df.columns:
        raise KeyError(
            f"Target column {target!r} not found in {input_csv}; "
            f"available columns: {list(df.columns)}"
        )

    # Separate features and target
    y = df[target]
    # Convert categorical columns to one-hot encoding
    X_encoded = pd.get_dummies(df.drop(columns=target))

    # Fit the forest and read one importance score per encoded column
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_encoded, y)
    importances = model.feature_importances_

    selected_features = X_encoded.columns[importances > importance_threshold].tolist()

    # The selection is made on the encoded columns, so the output contains
    # dummy columns (not the original categorical ones) followed by the target.
    selected_df = pd.concat([X_encoded[selected_features], y], axis=1)

    # Save selected features + target to output CSV
    selected_df.to_csv(output_csv, index=False)
    print(f"Selected features saved to {output_csv}: {selected_features}")

if __name__ == "__main__":
    # Select features from the MongoDB export, using 'default' as the label.
    feature_selection(
        input_csv="mongo_data.csv",
        output_csv="selected.csv",
        target='default',
    )


Selected features saved to selected.csv: ['age', 'credit_amount', 'duration_months', 'job_management', 'job_skilled', 'job_unskilled', 'purpose_business', 'purpose_car', 'purpose_education', 'purpose_house']


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle
import json

def train(input_csv, model_path, metrics_path, target='default',
          test_size=0.2, random_state=42, predictions_path="predictions.csv"):
    """Train an XGBoost classifier and persist the model, metrics and predictions.

    Args:
        input_csv: Path of the CSV holding the features and the target column.
        model_path: Where the fitted model is pickled.
        metrics_path: Where the metrics JSON (``{"accuracy": ...}``) is written.
        target: Name of the label column in ``input_csv``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split, so the reported accuracy
            is reproducible between runs. Pass ``None`` for an unseeded split.
        predictions_path: Where the held-out truth/prediction pairs are written.
    """
    df = pd.read_csv(input_csv)
    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = XGBClassifier().fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    # Predict once and reuse the result for the predictions file.
    y_pred = model.predict(X_test)

    # Context managers guarantee the file handles are flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)

    metrics = {"accuracy": float(acc)}
    with open(metrics_path, "w") as f:
        json.dump(metrics, f)

    pd.DataFrame({"truth": y_test, "pred": y_pred}).to_csv(predictions_path, index=False)

if __name__ == "__main__":
    # Fit on the selected features; the label column falls back to the default.
    train(
        input_csv="selected.csv",
        model_path="model.pkl",
        metrics_path="metrics.json",
    )


In [None]:
from flask import Flask, request, jsonify
import pickle
import pandas as pd

app = Flask(__name__)

# Load the pickled model once, when this cell runs, so requests reuse it.
# SECURITY: unpickling executes code embedded in the file; only load a
# model.pkl that comes from a trusted source.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

@app.route("/predict", methods=["POST"])
def predict():
    """Score a single record posted as a JSON object of feature values.

    Returns:
        ``{"prediction": <int>}`` on success, or ``{"error": ...}`` with
        HTTP 400 when the body is not a non-empty JSON object or is missing
        features the model reports it was trained on.
    """
    # silent=True yields None for a missing or malformed JSON body, so it can
    # be answered with a 400 here.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data:
        return jsonify(
            {"error": "Request body must be a non-empty JSON object of feature values."}
        ), 400

    df = pd.DataFrame([data])

    # If the loaded model exposes the feature names it was fitted with, put the
    # request columns in that order and report missing ones as a client error.
    # Keys the model does not know are ignored.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in df.columns]
        if missing:
            return jsonify({"error": f"Missing features: {missing}"}), 400
        df = df[expected]

    pred = model.predict(df)[0]
    return jsonify({"prediction": int(pred)})

if __name__ == "__main__":
    import os
    # Debug mode stays on by default, as before; set FLASK_DEBUG=0 in the
    # environment to turn it off.
    # WARNING: debug mode enables the interactive debugger, so never expose
    # this server beyond localhost with it on.
    debug = os.environ.get("FLASK_DEBUG", "1") != "0"
    # Keep the auto-reloader disabled when running in Jupyter/IPython
    use_reloader = False
    app.run(debug=debug, use_reloader=use_reloader)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
