In [11]:
import pandas as pd
import xgboost
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## **Step 1: Load the Data**

In [13]:
# Read merged_data.csv into a DataFrame. The path is relative, so the file
# must be present in the notebook's working directory.
merged_df = pd.read_csv("merged_data.csv")

## **Step 2: Split Features and Target**

In [15]:
# Encode the target as integers ('t' -> 1, 'f' -> 0).
# `map` plus an explicit `astype(int)` replaces the earlier `replace` call:
# `replace` depended on pandas silently downcasting the object column to
# integers, which pandas 2.2 deprecates, and an object-dtype target can be
# rejected by scikit-learn classifiers as an unknown label type.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run after
# the column has already been encoded.
TARGET_ENCODING = {'t': 1, 'f': 0, 1: 1, 0: 0}
fully_funded_encoded = merged_df['fully_funded'].map(TARGET_ENCODING)

# Fail loudly on missing or unexpected labels instead of passing them on to
# the models.
unmapped = fully_funded_encoded.isna()
if unmapped.any():
    bad_values = merged_df.loc[unmapped, 'fully_funded'].unique().tolist()
    raise ValueError(f"Unexpected values in 'fully_funded': {bad_values}")

merged_df['fully_funded'] = fully_funded_encoded.astype(int)

# Features are every column except `projectid` and the target itself.
X = merged_df.drop(columns=['projectid', 'fully_funded'])
y = merged_df['fully_funded']

## **Step 3: Set up data for modeling**

In [17]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [18]:
# One-hot encode the training split, dropping the first level of every
# categorical column to serve as the baseline.
X_train = pd.get_dummies(X_train_raw, drop_first=True)

# Encode the test split WITHOUT dropping a level. `drop_first` removes the
# first level present in the frame being encoded, so if the training baseline
# level happens to be absent from the test split, a different level would be
# dropped here; once the test columns are aligned to the training columns that
# level would be zero-filled and its rows silently treated as the baseline.
# Keeping every level lets the column alignment that follows pick exactly the
# training columns.
X_test = pd.get_dummies(X_test_raw, drop_first=False)

In [19]:
# Give the test matrix exactly the training columns, in the same order:
# columns missing from the test matrix are added and filled with 0, and
# columns that the training matrix does not have are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [20]:
def clean_column_names(df):
    """Return `df` with column labels sanitised for use as feature names.

    Every label is converted to `str`, then the characters `[`, `]`, `<` and
    `>` are replaced with `(`, `)`, `less_than_` and `greater_than_`
    respectively. These are characters that some estimators (notably XGBoost)
    do not accept in feature names.

    The input frame is left untouched: the result is a shallow copy that
    shares the underlying data with `df` but carries its own column index,
    so calling this function never renames the caller's columns as a side
    effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the cleaned column labels.
    """
    substitutions = (
        ('[', '('),
        (']', ')'),
        ('<', 'less_than_'),
        ('>', 'greater_than_'),
    )

    cleaned_labels = df.columns.astype(str)
    for forbidden, replacement in substitutions:
        # Literal (non-regex) replacement: '[' and ']' are regex metacharacters.
        cleaned_labels = cleaned_labels.str.replace(forbidden, replacement, regex=False)

    # Shallow copy: no data is duplicated, only the column index differs.
    renamed = df.copy(deep=False)
    renamed.columns = cleaned_labels
    return renamed

# Sanitise the feature names of both design matrices in one pass.
X_train, X_test = (clean_column_names(frame) for frame in (X_train, X_test))

## **Step 4: List models and sampling strategies**

In [22]:
# Candidate classifiers, keyed by the display name used in the reports.
# Every estimator gets the same fixed seed so runs are repeatable.
models = {}
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['XGBoost'] = XGBClassifier(eval_metric='logloss', random_state=42)

In [23]:
# Resampling strategies to compare, keyed by the display name used in the
# reports: one oversampler and one undersampler, both seeded.
sampling_strategies = dict([
    ('SMOTE (Oversampling)', SMOTE(random_state=42)),
    ('RUS (Undersampling)', RandomUnderSampler(random_state=42)),
])

## **Step 5: Run models with a train/test split**

In [25]:
# Hold-out evaluation: for every sampling strategy, resample the training
# split once, then fit each model on it and score against the test split.
for sampling_name, sampler in sampling_strategies.items():
    print(f"\n=== Sampling Strategy: {sampling_name} ===")
    
    # Resample the training split only; X_test / y_test are never passed to
    # the sampler, so the test metrics are computed on un-resampled data.
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    for model_name, model in models.items():
        print(f"\n--- Model: {model_name} ---")
        
        # Fit on the resampled training data. Note this fits the estimator
        # object stored in `models` itself (no copy is made), so the same
        # object is fitted again for the next sampling strategy.
        model.fit(X_resampled, y_resampled)
        
        # Predict on the held-out test split and on the data the model was
        # just fitted on.
        y_pred_test = model.predict(X_test)
        y_pred_train = model.predict(X_resampled)
        
        # Evaluation. "Train Accuracy" is measured on the *resampled*
        # training data (X_resampled / y_resampled), not on the original
        # X_train / y_train, so it is not directly comparable to the test
        # accuracy's class balance.
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
        print("\nClassification Report:\n", classification_report(y_test, y_pred_test))
        print(f"Train Accuracy: {accuracy_score(y_resampled, y_pred_train):.4f}")
        print(f"Test Accuracy:  {accuracy_score(y_test, y_pred_test):.4f}")


=== Sampling Strategy: SMOTE (Oversampling) ===

--- Model: Random Forest ---
Confusion Matrix:
 [[1244 2018]
 [ 994 9635]]

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.38      0.45      3262
           1       0.83      0.91      0.86     10629

    accuracy                           0.78     13891
   macro avg       0.69      0.64      0.66     13891
weighted avg       0.76      0.78      0.77     13891

Train Accuracy: 0.9983
Test Accuracy:  0.7832

--- Model: Decision Tree ---
Confusion Matrix:
 [[1414 1848]
 [1749 8880]]

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.43      0.44      3262
           1       0.83      0.84      0.83     10629

    accuracy                           0.74     13891
   macro avg       0.64      0.63      0.64     13891
weighted avg       0.74      0.74      0.74     13891

Train Accuracy: 0.9983
Test Accuracy:  0.7411

--

In [26]:
# Build the design matrix for the whole (unsplit) dataset: separate the
# target, drop `projectid` and the target from the features, one-hot encode,
# then sanitise the resulting column names.
y = merged_df['fully_funded']
X_raw = merged_df.drop(columns=['projectid', 'fully_funded'])
X_encoded = clean_column_names(pd.get_dummies(X_raw, drop_first=True))


## **Step 6: Run models using k-fold cross-validation**

In [28]:
# Five stratified folds, shuffled with a fixed seed so the fold assignment is
# the same on every run and for every sampler/model combination.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Nested mapping filled in below:
#   results[sampling_name][model_name] -> {'accuracy', 'confusion_matrix',
#                                          'classification_report'}
results = {}

for sampling_name, sampler in sampling_strategies.items():
    print(f"\n=== Sampling Strategy: {sampling_name} ===")
    results[sampling_name] = {}
    
    for model_name, model in models.items():
        print(f"\n--- Model: {model_name} ---")
        
        # Pipeline: sampler + classifier, handed to cross_val_predict as a
        # single estimator so both steps are fitted together for each fold.
        # NOTE(review): this relies on imblearn's Pipeline applying the
        # sampler only while fitting (not when predicting) — confirm against
        # the imbalanced-learn documentation.
        pipeline = Pipeline([
            ('sampler', sampler),
            ('model', model)
        ])

        # One prediction per row of X_encoded, collected across the folds.
        # The metrics below are therefore computed once over all rows pooled
        # together, not averaged per fold.
        y_pred = cross_val_predict(pipeline, X_encoded, y, cv=cv)
        acc = accuracy_score(y, y_pred)
        cm = confusion_matrix(y, y_pred)
        cr = classification_report(y, y_pred)

        print("Confusion Matrix:\n", cm)
        print("\nClassification Report:\n", cr)
        print(f"CV Accuracy: {acc:.4f}")

        # Store the metrics for later comparison; the report is recomputed
        # with output_dict=True so it is kept as a dict rather than as the
        # formatted text printed above.
        results[sampling_name][model_name] = {
            'accuracy': acc,
            'confusion_matrix': cm,
            'classification_report': classification_report(y, y_pred, output_dict=True)
        }


=== Sampling Strategy: SMOTE (Oversampling) ===

--- Model: Random Forest ---
Confusion Matrix:
 [[ 6229 10082]
 [ 4859 48282]]

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.38      0.45     16311
           1       0.83      0.91      0.87     53141

    accuracy                           0.78     69452
   macro avg       0.69      0.65      0.66     69452
weighted avg       0.76      0.78      0.77     69452

CV Accuracy: 0.7849

--- Model: Decision Tree ---
Confusion Matrix:
 [[ 7054  9257]
 [ 8832 44309]]

Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.43      0.44     16311
           1       0.83      0.83      0.83     53141

    accuracy                           0.74     69452
   macro avg       0.64      0.63      0.63     69452
weighted avg       0.74      0.74      0.74     69452

CV Accuracy: 0.7395

--- Model: XGBoost ---
Confusion Matrix:
 [[ 6