# Homework 2: Hw1 but with decision trees

## Task 1

In [20]:
import numpy as np
import pandas as pd
!pip install kagglehub==0.4.1
import kagglehub
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import ttest_rel
from sklearn.inspection import permutation_importance



In [2]:
# Data at https://www.kaggle.com/datasets/mlg-ulb/creditcardataraud/data
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

Using Colab cache for faster access to the 'creditcardfraud' dataset.


In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# raw class distribution
print('No Frauds', round(data['Class'].value_counts()[0] / len(data) * 100, 2), '% of the dataset')
print('Frauds', round(data['Class'].value_counts()[1] / len(data) * 100, 2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [5]:
# A fast vif as compared to previous statsmodels.stats.outliers_influence import variance_inflation_factor
def fast_vif(df):
    corr = df.corr().values
    vif = np.diag(np.linalg.inv(corr))
    return pd.DataFrame({'feature': df.columns, 'VIF': vif})

vif_df = fast_vif(data)
print(vif_df)

   feature        VIF
0     Time   1.879918
1       V1   1.669207
2       V2   4.449598
3       V3   1.949665
4       V4   1.172672
5       V5   2.871870
6       V6   1.577457
7       V7   3.015964
8       V8   1.132849
9       V9   1.043109
10     V10   1.220611
11     V11   1.164665
12     V12   1.170906
13     V13   1.008529
14     V14   1.225672
15     V15   1.063474
16     V16   1.081010
17     V17   1.234457
18     V18   1.057536
19     V19   1.042558
20     V20   2.399238
21     V21   1.143026
22     V22   1.089140
23     V23   1.158154
24     V24   1.000924
25     V25   1.130838
26     V26   1.003399
27     V27   1.010661
28     V28   1.001605
29  Amount  12.120566
30   Class   2.090472


In [6]:
# scaling amount coloumn to make vars less corrolated | Amount = 11.50 !!
data['Log_Amount'] = np.log1p(data['Amount'])
scaler = StandardScaler()
data['Scaled_Amount'] = scaler.fit_transform(data['Log_Amount'].values.reshape(-1,1))
scaled_data = data.drop(['Amount','Log_Amount'], axis=1)

# check correlation
scaled_data_vif = fast_vif(scaled_data)
print(scaled_data_vif)

          feature       VIF
0            Time  1.880363
1              V1  1.061633
2              V2  1.351535
3              V3  1.413387
4              V4  1.057818
5              V5  1.206235
6              V6  1.055028
7              V7  1.102116
8              V8  1.004131
9              V9  1.030772
10            V10  1.100042
11            V11  1.169442
12            V12  1.170708
13            V13  1.008239
14            V14  1.211118
15            V15  1.071314
16            V16  1.096964
17            V17  1.233806
18            V18  1.044434
19            V19  1.004530
20            V20  1.039650
21            V21  1.019136
22            V22  1.042707
23            V23  1.006304
24            V24  1.001013
25            V25  1.102228
26            V26  1.003672
27            V27  1.003934
28            V28  1.000361
29          Class  2.089810
30  Scaled_Amount  1.644080


In [7]:
# setting up data splits
X = scaled_data.drop(['Class', 'Time'], axis = 1)
y = scaled_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
# create and evaluate with decision tree
tree = DecisionTreeClassifier(random_state = 42, class_weight = 'balanced')
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

In [9]:
# reporting of results
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: {classification_report(y_test, y_pred)}")

Accuracy: 0.9991573329588147
F1 Score: 0.75
Confusion Matrix: [[56842    22]
 [   26    72]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.77      0.73      0.75        98

    accuracy                           1.00     56962
   macro avg       0.88      0.87      0.87     56962
weighted avg       1.00      1.00      1.00     56962



### Analysis
- low max depth: causes model to lean to biases as more data is left at the leafs. A few high level splits allow us to focus on the most important, differentiable features but does not get specific enough. 
- high max depth: leads to a model with small amounts of data in the leafs leading to overfitting. this is a high variance model with many leaf nodes. 
- min samples leaf: this prunes the model and prevents branches with not enough data in the leaf. this allows the model to generalize better. the tree will vary with deep and shallow branches. 

### Sensity Analysis
- identifies which variables drive the most change, in a decision tree these are the specific thresholds used for partitions. 
- performed selecting an achor point and wiggling the range of values for the top features of your data. we then run the edited model for each and compare the results. 
- we can use sklearn to do this

In [21]:
result = permutation_importance(tree, X_test, y_test, n_repeats=10, random_state=42)

for i in result.importances_mean.argsort()[::-1]:
    print(f"{X_test.columns[i]}: {result.importances_mean[i]:.4f}")

V14: 0.0014
V12: 0.0013
V7: 0.0009
V3: 0.0008
V4: 0.0005
V11: 0.0005
V15: 0.0004
V1: 0.0003
V18: 0.0002
V26: 0.0001
V27: 0.0001
V21: 0.0001
V10: 0.0001
V5: 0.0001
V8: 0.0001
Scaled_Amount: 0.0001
V9: 0.0001
V16: 0.0000
V25: 0.0000
V13: 0.0000
V23: 0.0000
V20: 0.0000
V24: 0.0000
V22: 0.0000
V28: 0.0000
V2: -0.0000
V6: -0.0000
V19: -0.0000
V17: -0.0000


## Task 2

In [22]:
# stratefied k fold validation

folds = [5, 10, 15]

for k in folds:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    model = RandomForestClassifier(n_estimators=100, n_jobs=1) 
    scores = cross_val_score(model, X, y, cv=skf, n_jobs=-1)
    
    print(f"K={k:2d} | Mean Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

K= 5 | Mean Accuracy: 0.9996 (+/- 0.0001)
K=10 | Mean Accuracy: 0.9996 (+/- 0.0000)
K=15 | Mean Accuracy: 0.9996 (+/- 0.0001)


In [11]:
# reporting of results
model_rf = RandomForestClassifier(n_jobs = -1, random_state = 42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)


print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred_rf)}")
print(f"Classification Report: {classification_report(y_test, y_pred_rf)}")

Accuracy: 0.9995611109160493
F1 Score: 0.8571428571428571
Confusion Matrix: [[56862     2]
 [   23    75]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.77      0.86        98

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [23]:
# stratefied k fold validation XG Boost
folds = [5, 10, 15]
for k in folds:
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    model_ = XGBClassifier(
        n_estimators=100,
        tree_method='gpu_hist',
        predictor='gpu_predictor', 
        random_state=42
    )
    scores = cross_val_score(model, X_train, y_train, cv=skf, n_jobs=1)
    print(f"K={k:2d} | Mean Accuracy: {scores.mean():.4f}")

K= 5 | Mean Accuracy: 0.9995
K=10 | Mean Accuracy: 0.9995
K=15 | Mean Accuracy: 0.9996


In [13]:
# reporting of results
model_xg = XGBClassifier(tree_method = 'hist', predictor = "gpu_predictor", random_state = 42)
model_xg.fit(X_train, y_train)
y_pred_xg = model_xg.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_xg)}")
print(f"F1 Score: {f1_score(y_test, y_pred_xg)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred_xg)}")
print(f"Classification Report: {classification_report(y_test, y_pred_xg)}")

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9995435553526912
F1 Score: 0.8571428571428571
Confusion Matrix: [[56858     6]
 [   20    78]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.93      0.80      0.86        98

    accuracy                           1.00     56962
   macro avg       0.96      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962



### Behavior
- Accuracy is meaningless due to the class imbalance.
- F1 Score balances precision and recall: RF and XGB achieve the same but RF had better precision while XGB had better recall
- Recall: XGB had better recall which identified 78/98 frauds while RF got 75/98. 

## Task 3

In [None]:
# 5 fold for all three models
skf_5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

tree_model = DecisionTreeClassifier(max_depth=5)
tree_scores = cross_val_score(tree_model, X_train, y_train, cv=skf_5)

rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=skf_5)

xgb_model = XGBClassifier(tree_method='hist', n_estimators=100)
xgb_scores = cross_val_score(xgb_model, X_train, y_train, cv=skf_5)

print(f"Tree Scores: {tree_scores}")

Tree Scores: [0.99947333 0.99934166 0.99945138 0.99947333 0.99918804]


In [19]:
# conf matrices for 5 fold stratefied samples

# tree
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred_tree)}")

# random forest
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred_rf)}")

# xgboost
xgb_model.fit(X_train, y_train)
y_pred_xg = xgb_model.predict(X_test)
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred_xg)}")

Confusion Matrix: [[56856     8]
 [   20    78]]
Confusion Matrix: [[56862     2]
 [   20    78]]
Confusion Matrix: [[56858     6]
 [   20    78]]


In [17]:
# pairs to compare
comparisons = [
    ("Decision Tree", "Random Forest", tree_scores, rf_scores),
    ("Decision Tree", "XGBoost", tree_scores, xgb_scores),
    ("Random Forest", "XGBoost", rf_scores, xgb_scores)
]

for name1, name2, scores1, scores2 in comparisons:
    t_stat, p_val = ttest_rel(scores1, scores2)

    is_significant = "YES" if p_val < 0.05 else "NO"
    
    print(f"{name1} vs {name2}:")
    print(f"  t-statistic: {t_stat:.4f} | p-value: {p_val:.4f}")
    print(f"  Statistically Significant (p < 0.05): {is_significant}")

Decision Tree vs Random Forest:
  t-statistic: -4.5984 | p-value: 0.0100
  Statistically Significant (p < 0.05): YES
Decision Tree vs XGBoost:
  t-statistic: 1.0028 | p-value: 0.3727
  Statistically Significant (p < 0.05): NO
Random Forest vs XGBoost:
  t-statistic: 1.0106 | p-value: 0.3694
  Statistically Significant (p < 0.05): NO


### Tradeoffs
- Decision Tree: high variance + low bias; based on the conf matrix it had the most false positives.
- Random Forest: low variance + low bias; stat significantly better than decision tree. targets bias and variance by building parallel trees and aggregating results. 
- XGBoost: low bias + moderate variance; much faster runtime than the others but not stat sig better than the others. moderate variance due to trying to optimize biases of original tree model.
