# Homework 2: Hw1 but with decision trees

## Task 1

In [26]:
import numpy as np
import pandas as pd
import kagglehub
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [19]:
# Data at https://www.kaggle.com/datasets/mlg-ulb/creditcardataraud/data
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

Using Colab cache for faster access to the 'creditcardfraud' dataset.


In [20]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [21]:
# raw class distribution
print('No Frauds', round(data['Class'].value_counts()[0] / len(data) * 100, 2), '% of the dataset')
print('Frauds', round(data['Class'].value_counts()[1] / len(data) * 100, 2), '% of the dataset')

No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [22]:
# A fast vif as compared to previous statsmodels.stats.outliers_influence import variance_inflation_factor
def fast_vif(df):
    corr = df.corr().values
    vif = np.diag(np.linalg.inv(corr))
    return pd.DataFrame({'feature': df.columns, 'VIF': vif})

vif_df = fast_vif(data)
print(vif_df)

   feature        VIF
0     Time   1.879918
1       V1   1.669207
2       V2   4.449598
3       V3   1.949665
4       V4   1.172672
5       V5   2.871870
6       V6   1.577457
7       V7   3.015964
8       V8   1.132849
9       V9   1.043109
10     V10   1.220611
11     V11   1.164665
12     V12   1.170906
13     V13   1.008529
14     V14   1.225672
15     V15   1.063474
16     V16   1.081010
17     V17   1.234457
18     V18   1.057536
19     V19   1.042558
20     V20   2.399238
21     V21   1.143026
22     V22   1.089140
23     V23   1.158154
24     V24   1.000924
25     V25   1.130838
26     V26   1.003399
27     V27   1.010661
28     V28   1.001605
29  Amount  12.120566
30   Class   2.090472


In [25]:
# scaling amount coloumn to make vars less corrolated | Amount = 11.50 !!
data['Log_Amount'] = np.log1p(data['Amount'])
scaler = StandardScaler()
data['Scaled_Amount'] = scaler.fit_transform(data['Log_Amount'].values.reshape(-1,1))
scaled_data = data.drop(['Amount','Log_Amount'], axis=1)

# check correlation
scaled_data_vif = fast_vif(scaled_data)
print(scaled_data_vif)

          feature       VIF
0            Time  1.880363
1              V1  1.061633
2              V2  1.351535
3              V3  1.413387
4              V4  1.057818
5              V5  1.206235
6              V6  1.055028
7              V7  1.102116
8              V8  1.004131
9              V9  1.030772
10            V10  1.100042
11            V11  1.169442
12            V12  1.170708
13            V13  1.008239
14            V14  1.211118
15            V15  1.071314
16            V16  1.096964
17            V17  1.233806
18            V18  1.044434
19            V19  1.004530
20            V20  1.039650
21            V21  1.019136
22            V22  1.042707
23            V23  1.006304
24            V24  1.001013
25            V25  1.102228
26            V26  1.003672
27            V27  1.003934
28            V28  1.000361
29          Class  2.089810
30  Scaled_Amount  1.644080


In [28]:
# setting up data splits
X = scaled_data.drop(['Class', 'Time'], axis = 1)
y = scaled_data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
# create and evaluate with decision tree
tree = DecisionTreeClassifier(random_state = 42, class_weight = 'balanced')
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

In [None]:
# reporting of results
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: {classification_report(y_test, y_pred)}")

Accuracy: 0.9991573329588147
F1 Score: 0.75
Confusion Matrix: [[56842    22]
 [   26    72]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.77      0.73      0.75        98

    accuracy                           1.00     56962
   macro avg       0.88      0.87      0.87     56962
weighted avg       1.00      1.00      1.00     56962

