 # Random Forest Model

In [1]:
#Imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the credit card fraud dataset from a CSV file in the working directory
csv_path = Path('creditcardfraud.csv')
df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Define the feature set: every column of df except the 'Class' target.
# drop() returns a new frame, so df itself is left untouched.
X = df.drop(columns=['Class'])
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [4]:
# Define the target vector as a flat NumPy array of the 'Class' labels.
# Series.ravel() is deprecated since pandas 2.2 and emits a FutureWarning;
# to_numpy() yields the same 1-D array without the warning.
y = df['Class'].to_numpy()

In [5]:
# Partition features and labels into train and test sets. No test_size is
# given, so the library's default proportions apply; random_state pins the
# shuffle so the same split is produced on every run.
# NOTE(review): no `stratify` argument is passed, so the share of Class 1 rows
# in each partition is left to chance - consider stratify=y given how few
# positive rows the test set contains.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [6]:
# Create the StandardScaler instance (unfitted here; it is fitted on the
# training features in the next cell)
scaler = StandardScaler()

In [7]:
# Fit the scaler on the training features only, so the scaling statistics
# are not derived from the test partition
X_scaler = scaler.fit(X_train)

In [8]:
# Apply the training-fitted scaler to both partitions; the scaled versions are
# what the random forest and balanced random forest cells train and predict on
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Build a small random forest (5 trees, fixed seed) and train it on the
# scaled training data in one step; fit() hands back the fitted estimator.
rf_model = RandomForestClassifier(
    n_estimators=5,
    random_state=1,
).fit(X_train_scaled, y_train)

In [10]:
# Predict class labels for the scaled test features with the fitted forest
cc_fraud_predictions = rf_model.predict(X_test_scaled)

In [11]:
# Confusion matrix of test labels vs. predictions, wrapped in a labelled
# DataFrame for display (rows: actual class, columns: predicted class)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
cm = confusion_matrix(y_test, cc_fraud_predictions)
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

# Overall accuracy on the test set
acc_score = accuracy_score(y_test, cc_fraud_predictions)

In [12]:
# Show the evaluation results for the random forest: the labelled confusion
# matrix, the accuracy score, then the per-class precision/recall report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report = classification_report(y_test, cc_fraud_predictions)
print(report)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71076,15
Actual 1,30,81


Accuracy Score : 0.9993679952810315
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.84      0.73      0.78       111

    accuracy                           1.00     71202
   macro avg       0.92      0.86      0.89     71202
weighted avg       1.00      1.00      1.00     71202



In [13]:
# Rank the features by the fitted forest's importance scores, highest first.
# Each entry pairs an importance with its column name from X.
importances = rf_model.feature_importances_
ranked_features = sorted(zip(importances, X.columns), reverse=True)
ranked_features

[(0.29098519450775656, 'V17'),
 (0.1287871226421524, 'V12'),
 (0.11578149123048817, 'V14'),
 (0.0695522646298754, 'V9'),
 (0.06827658977455, 'V10'),
 (0.024022312152724906, 'V26'),
 (0.02353991385517993, 'V16'),
 (0.022291168142830407, 'V1'),
 (0.020458039663118342, 'V2'),
 (0.017109183776133442, 'V8'),
 (0.016876601085390422, 'V18'),
 (0.01678683988811679, 'V11'),
 (0.015880773861699803, 'V4'),
 (0.01513067398554059, 'Time'),
 (0.014911081284578586, 'V23'),
 (0.014208394663397178, 'V19'),
 (0.013776902327591689, 'V28'),
 (0.01159274609275132, 'V20'),
 (0.01115709330959681, 'V15'),
 (0.010792487740489987, 'Amount'),
 (0.010117827457020546, 'V6'),
 (0.009357872228487944, 'V22'),
 (0.008928548847452257, 'V21'),
 (0.008714413927700362, 'V13'),
 (0.008205965917729997, 'V27'),
 (0.007863768742274308, 'V7'),
 (0.00696403528472881, 'V5'),
 (0.00663951187223878, 'V25'),
 (0.0063259502725610695, 'V3'),
 (0.004965230835843202, 'V24')]

# Balanced Random Forest

In [14]:
# Import BalancedRandomForestClassifier from imblearn
from imblearn.ensemble import BalancedRandomForestClassifier

In [15]:
# Create a BalancedRandomForestClassifier instance.
# random_state is fixed (the same seed the other models in this notebook use)
# so the fitted forest, and therefore the reported metrics, are reproducible
# across kernel restarts instead of changing on every run.
brf_model = BalancedRandomForestClassifier(random_state=1)

In [16]:
# Fit the balanced random forest on the scaled training features and labels
brf_model.fit(X_train_scaled, y_train)



In [17]:
# Predict class labels for the scaled test features with the balanced forest
y_prediction = brf_model.predict(X_test_scaled)

In [18]:
# Print per-class precision, recall and F1 for the balanced random forest,
# computed against the held-out test labels
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.08      0.86      0.15       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.57     71202
weighted avg       1.00      0.98      0.99     71202



# SMOTE Oversampling

In [19]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

In [20]:
# Instantiate the SMOTE sampler with a fixed seed so the synthetic samples
# are repeatable. sampling_strategy='auto' is spelled out explicitly
# (presumably the library default - verify against the imbalanced-learn docs).
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [21]:
# Resample the training partition with SMOTE; the test partition is not
# resampled.
# NOTE(review): this resamples the unscaled X_train, whereas the other models
# in this notebook are trained on X_train_scaled - confirm that skipping the
# scaler here is intended.
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

In [22]:
# Fit a RandomForestClassifier on the SMOTE-resampled training data, using the
# same hyperparameters (5 trees, seed 1) as the first random forest model
smote_rf_model = RandomForestClassifier(n_estimators=5, random_state=1)
smote_rf_model.fit(X_resampled, y_resampled)

In [23]:
# Predict on the original (non-resampled, unscaled) test features with the
# model that was trained on the SMOTE-resampled data
smote_rf_model_predictions = smote_rf_model.predict(X_test)

In [28]:
# Display the confusion matrix for the SMOTE model as the cell output
# (the classification report is printed in the following cell).
# NOTE(review): the execution count jumps from 23 to 28 at this cell - re-run
# with Restart Kernel -> Run All to confirm the notebook reproduces in order.
confusion_matrix(y_test, smote_rf_model_predictions)

array([[71067,    24],
       [   23,    88]], dtype=int64)

In [29]:
# Print per-class precision, recall and F1 for the SMOTE-trained random forest
print(classification_report(y_test, smote_rf_model_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.79      0.79      0.79       111

    accuracy                           1.00     71202
   macro avg       0.89      0.90      0.89     71202
weighted avg       1.00      1.00      1.00     71202

