 # Random Forest Model

In [1]:
#Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [30]:
#Load Data
df = pd.read_csv(Path('creditcardfraud.csv'))
#df.head()
df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [3]:
# Define features set
X = df.copy()
X.drop('Class', axis=1, inplace=True)
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [4]:
# Define target vector
y = df['Class']

In [5]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
y_train.value_counts()

0    213224
1       381
Name: Class, dtype: int64

In [31]:
y_test.value_counts()

0    71091
1      111
Name: Class, dtype: int64

In [7]:
# Create StandardScaler Instance
scaler = StandardScaler()

In [8]:
# Fit StandardScaler
X_scaler = scaler.fit(X_train)

In [9]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit Random Forest model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Make predictions with test data
cc_fraud_predictions = rf_model.predict(X_test_scaled)

In [12]:
# Create confusion matrix
cm = confusion_matrix(y_test, cc_fraud_predictions)
cm_df = pd.DataFrame(
    cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']
)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, cc_fraud_predictions)

In [13]:
# Display Results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, cc_fraud_predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71085,6
Actual 1,26,85


Accuracy Score : 0.9995505744220669
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.93      0.77      0.84       111

    accuracy                           1.00     71202
   macro avg       0.97      0.88      0.92     71202
weighted avg       1.00      1.00      1.00     71202



In [14]:
# Calculate feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1542142326532459, 'V17'),
 (0.1331104444548453, 'V12'),
 (0.1261180874654466, 'V14'),
 (0.08719503162595495, 'V10'),
 (0.07634947996548457, 'V16'),
 (0.07116964418571675, 'V11'),
 (0.03838914601017674, 'V9'),
 (0.03114828409312144, 'V18'),
 (0.0259325106557905, 'V4'),
 (0.022072096250363774, 'V21'),
 (0.019709164807185056, 'V7'),
 (0.017716696153725987, 'V26'),
 (0.015385279333688034, 'V3'),
 (0.014007517739175293, 'V6'),
 (0.013160875490985973, 'Time'),
 (0.01314083693274423, 'V2'),
 (0.01234503012077301, 'V1'),
 (0.012276408830351576, 'Amount'),
 (0.012017813235739184, 'V20'),
 (0.011417750991733948, 'V5'),
 (0.011382558542812752, 'V15'),
 (0.01131497605771804, 'V27'),
 (0.011002782964709567, 'V19'),
 (0.010267143313762932, 'V8'),
 (0.00913867734865463, 'V22'),
 (0.00905666125778152, 'V24'),
 (0.00853997424278262, 'V28'),
 (0.008423852560219826, 'V13'),
 (0.007507615104945562, 'V25'),
 (0.00648942761036403, 'V23')]

# Balanced Random Forest

In [15]:
# Import BalancedRandomForestClassifier from imblearn
from imblearn.ensemble import BalancedRandomForestClassifier

In [16]:
# Create a BalancedRandomForestClassifier instance
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [17]:
# Fit the model to the training data
brf_model.fit(X_train_scaled, y_train)



In [18]:
# Predict fraud for testing features
y_prediction = brf_model.predict(X_test_scaled)

In [19]:
# Print performance stats
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71091
           1       0.07      0.86      0.14       111

    accuracy                           0.98     71202
   macro avg       0.54      0.92      0.56     71202
weighted avg       1.00      0.98      0.99     71202



# SMOTE Oversampling

In [20]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

In [21]:
# Instantiate the SMOTE model instance
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [22]:
# Fit the SMOTE model to the training data
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

In [27]:
y_resampled.value_counts()

0    213224
1    213224
Name: Class, dtype: int64

In [34]:
# Fit the RandomForestClassifier on the resampled data
smote_rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
smote_rf_model.fit(X_resampled, y_resampled)

In [38]:
# Generate predictions based on the resampled data model, should this use X_test_scaled data?
smote_rf_model_predictions = smote_rf_model.predict(X_test)

In [39]:
# Print confusion matrix and classification report for smote model
confusion_matrix(y_test, smote_rf_model_predictions)

array([[71077,    14],
       [   24,    87]], dtype=int64)

In [40]:
print(classification_report(y_test, smote_rf_model_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.86      0.78      0.82       111

    accuracy                           1.00     71202
   macro avg       0.93      0.89      0.91     71202
weighted avg       1.00      1.00      1.00     71202



## Adjusted Training Data Balance to 40% Fraud

In [75]:
# Split the data into majority ('0') and minority ('1') classes
X_majority = X[y == 0]
X_minority = X[y == 1]
y_minority = y[y == 1]

In [80]:
y_minority.value_counts()

1    492
Name: Class, dtype: int64

In [76]:
# Determine the desired percentage balance (e.g., 40% minority class)
desired_balance_percentage = 0.4

In [81]:
# Calculate the current percentage of the minority class
current_balance_percentage = len(y_minority) / len(y)
print(current_balance_percentage)

0.001727485630620034


In [82]:
# Determine the number of additional samples needed to achieve the desired balance
num_samples_needed = int(len(X_majority) * desired_balance_percentage / current_balance_percentage) - len(X_minority)
print(num_samples_needed)

65832761


In [83]:
# Apply SMOTE to generate synthetic samples for the minority class
adjusted_smote = SMOTE(sampling_strategy=desired_balance_percentage, random_state=42, k_neighbors=1)
X_resampled, y_resampled = adjusted_smote.fit_resample(X_minority, y_minority)

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [None]:
# Combine the resampled minority class with the majority class
X_balanced = pd.concat([X_majority, X_resampled])
y_balanced = pd.concat([pd.Series([0] * len(X_majority)), y_resampled])

# Random Forest w/Trimmed DataFrame

In [57]:
# Trim original dataframe to top 10 variables based on feature importance
#trimmed_df = df[['V17','V12','V14','V10','V16','V11','V9','V18','V4','V21','Class']]
trimmed_df = df[['V17','V12','V14','V10','V16','Class']]
trimmed_df.head()

Unnamed: 0,V17,V12,V14,V10,V16,Class
0,0.207971,-0.617801,-0.311169,0.090794,-0.470401,0
1,-0.114805,1.065235,-0.143772,-0.166974,0.463917,0
2,1.109969,0.066084,-0.165946,0.207643,-2.890083,0
3,-0.684093,0.178228,-0.287924,-0.054952,-1.059647,0
4,-0.237033,0.538196,-1.11967,0.753074,-0.451449,0


In [58]:
# Define features set
X_trimmed = trimmed_df.copy()
X_trimmed.drop('Class', axis=1, inplace=True)
X_trimmed.head()

Unnamed: 0,V17,V12,V14,V10,V16
0,0.207971,-0.617801,-0.311169,0.090794,-0.470401
1,-0.114805,1.065235,-0.143772,-0.166974,0.463917
2,1.109969,0.066084,-0.165946,0.207643,-2.890083
3,-0.684093,0.178228,-0.287924,-0.054952,-1.059647
4,-0.237033,0.538196,-1.11967,0.753074,-0.451449


In [59]:
# Define target vector
y_trimmed = trimmed_df['Class']

In [60]:
X_trimmed_train, X_trimmed_test, y_trimmed_train, y_trimmed_test = train_test_split(X, y, random_state=1)
y_trimmed_train.value_counts()

0    213224
1       381
Name: Class, dtype: int64

In [61]:
y_trimmed_test.value_counts()

0    71091
1      111
Name: Class, dtype: int64

In [62]:
# Fit StandardScaler
X_trimmed_scaler = scaler.fit(X_trimmed_train)

In [63]:
# Scaling data
X_trimmed_train_scaled = X_trimmed_scaler.transform(X_trimmed_train)
X_trimmed_test_scaled = X_trimmed_scaler.transform(X_trimmed_test)

In [64]:
# Create Random Forest model
trimmed_rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit Random Forest model
trimmed_rf_model = trimmed_rf_model.fit(X_trimmed_train_scaled, y_trimmed_train)

In [65]:
# Make predictions with test data
trimmed_cc_fraud_predictions = trimmed_rf_model.predict(X_trimmed_test_scaled)

In [66]:
# Create confusion matrix
trimmed_cm = confusion_matrix(y_trimmed_test, trimmed_cc_fraud_predictions)
trimmed_cm_df = pd.DataFrame(
    trimmed_cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']
)

# Calculate the accuracy score
acc_score = accuracy_score(y_trimmed_test, trimmed_cc_fraud_predictions)

In [67]:
# Display Results
print("Confusion Matrix")
display(trimmed_cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_trimmed_test, trimmed_cc_fraud_predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71085,6
Actual 1,26,85


Accuracy Score : 0.9995505744220669
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.93      0.77      0.84       111

    accuracy                           1.00     71202
   macro avg       0.97      0.88      0.92     71202
weighted avg       1.00      1.00      1.00     71202



# SMOTE w/Trimmed DataFrame

In [73]:
# Fit the SMOTE model to the training data
X_trimmed_resampled, y_trimmed_resampled = smote_sampler.fit_resample(X_trimmed_train, y_trimmed_train)
y_trimmed_resampled.value_counts()

0    213224
1    213224
Name: Class, dtype: int64

In [69]:
# Fit the RandomForestClassifier on the resampled data
smote_trimmed_rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
smote_trimmed_rf_model.fit(X_trimmed_resampled, y_trimmed_resampled)

In [70]:
# Generate predictions based on the resampled data model, should this use X_test_scaled data?
smote_trimmed_rf_model_predictions = smote_trimmed_rf_model.predict(X_trimmed_test)

In [71]:
print(classification_report(y_trimmed_test, smote_trimmed_rf_model_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.86      0.78      0.82       111

    accuracy                           1.00     71202
   macro avg       0.93      0.89      0.91     71202
weighted avg       1.00      1.00      1.00     71202



# Overbalance Training Data to 40/60