In [None]:
import pandas as pd            
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns                  
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE  
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import make_pipeline  
from sklearn.model_selection import GridSearchCV  
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from sklearn.linear_model import LogisticRegression  
import scipy.stats as stats

In [None]:
# Load the transactions table from the working directory; later cells use its
# 'Class' column as the target label.
df = pd.read_csv('creditcard.csv')

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Bar chart of the absolute correlation between each column and 'Class',
# sorted ascending so the strongest correlations sit on the right.
plt.figure(figsize=(15, 8))
# The last entry of the 'Class' correlation column is dropped positionally.
abs_corr = df.corr()['Class'].iloc[:-1].abs()
d = abs_corr.sort_values().plot.bar(title='Most correlated features')

plt.show()

In [None]:
# Joint plot of V17 against V14, coloured by 'Class', with marginal distributions
# on the side axes.
sns.jointplot(x='V17', y='V14',hue='Class', data=df, palette = 'dark')


In [None]:
import pandas as pd
import plotly.express as px

# 3D scatter of V17 / V14 / V12 for every fraud row plus a 20% random sample of
# the non-fraud rows (seeded so the sample is repeatable).
fraud_cases = df[df['Class'] == 1]
non_fraud_cases = df[df['Class'] == 0]

num_non_fraud = int(len(non_fraud_cases) * 0.20)
non_fraud_sample = non_fraud_cases.sample(n=num_non_fraud, random_state=42)

# Fraud rows come first, then the sampled non-fraud rows; index is renumbered.
df_new = pd.concat([fraud_cases, non_fraud_sample], axis=0).reset_index(drop=True)

X_top3 = df_new[['V17', 'V14', 'V12']]

# 'Class' is numeric, and Plotly Express colours numeric columns on a continuous
# scale (ignoring any discrete palette). Casting the label to str makes it
# categorical so each class gets its own solid colour and legend entry.
plot_df = X_top3.assign(Class=df_new['Class'].astype(str))

fig = px.scatter_3d(
    plot_df,
    x='V17',
    y='V14',
    z='V12',
    color='Class',
    color_discrete_map={'1': 'red', '0': 'green'},  # fraud in red, non-fraud in green
)

fig.update_layout(
    scene=dict(
        xaxis_title='V17',
        yaxis_title='V14',
        zaxis_title='V12',
    ),
    width=800,
    height=600,
    title='Interactive 3D Scatter Plot of Top 3 Principal Components (20% Non-Fraud Cases)'
)

fig.show()

In [None]:
# 3D scatter of V25 / V15 / V13 for every fraud row plus a 20% random sample of
# the non-fraud rows (seeded so the sample is repeatable).
fraud_cases = df[df['Class'] == 1]
non_fraud_cases = df[df['Class'] == 0]

num_non_fraud = int(len(non_fraud_cases) * 0.20)
non_fraud_sample = non_fraud_cases.sample(n=num_non_fraud, random_state=42)

# Fraud rows come first, then the sampled non-fraud rows; index is renumbered.
df_new = pd.concat([fraud_cases, non_fraud_sample], axis=0).reset_index(drop=True)

X_top3 = df_new[['V25', 'V15', 'V13']]

# 'Class' is numeric, and Plotly Express colours numeric columns on a continuous
# scale (ignoring any discrete palette). Casting the label to str makes it
# categorical so each class gets its own solid colour and legend entry.
plot_df = X_top3.assign(Class=df_new['Class'].astype(str))

fig = px.scatter_3d(
    plot_df,
    x='V25',
    y='V15',
    z='V13',
    color='Class',
    color_discrete_map={'1': 'red', '0': 'green'},  # fraud in red, non-fraud in green
)

fig.update_layout(
    scene=dict(
        xaxis_title='V25',
        yaxis_title='V15',
        zaxis_title='V13',
    ),
    width=800,
    height=600,
    title='Interactive 3D Scatter Plot of Bottom 3 Principal Components (20% Non-Fraud Cases)'
)

fig.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Same subsample as the previous 3D plots: every fraud row plus a seeded 20%
# random sample of the non-fraud rows.
fraud_cases = df[df['Class'] == 1]
non_fraud_cases = df[df['Class'] == 0]

num_non_fraud = int(len(non_fraud_cases) * 0.20)
non_fraud_sample = non_fraud_cases.sample(n=num_non_fraud, random_state=42)

df_new = pd.concat([fraud_cases, non_fraud_sample], axis=0).reset_index(drop=True)

X = df_new[['V25', 'V15', 'V13']]
y = df_new['Class']

# Oversample on the three plotted features only, so the effect of SMOTE can be
# seen directly in the scatter plot.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

print("Class Distribution:")
print(df_resampled['Class'].value_counts())

# 'Class' is numeric, and Plotly Express colours numeric columns on a continuous
# scale (ignoring any discrete palette). Plot from a copy whose label is a
# string so each class gets its own solid colour; df_resampled itself keeps the
# numeric label.
plot_df = df_resampled.assign(Class=df_resampled['Class'].astype(str))

fig = px.scatter_3d(
    plot_df,
    x='V25',
    y='V15',
    z='V13',
    color='Class',
    color_discrete_map={'1': 'red', '0': 'green'},  # fraud in red, non-fraud in green
)

fig.update_layout(
    scene=dict(
        xaxis_title='V25',
        yaxis_title='V15',
        zaxis_title='V13',
    ),
    width=800,
    height=600,
    title='Interactive 3D Scatter Plot of Bottom 3 Principal Components (SMOTE Applied)'
)

fig.show()

In [None]:
# Rebind `df` to a new frame with exact duplicate rows removed; the previously
# bound frame object is left untouched.
df = df.drop_duplicates()
print("Duplicated values dropped succesfully")

In [None]:
# Remove the 'Time' column. errors='ignore' keeps this cell re-runnable: because
# `df` is rebound here, a second execution would otherwise raise KeyError when
# 'Time' is already gone.
df = df.drop(columns='Time', errors='ignore')


In [None]:
# Labels of every column from 'V1' through 'Amount' (label slice, both ends inclusive).
numeric_columns = df.loc[:, 'V1':'Amount'].columns.tolist()


In [None]:
def boxplots_custom(dataset, columns_list, rows, cols, suptitle, figsize=(16, 25)):
    """Draw one horizontal boxplot per column on a rows x cols grid.

    Each subplot is titled with the column name and its skewness rounded to two
    decimals. Grid cells beyond the number of columns are hidden instead of
    being left as empty axes.

    Parameters
    ----------
    dataset : DataFrame-like
        Object indexable by column name; each selected column must support
        ``.skew()``.
    columns_list : iterable
        Column names to plot, in drawing order.
    rows, cols : int
        Grid dimensions; ``rows * cols`` must be at least ``len(columns_list)``.
    suptitle : str
        Figure-level title.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``; defaults to the original (16, 25).

    Raises
    ------
    IndexError
        If the grid has fewer cells than there are columns to plot.
    """
    columns_list = list(columns_list)
    n_cells = rows * cols
    if len(columns_list) > n_cells:
        # Fail before drawing anything rather than half-way through the loop.
        raise IndexError(
            f'grid of {rows}x{cols} has {n_cells} axes but '
            f'{len(columns_list)} columns were requested'
        )

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 or single-row/column grid.
    fig, axs = plt.subplots(rows, cols, sharey=True, figsize=figsize, squeeze=False)
    fig.suptitle(suptitle, y=1, size=25)
    axs = axs.flatten()

    for ax, column in zip(axs, columns_list):
        sns.boxplot(data=dataset[column], orient='h', ax=ax)
        skewness = round(dataset[column].skew(axis=0, skipna=True), 2)
        ax.set_title(f'{column}, skewness is: {skewness}')

    # Hide any leftover grid cells so the figure has no empty frames.
    for ax in axs[len(columns_list):]:
        ax.set_visible(False)
        
# Draw a boxplot for every column in numeric_columns on a 10x3 grid, then let
# matplotlib adjust the subplot spacing.
boxplots_custom(dataset=df, columns_list=numeric_columns, rows=10, cols=3, suptitle='Boxplots for each variable')
plt.tight_layout()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline: a default decision tree fitted on the data without any resampling.
# Expects the DataFrame `df` prepared by the earlier cells.

# Target is 'Class'; features are every other column.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Construct and fit in one expression; fit() returns the estimator itself.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions feed the F1 score.
y_pred = dt_classifier.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Positive-class probabilities feed the precision-recall curve; its area is
# computed from the (recall, precision) points.
y_prob = dt_classifier.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc

# Random forest fitted through RandomizedSearchCV, then scored on a held-out 20%.

# Target is 'Class'; features are every other column.
y = df['Class']
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space: a single candidate per hyper-parameter, sampled once (n_iter=1).
param_grid = dict(
    n_estimators=[100],
    max_depth=[None],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=['sqrt'],
)

rf_classifier = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_iter=1,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Estimator refitted by the search with the selected parameters.
best_rf_model = random_search.best_estimator_

# Hard labels and positive-class probabilities on the held-out rows.
y_pred = best_rf_model.predict(X_test)
y_prob = best_rf_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)


# Keep the forest's outputs under dedicated names for the ensemble cell.
rf_pred, rf_prob = y_pred, y_prob

In [None]:


# Confusion matrix for the y_test / y_pred currently in scope.
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the full X / y currently in scope and read its impurity-based
# feature importances. The seed matches the other models in this notebook so the
# importances are reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Importances of the tuned random forest, paired with the feature column names.
importances = best_rf_model.feature_importances_
feature_names = X.columns

# Build the table and order it from least to most important in one chain.
imp_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=True)
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(imp_df['Feature'], imp_df['Importance'])
plt.xticks(rotation=45, ha='right')
ax.set_xlabel('Features')
ax.set_ylabel('Gini Importance')
ax.set_title('Feature Importance based on Gini Impurity')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc

# Separate the features (X) and the target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Split BEFORE scaling, using the same call (test_size and seed) as the
# tree-based models. This gives the LSTM the same held-out rows as the random
# forest, which the ensemble cell relies on when it combines lstm_pred / rf_pred
# element-wise against y_test. Each sample is a single timestep, so row order
# carries no sequence information that a shuffled split could destroy.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only so test-set statistics do not leak
# into training; the test rows are transformed with the training mean/std.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Reshape to (samples, timesteps=1, features) as expected by the LSTM layer
n_features = X_train_scaled.shape[1]
X_train = X_train_scaled.reshape(-1, 1, n_features)
X_test = X_test_scaled.reshape(-1, 1, n_features)

# Create the LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(1, n_features)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions on the test set. The single-unit output layer yields an
# (n, 1) array; flatten it to (n,) so it lines up with sklearn-style 1-D
# predictions in later arithmetic instead of broadcasting.
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob > 0.5).astype(int)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the F1 score of the model
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Calculate the precision-recall curve and AUC-PRC of the model
precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)

# Keep the LSTM's outputs under dedicated names for the ensemble cell.
lstm_prob = y_prob
lstm_pred = y_pred

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc

# Flatten both prediction vectors before combining them. The LSTM output can be
# shaped (n, 1) while the forest output is (n,); adding those directly
# broadcasts to an (n, n) matrix instead of an element-wise sum.
lstm_votes = np.asarray(lstm_pred).ravel()
rf_votes = np.asarray(rf_pred).ravel()

if not (len(lstm_votes) == len(rf_votes) == len(y_test)):
    raise ValueError(
        "lstm_pred, rf_pred and y_test must have the same length, got "
        f"{len(lstm_votes)}, {len(rf_votes)} and {len(y_test)}"
    )
# NOTE(review): equal lengths do not prove both models were scored on the same
# test rows as y_test -- confirm the two model cells use the same split.

# Combine the hard predictions. With two voters the mean is 0, 0.5 or 1, and
# np.round rounds 0.5 to the nearest even value (0), so a row is flagged only
# when BOTH models flag it.
ensemble_pred = np.round((lstm_votes + rf_votes) / 2).astype(int)

# Calculate the accuracy of the ensemble model
accuracy = accuracy_score(y_test, ensemble_pred)
print("Ensemble Accuracy:", accuracy)

# Calculate the F1 score of the ensemble model
f1 = f1_score(y_test, ensemble_pred)
print("Ensemble F1 score:", f1)

# Calculate the precision-recall curve and AUC-PRC of the ensemble model from
# the average of the two models' positive-class probabilities
ensemble_prob = (np.asarray(lstm_prob).ravel() + np.asarray(rf_prob).ravel()) / 2
precision, recall, _ = precision_recall_curve(y_test, ensemble_prob)
auc_prc = auc(recall, precision)
print("Ensemble AUC-PRC:", auc_prc)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Decision tree trained on SMOTE-oversampled training data and evaluated on the
# untouched held-out rows.
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training split only; the test split keeps its original rows.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

dt_classifier = DecisionTreeClassifier(random_state=42)

dt_classifier.fit(X_train_resampled, y_train_resampled)

y_pred = dt_classifier.predict(X_test)

f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

y_prob = dt_classifier.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)

# Only the top levels are drawn: the tree has no depth limit, and rendering all
# of it on one figure is slow and unreadable. The fitted model is unaffected.
PLOT_MAX_DEPTH = 3

plt.figure(figsize=(20, 10))
plot_tree(
    dt_classifier,
    max_depth=PLOT_MAX_DEPTH,
    filled=True,
    rounded=True,
    class_names=["0", "1"],
    # A plain list is accepted by every scikit-learn version; some releases
    # reject a pandas Index for this parameter.
    feature_names=list(X.columns),
)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Expects the DataFrame `df` prepared by the earlier cells.

# Separate the features (X) and the target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a variable to control the use of SMOTE
use_smote = True

# SMOTE lives INSIDE the pipeline rather than being applied to X_train up front.
# Resampling before cross-validation puts synthetic points (interpolated from
# their neighbours) into the validation folds, which inflates the CV scores.
# As a pipeline step, SMOTE is fitted on each fold's training part only and is
# skipped at predict time.
pipeline_steps = []
if use_smote:
    pipeline_steps.append(('smote', SMOTE(random_state=42)))
pipeline_steps.append(('gb', GradientBoostingClassifier(random_state=42)))
gb_classifier = ImbPipeline(pipeline_steps)

# Define the parameter grid for Random Search (keys are prefixed with the
# pipeline step name of the classifier)
param_grid = {
    'gb__n_estimators': [100],
    'gb__learning_rate': [0.01, 0.1, 0.5],
    'gb__max_depth': [3, 5, 7],
    'gb__subsample': [0.5, 0.7, 1.0],
    'gb__max_features': ['sqrt', 'log2', None]
}

# Perform Random Search. Validation folds now keep their original class mix, so
# candidates are ranked by average precision (the PR-curve summary this cell
# reports) instead of the default accuracy.
random_search = RandomizedSearchCV(
    estimator=gb_classifier,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='average_precision',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model (the whole pipeline, refitted on the full training split)
best_gb_model = random_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_gb_model.predict(X_test)

# Calculate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the F1 score of the best model
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Calculate the precision-recall curve and AUC-PRC of the best model
y_prob = best_gb_model.predict_proba(X_test)[:, 1]  # Probability of the positive class
precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc

# Gradient boosting tuned by random search on the training split as-is (no
# resampling). Expects the DataFrame `df` prepared by the earlier cells.

# Target is 'Class'; features are every other column.
y = df['Class']
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values for the random search.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[3, 5, 7],
    subsample=[0.5, 0.7, 1.0],
    max_features=['sqrt', 'log2', None],
)

gb_classifier = GradientBoostingClassifier(random_state=42)

# Ten sampled configurations, each evaluated with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=gb_classifier,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Estimator refitted by the search with the selected parameters.
best_gb_model = random_search.best_estimator_

# Hard labels and positive-class probabilities on the held-out rows.
y_pred = best_gb_model.predict(X_test)
y_prob = best_gb_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

precision, recall, _ = precision_recall_curve(y_test, y_prob)
auc_prc = auc(recall, precision)
print("AUC-PRC:", auc_prc)