In [1]:
# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Set the style for seaborn
sns.set(style="whitegrid")

# Load the dataset using pandas
data = pd.read_csv('creditcard.csv')

# Display the first few rows of the dataset. A bare `data.head()` is only
# rendered when it is the last expression of a cell, so print it explicitly.
print(data.head())

# Rows without a label cannot be used for supervised learning. Drop them
# instead of mean-imputing 'Class', which would create non-binary targets.
data = data.dropna(subset=['Class'])

# Split the dataset into features and target
X = data.drop('Class', axis=1)
y = data['Class'].astype(int)

# Split the dataset into training and testing sets BEFORE applying SMOTE.
# `stratify=y` keeps the class ratio identical in both splits, so the test
# metrics are not at the mercy of how few minority rows land in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing feature values with statistics learned from the training
# split only, so nothing about the test set leaks into preprocessing.
# NOTE: `data` and `X` themselves are left un-imputed.
feature_means = X_train.mean()
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)

# Normalize the data. The scaler is fit on the real (not yet oversampled)
# training rows so its mean/variance are not skewed by synthetic samples.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the training dataset using SMOTE. SMOTE interpolates between
# nearest neighbours, so it runs on the standardized features; otherwise the
# features with the largest raw scale would dominate the distance computation.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_std, y_train)

# Name used by the model-fitting code: training data that is both
# standardized and balanced (same array as X_train_balanced).
X_train_scaled = X_train_balanced

# Display the shapes of the training and testing sets
print(X_train_scaled.shape, X_test_scaled.shape, y_train_balanced.shape, y_test.shape)

# Use GridSearchCV to find the optimal number of neighbors.
#
# The search runs on the real (un-oversampled) training rows, with SMOTE as a
# pipeline step so that oversampling is re-fit inside every CV training fold.
# Searching on data that was oversampled up front puts synthetic points
# interpolated from validation rows into the training folds, which inflates
# the scores and biases the search.
X_train_unbalanced = scaler.transform(X_train)

search_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
])
param_grid = {'knn__n_neighbors': range(1, 21)}

# scoring='f1' scores the positive class (label 1); plain accuracy says almost
# nothing when one class dominates. refit=False because the final model is fit
# explicitly below; n_jobs=-1 spreads the candidate fits over all CPU cores.
grid_search = GridSearchCV(
    search_pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    refit=False,
)
grid_search.fit(X_train_unbalanced, y_train)

# Get the best number of neighbors
best_n_neighbors = grid_search.best_params_['knn__n_neighbors']
print(f"Best n_neighbors: {best_n_neighbors}")

# Train KNN Model with the optimal number of neighbors on the balanced,
# standardized training set
knn_model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn_model.fit(X_train_scaled, y_train_balanced)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model.
# F1 and recall are reported for the positive class (label 1, mapped to
# "Fraud" further down). With average='weighted' the recall is numerically
# equal to the accuracy and the F1 is dominated by the majority class, so
# neither says how well the positive class is detected.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
class_report = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)
recall = recall_score(y_test, y_pred, average='binary', pos_label=1)

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Plot the confusion matrix. `labels=[0, 1]` above fixes the row/column order,
# so the tick labels below are guaranteed to line up with the counts.
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

def _prompt_for_float(feature):
    """Ask for the value of `feature` until a finite number is entered.

    Re-prompting on bad input means one typo no longer raises ValueError and
    throws away every value entered so far. NaN and infinity are rejected
    here because the scaler would refuse them later anyway.
    """
    while True:
        raw_value = input(f"Enter value for {feature}: ")
        try:
            value = float(raw_value)
        except ValueError:
            print(f"'{raw_value}' is not a valid number, please try again.")
            continue
        if not np.isfinite(value):
            print(f"'{raw_value}' is not a finite number, please try again.")
            continue
        return value


# Collect user input for each feature.
# The values go into a one-row DataFrame with the training column names so the
# scaler receives the same feature names it was fit with (a bare ndarray makes
# scikit-learn warn about missing feature names).
new_data = pd.DataFrame(
    [[_prompt_for_float(feature) for feature in X.columns]],
    columns=X.columns,
)

# Scale the new data
new_data_scaled = scaler.transform(new_data)

# Make predictions
predictions = knn_model.predict(new_data_scaled)

# Map predictions to labels
prediction_labels = ["Fraud" if pred == 1 else "Not Fraud" for pred in predictions]

# Print the predictions
print("Predictions for the new data:", prediction_labels)

# Visualize the results
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(range(len(prediction_labels)), prediction_labels, color='blue', label='Predictions')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Predicted Class')
ax.set_title('Predictions on New Data')
ax.legend()
plt.show()

KeyboardInterrupt: 